")
- |> String.replace(@regexes[:replaceFontsRe], "<\1span>")
- |> Floki.find("html")
- |> Floki.filter_out(:comment)
- end
-
- def title do
- html |> Floki.find("title") |> Floki.text
- end
-
- def content do
- html
- |> Floki.filter_out("script")
- |> Floki.filter_out("style")
- end
-
- def page do
- {:ok, f} = File.read("test/features/nytimes.html")
- f
- end
-
- def default_options do
- @default_options
- end
-
- def regexes do
- @regexes
- end
-end
diff --git a/lib/readability.ex b/lib/readability.ex
index 87e2840..720f42e 100644
--- a/lib/readability.ex
+++ b/lib/readability.ex
@@ -1,10 +1,86 @@
defmodule Readability do
+  @moduledoc """
+  Readability library for extracting and curating the main article content of a page.
+  """
+
alias Readability.TitleFinder
+ alias Readability.ArticleBuilder
+
+ @default_options [retry_length: 250,
+ min_text_length: 25,
+ remove_unlikely_candidates: true,
+ weight_classes: true,
+ clean_conditionally: true,
+ remove_empty_nodes: true,
+ min_image_width: 130,
+ min_image_height: 80,
+ ignore_image_format: [],
+ blacklist: nil,
+ whitelist: nil
+ ]
+
+ @regexes [unlikely_candidate: ~r/combx|comment|community|disqus|extra|foot|header|hidden|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
+ ok_maybe_its_a_candidate: ~r/and|article|body|column|main|shadow/i,
+ positive: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
+ negative: ~r/hidden|^hid|combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
+ div_to_p_elements: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
+             replace_brs: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
+ replace_fonts: ~r/<(\/?)font[^>]*>/i,
+ normalize: ~r/\s{2,}/,
+ video: ~r/\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i
+ ]
@type html_tree :: tuple | list
+ @type options :: list
- def title(html) when is_binary(html), do: parse(html) |> title
+ def title(html) when is_binary(html), do: html |> parse |> title
def title(html_tree), do: TitleFinder.title(html_tree)
- def parse(raw_html), do: Floki.parse(raw_html)
+ @doc """
+ Using a variety of metrics (content score, classname, element types), find the content that is
+ most likely to be the stuff a user wants to read
+ """
+ @spec content(binary, options) :: binary
+ def content(raw_html, opts \\ @default_options) do
+ opts = Keyword.merge(@default_options, opts)
+ raw_html
+ |> parse
+ |> ArticleBuilder.build(opts)
+ end
+
+ @doc """
+  Normalize and parse to html tree (tuple or list) from binary html
+ """
+ @spec parse(binary) :: html_tree
+ def parse(raw_html) do
+ raw_html
+    |> String.replace(Readability.regexes[:replace_brs], "</p><p>")
+    |> String.replace(Readability.regexes[:replace_fonts], "<\\1span>")
+ |> String.replace(Readability.regexes[:normalize], " ")
+ |> Floki.parse
+ |> Floki.filter_out(:comment)
+ end
+
+ @doc """
+ return raw html binary from html tree tuple
+ """
+ @spec raw_html(html_tree) :: binary
+ def raw_html(html_tree) do
+ html_tree |> Floki.raw_html
+ end
+
+ @doc """
+ return only text binary from html tree tuple
+ """
+  @spec readabl_text(html_tree) :: binary
+ def readabl_text(html_tree) do
+ tags_to_br = ~r/<\/(p|div|article|h\d)/i
+ html_str = html_tree |> raw_html
+ Regex.replace(tags_to_br, html_str, &("\n#{&1}"))
+ |> Floki.parse
+ |> Floki.text
+ end
+
+ def regexes, do: @regexes
+
+ def default_options, do: @default_options
end
diff --git a/lib/readability/article_builder.ex b/lib/readability/article_builder.ex
new file mode 100644
index 0000000..e7b6254
--- /dev/null
+++ b/lib/readability/article_builder.ex
@@ -0,0 +1,100 @@
+defmodule Readability.ArticleBuilder do
+ @moduledoc """
+ build article for readability
+ """
+
+ alias Readability.Helper
+ alias Readability.Sanitizer
+ alias Readability.Candidate
+ alias Readability.CandidateFinder
+ alias Readability.Candidate.Cleaner
+ alias Readability.Candidate.Scoring
+
+ @type html_tree :: tuple | list
+ @type options :: list
+
+ @doc """
+ Prepare the article node for display.
+  Clean out any inline styles, iframes, forms, strip extraneous <p> tags, etc.
+ """
+ @spec build(html_tree, options) :: html_tree
+ def build(html_tree, opts) do
+ origin_tree = html_tree
+ html_tree = html_tree
+ |> Helper.remove_tag(fn({tag, _, _}) ->
+ Enum.member?(["script", "style"], tag)
+ end)
+
+ if opts[:remove_unlikely_candidates] do
+ html_tree = Cleaner.remove_unlikely_tree(html_tree)
+ end
+ html_tree = Cleaner.transform_misused_div_to_p(html_tree)
+
+ candidates = CandidateFinder.find(html_tree, opts)
+ article = find_article(candidates, html_tree)
+
+ html_tree = Sanitizer.sanitize(article, candidates, opts)
+
+ if Helper.text_length(html_tree) < opts[:retry_length] do
+ if opts = next_try_opts(opts) do
+ build(origin_tree, opts)
+ else
+ html_tree
+ end
+ else
+ html_tree
+ end
+ end
+
+ defp next_try_opts(opts) do
+ cond do
+ opts[:remove_unlikely_candidates] ->
+ Keyword.put(opts, :remove_unlikely_candidates, false)
+ opts[:weight_classes] ->
+ Keyword.put(opts, :weight_classes, false)
+ opts[:clean_conditionally] ->
+ Keyword.put(opts, :clean_conditionally, false)
+ true -> nil
+ end
+ end
+
+ defp find_article(candidates, html_tree) do
+ best_candidate = CandidateFinder.find_best_candidate(candidates)
+ unless best_candidate do
+ tree = html_tree |> Floki.find("body") |> hd
+ best_candidate = %Candidate{html_tree: tree}
+ end
+ article_trees = find_article_trees(best_candidate, candidates)
+ {"div", [], article_trees}
+ end
+
+ defp find_article_trees(best_candidate, candidates) do
+ score_threshold = Enum.max([10, best_candidate.score * 0.2])
+
+ candidates
+ |> Enum.filter(&(&1.tree_depth == best_candidate.tree_depth))
+ |> Enum.filter_map(fn(candidate) ->
+ candidate == best_candidate
+ || candidate.score >= score_threshold
+ || append?(candidate)
+ end, &(to_article_tag(&1.html_tree)))
+ end
+
+ defp append?(%Candidate{html_tree: html_tree}) when elem(html_tree, 0) == "p" do
+ link_density = Scoring.calc_link_density(html_tree)
+ inner_text = html_tree |> Floki.text
+ inner_length = inner_text |> String.length
+
+ (inner_length > 80 && link_density < 0.25)
+ || (inner_length < 80 && link_density == 0 && inner_text =~ ~r/\.( |$)/)
+ end
+ defp append?(_), do: false
+
+ defp to_article_tag({tag, attrs, inner_tree} = html_tree) do
+ if tag =~ ~r/^p$|^div$/ do
+ html_tree
+ else
+ {"div", attrs, inner_tree}
+ end
+ end
+end
diff --git a/lib/readability/candidate.ex b/lib/readability/candidate.ex
new file mode 100644
index 0000000..7655d37
--- /dev/null
+++ b/lib/readability/candidate.ex
@@ -0,0 +1,6 @@
+defmodule Readability.Candidate do
+ @moduledoc """
+  A candidate that can be the article
+ """
+ defstruct html_tree: {}, score: 0, tree_depth: 0
+end
diff --git a/lib/readability/candidate/cleaner.ex b/lib/readability/candidate/cleaner.ex
new file mode 100644
index 0000000..41c65aa
--- /dev/null
+++ b/lib/readability/candidate/cleaner.ex
@@ -0,0 +1,48 @@
+defmodule Readability.Candidate.Cleaner do
+ @moduledoc """
+  Clean html tree to prepare candidates.
+ It transforms misused tags and removes unlikely candidates.
+ """
+
+ alias Readability.Helper
+
+ @type html_tree :: tuple | list
+
+ @doc """
+  Transform misused <div>s that do not contain other block elements into <p>s
+ """
+ @spec transform_misused_div_to_p(html_tree) :: html_tree
+ def transform_misused_div_to_p(content) when is_binary(content), do: content
+ def transform_misused_div_to_p([]), do: []
+ def transform_misused_div_to_p([h|t]) do
+ [transform_misused_div_to_p(h)|transform_misused_div_to_p(t)]
+ end
+ def transform_misused_div_to_p({tag, attrs, inner_tree}) do
+ if misused_divs?(tag, inner_tree), do: tag = "p"
+ {tag, attrs, transform_misused_div_to_p(inner_tree)}
+ end
+
+ @doc """
+ Remove unlikely html tree
+ """
+ @spec remove_unlikely_tree(html_tree) :: html_tree
+ def remove_unlikely_tree(html_tree) do
+ Helper.remove_tag(html_tree, &unlikely_tree?(&1))
+ end
+
+ defp misused_divs?("div", inner_tree) do
+ !(Floki.raw_html(inner_tree) =~ Readability.regexes[:div_to_p_elements])
+ end
+ defp misused_divs?(_, _), do: false
+
+ defp unlikely_tree?({tag, attrs, _}) do
+ idclass_str = attrs
+ |> Enum.filter_map(&(elem(&1, 0) =~ ~r/id|class/i), &(elem(&1, 1)))
+ |> Enum.join("")
+ str = tag <> idclass_str
+
+ str =~ Readability.regexes[:unlikely_candidate]
+ && !(str =~ Readability.regexes[:ok_maybe_its_a_candidate])
+ && tag != "html"
+ end
+end
diff --git a/lib/readability/candidate/scoring.ex b/lib/readability/candidate/scoring.ex
new file mode 100644
index 0000000..ed9edbb
--- /dev/null
+++ b/lib/readability/candidate/scoring.ex
@@ -0,0 +1,89 @@
+defmodule Readability.Candidate.Scoring do
+ @moduledoc """
+ Score html tree
+ """
+ alias Readability.Helper
+
+ @element_scores %{"div" => 5,
+ "blockquote" => 3,
+ "form" => -3,
+ "th" => -5
+ }
+
+ @type html_tree :: tuple | list
+ @type options :: list
+
+ @doc """
+ Score html tree by some algorithm that check children nodes, attributes, link densities, etcs..
+ options -> weight_classes :: boolean, calculate weight class
+ """
+ @spec calc_score(html_tree, options) :: number
+ def calc_score(html_tree, opts \\ []) do
+ score = calc_node_score(html_tree, opts)
+ score = score + calc_children_content_score(html_tree) + calc_grand_children_content_score(html_tree)
+ score * (1 - calc_link_density(html_tree))
+ end
+
+ defp calc_content_score(html_tree) do
+ score = 1
+ inner_text = html_tree |> Floki.text
+ split_score = inner_text |> String.split(",") |> length
+ length_score = [(String.length(inner_text) / 100), 3] |> Enum.min
+ score + split_score + length_score
+ end
+
+ defp calc_node_score({tag, attrs, _}, opts) do
+ score = 0
+ if opts[:weight_classes], do: score = score + class_weight(attrs)
+ score + (@element_scores[tag] || 0)
+ end
+ defp calc_node_score([h|t], opts) do
+ calc_node_score(h, opts) + calc_node_score(t, opts)
+ end
+ defp calc_node_score([], _), do: 0
+
+ def class_weight(attrs) do
+ weight = 0
+ class = attrs |> List.keyfind("class", 0, {"", ""}) |> elem(1)
+ id = attrs |> List.keyfind("id", 0, {"", ""}) |> elem(1)
+
+ if class =~ Readability.regexes[:positive], do: weight = weight + 25
+ if id =~ Readability.regexes[:positive], do: weight = weight + 25
+ if class =~ Readability.regexes[:negative], do: weight = weight - 25
+ if id =~ Readability.regexes[:negative], do: weight = weight - 25
+
+ weight
+ end
+
+ def calc_link_density(html_tree) do
+ link_length = html_tree
+ |> Floki.find("a")
+ |> Floki.text
+ |> String.length
+
+ text_length = html_tree
+ |> Floki.text
+ |> String.length
+
+ if text_length == 0 do
+ 0
+ else
+ link_length / text_length
+ end
+ end
+
+ defp calc_children_content_score({_, _, children_tree}) do
+ children_tree
+ |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
+ |> calc_content_score
+ end
+
+ defp calc_grand_children_content_score({_, _, children_tree}) do
+ score = children_tree
+ |> Enum.filter_map(&is_tuple(&1), &elem(&1, 2))
+ |> List.flatten
+ |> Enum.filter(&(is_tuple(&1) && Helper.candidate_tag?(&1)))
+ |> calc_content_score
+ score / 2
+ end
+end
diff --git a/lib/readability/candidate_finder.ex b/lib/readability/candidate_finder.ex
new file mode 100644
index 0000000..4f4896a
--- /dev/null
+++ b/lib/readability/candidate_finder.ex
@@ -0,0 +1,60 @@
+defmodule Readability.CandidateFinder do
+ @moduledoc """
+  The building and finding candidates engine.
+  It traverses the HTML tree searching, removing, and scoring nodes.
+ """
+
+ alias Readability.Helper
+ alias Readability.Candidate
+ alias Readability.Candidate.Scoring
+
+ @type html_tree :: tuple | list
+ @type options :: list
+
+ @doc """
+  Find candidates that should be a meaningful article by analysing nodes
+ """
+ @spec find(html_tree, options, number) :: [Candidate.t]
+ def find(_, opts \\ [], tree_depth \\ 0)
+ def find([], _, _), do: []
+ def find([h|t], opts, tree_depth) do
+ [find(h, opts, tree_depth) | find(t, opts, tree_depth)]
+ |> List.flatten
+ end
+ def find(text, _, _) when is_binary(text), do: []
+ def find({tag, attrs, inner_tree}, opts, tree_depth) do
+ html_tree = {tag, attrs, inner_tree}
+ if candidate?(html_tree) do
+ candidate = %Candidate{html_tree: html_tree,
+ score: Scoring.calc_score(html_tree, opts),
+ tree_depth: tree_depth}
+
+ [candidate | find(inner_tree, opts, tree_depth + 1)]
+ else
+ find(inner_tree, opts, tree_depth + 1)
+ end
+ end
+
+ @doc """
+ Find the highest score candidate.
+ """
+ @spec find_best_candidate([Candidate.t]) :: Candidate.t
+ def find_best_candidate([]), do: nil
+ def find_best_candidate(candidates) do
+ candidates
+ |> Enum.max_by(fn(candidate) -> candidate.score end)
+ end
+
+ defp candidate?(_, depth \\ 0)
+ defp candidate?(_, depth) when depth > 2, do: false
+ defp candidate?([h|t], depth), do: candidate?(h, depth) || candidate?(t, depth)
+ defp candidate?([], _), do: false
+ defp candidate?(text, _) when is_binary(text), do: false
+ defp candidate?({_, _, inner_tree} = html_tree, depth) do
+ if Helper.candidate_tag?(html_tree) do
+ true
+ else
+ candidate?(inner_tree, depth + 1)
+ end
+ end
+end
diff --git a/lib/readability/content_finder.ex b/lib/readability/content_finder.ex
deleted file mode 100644
index 43c5991..0000000
--- a/lib/readability/content_finder.ex
+++ /dev/null
@@ -1,94 +0,0 @@
-defmodule Readability.ContentFinder do
- @moduledoc """
- ContentFinder uses a variety of metrics for finding the content
- that is most likely to be the stuff a user wants to read.
- Then return it wrapped up in a div.
- """
-
- @regexes [ unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
- okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
- positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
- negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
- divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
- replaceBrsRe: ~r/( ]*>[ \n\r\t]*){2,}/i,
- replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
- trimRe: ~r/^\s+|\s+$/,
- normalizeRe: ~r/\s{2,}/,
- killBreaksRe: ~r/( (\s| ?)*){1,}/,
- videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
- ]
-
- @type html_tree :: tuple | list
-
- @spec content(html_tree) :: html_tree
-
- def content(html_tree, options \\ []) do
- candidate = html_tree
- |> preapre_cadidates
-
- best_candidate = candidate
- |> select_best_candidate
-
- candidate
- |> fix_relative_uris
- end
-
- defp preapre_cadidates(html_tree) do
- html_tree
- |> Floki.filter_out("script")
- |> Floki.filter_out("style")
- |> remove_unlikely_candidates
- |> transform_misused_divs_into_paragraphs
- end
-
- @doc """
- Remove unlikely tag nodes
- """
-
- @spec remove_unlikely_candidates(html_tree) :: html_tree
-
- def remove_unlikely_candidates(content) when is_binary(content), do: content
- def remove_unlikely_candidates([]), do: []
- def remove_unlikely_candidates([h|t]) do
- case remove_unlikely_candidates(h) do
- nil -> remove_unlikely_candidates(t)
- html_tree -> [html_tree|remove_unlikely_candidates(t)]
- end
- end
- def remove_unlikely_candidates({tag_name, attrs, inner_tree}) do
- cond do
- unlikely_candidate?(tag_name, attrs) -> nil
- true -> {tag_name, attrs, remove_unlikely_candidates(inner_tree)}
- end
- end
- defp unlikely_candidate?(tag_name, attrs) do
- idclass_str = attrs
- |> Enum.filter_map(fn(attr) -> elem(attr, 0) =~ ~r/id|class/i end,
- fn(attr) -> elem(attr, 1) end)
- |> Enum.join("")
- str = tag_name <> idclass_str
- str =~ @regexes[:unlikelyCandidatesRe] && !(str =~ @regexes[:okMaybeItsACandidateRe]) && tag_name != "html"
- end
-
- def transform_misused_divs_into_paragraphs(content) when is_binary(content), do: content
- def transform_misused_divs_into_paragraphs([]), do: []
- def transform_misused_divs_into_paragraphs([h|t]) do
- [transform_misused_divs_into_paragraphs(h)|transform_misused_divs_into_paragraphs(t)]
- end
- def transform_misused_divs_into_paragraphs({tag_name, attrs, inner_tree} = html_tree) do
- if misused_divs?(tag_name, inner_tree), do: tag_name = "p"
- {tag_name, attrs, transform_misused_divs_into_paragraphs(inner_tree)}
- end
- defp misused_divs?("div", inner_tree) do
- !(Floki.raw_html(inner_tree) =~ @regexes[:divToPElementsRe])
- end
- defp misused_divs?(_, _), do: false
-
- defp select_best_candidate(html_tree) do
- html_tree
- end
-
- defp fix_relative_uris(html_tree) do
- html_tree
- end
-end
diff --git a/lib/readability/helper.ex b/lib/readability/helper.ex
index 3650cf6..9551da3 100644
--- a/lib/readability/helper.ex
+++ b/lib/readability/helper.ex
@@ -1,25 +1,93 @@
defmodule Readability.Helper do
@moduledoc """
- Utilities
+ Helpers for parsing, updating, removing html tree
"""
@type html_tree :: tuple | list
@doc """
- change existing tags by selector
+ Change existing tags by selector
"""
-
@spec change_tag(html_tree, String.t, String.t) :: html_tree
-
+ def change_tag(content, _, _) when is_binary(content), do: content
+ def change_tag([], _, _), do: []
+ def change_tag([h|t], selector, tag) do
+ [change_tag(h, selector, tag)|change_tag(t, selector, tag)]
+ end
def change_tag({tag_name, attrs, inner_tree}, tag_name, tag) do
{tag, attrs, change_tag(inner_tree, tag_name, tag)}
end
def change_tag({tag_name, attrs, html_tree}, selector, tag) do
{tag_name, attrs, change_tag(html_tree, selector, tag)}
end
- def change_tag([h|t], selector, tag) do
- [change_tag(h, selector, tag)|change_tag(t, selector, tag)]
+
+ @spec remove_attrs(html_tree, String.t | [String.t] | Regex.t) :: html_tree
+ def remove_attrs(content, _) when is_binary(content), do: content
+ def remove_attrs([], _), do: []
+ def remove_attrs([h|t], t_attrs) do
+ [remove_attrs(h, t_attrs)|remove_attrs(t, t_attrs)]
+ end
+ def remove_attrs({tag_name, attrs, inner_tree}, target_attr) do
+ reject_fun = fn(attr) -> attr end
+ cond do
+ is_binary(target_attr) ->
+ reject_fun = fn(attr) -> elem(attr, 0) == target_attr end
+ Regex.regex?(target_attr) ->
+ reject_fun = fn(attr) -> elem(attr, 0) =~ target_attr end
+ is_list(target_attr) ->
+ reject_fun = fn(attr) -> Enum.member?(target_attr, elem(attr, 0)) end
+ true -> nil
+ end
+ {tag_name, Enum.reject(attrs, reject_fun), remove_attrs(inner_tree, target_attr)}
+ end
+
+
+ @doc """
+ Remove tags
+ """
+ @spec remove_tag(html_tree, fun) :: html_tree
+ def remove_tag(content, _) when is_binary(content), do: content
+ def remove_tag([], _), do: []
+ def remove_tag([h|t], fun) do
+ node = remove_tag(h, fun)
+ if is_nil(node) do
+ remove_tag(t, fun)
+ else
+ [node|remove_tag(t, fun)]
+ end
+ end
+ def remove_tag({tag, attrs, inner_tree} = html_tree, fun) do
+ if fun.(html_tree) do
+ nil
+ else
+ {tag, attrs, remove_tag(inner_tree, fun)}
+ end
+ end
+
+ @doc """
+ count only text length
+ """
+ @spec text_length(html_tree) :: number
+ def text_length(html_tree) do
+ html_tree |> Floki.text |> String.strip |> String.length
+ end
+
+ @doc """
+ Check html_tree can be candidate or not.
+ """
+ @spec candidate_tag?(html_tree) :: boolean
+ def candidate_tag?(html_tree) do
+ Enum.any?(candidates_selector, fn(selector) ->
+ Floki.Selector.match?(html_tree, selector)
+ && (text_length(html_tree)) >= Readability.default_options[:min_text_length]
+ end)
+ end
+
+ defp candidates_selector do
+ ["p", "td"]
+ |> Enum.map(fn(s) ->
+ tokens = Floki.SelectorTokenizer.tokenize(s)
+ Floki.SelectorParser.parse(tokens)
+ end)
end
- def change_tag([], selector, tag), do: []
- def change_tag(content, selector, tag) when is_binary(content), do: content
end
diff --git a/lib/readability/sanitizer.ex b/lib/readability/sanitizer.ex
new file mode 100644
index 0000000..b8eae7a
--- /dev/null
+++ b/lib/readability/sanitizer.ex
@@ -0,0 +1,85 @@
+defmodule Readability.Sanitizer do
+ @moduledoc """
+ Clean an element of all tags of type "tag" if they look fishy.
+ "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
+ """
+
+ alias Readability.Helper
+ alias Readability.Candidate
+ alias Readability.Candidate.Scoring
+
+ @type html_tree :: tuple | list
+
+ @doc """
+ Sanitizes article html tree
+ """
+ @spec sanitize(html_tree, [Candidate.t], list) :: html_tree
+ def sanitize(html_tree, candidates, opts \\ []) do
+ html_tree = html_tree
+ |> Helper.remove_tag(&clean_headline_tag?(&1))
+ |> Helper.remove_tag(&clean_unlikely_tag?(&1))
+ |> Helper.remove_tag(&clean_empty_p?(&1))
+
+ if opts[:clean_conditionally] do
+ html_tree = html_tree
+      |> Helper.remove_tag(conditionally_cleaning_fn(candidates))
+    end
+
+    html_tree |> Helper.remove_attrs("style")
+  end
+
+  defp conditionally_cleaning_fn(candidates) do
+ fn({tag, attrs, _} = tree) ->
+ if Enum.any?(["table", "ul", "div"], &(&1 == tag)) do
+ weight = Scoring.class_weight(attrs)
+ same_tree = candidates
+ |> Enum.find(%Candidate{}, &(&1.html_tree == tree))
+ list? = tag == "ul"
+ cond do
+          weight + same_tree.score < 0 ->
+            true
+
+ length(Regex.scan(~r/\,/, Floki.text(tree))) < 10 ->
+ # If there are not very many commas, and the number of
+ # non-paragraph elements is more than paragraphs or other
+ # ominous signs, remove the element.
+ p_len = tree |> Floki.find("p") |> length
+ img_len = tree |> Floki.find("img") |> length
+ li_len = tree |> Floki.find("li") |> length
+ input_len = tree |> Floki.find("input") |> length
+ embed_len = tree
+ |> Floki.find("embed")
+ |> Enum.reject(&(&1 =~ Readability.regexes[:video]))
+ |> length
+
+          link_density = Scoring.calc_link_density(tree)
+          content_len = Helper.text_length(tree)
+
+          img_len > p_len # too many images
+          || (!list? && li_len > p_len) # more <li>s than <p>s
+          || input_len > (p_len / 3) # more than 1/3rd as many <input>s as <p>s
+          || (!list? && content_len < Readability.default_options[:min_text_length] && img_len != 1) # too short a content length without a single image
+          || (weight < 25 && link_density > 0.2) # too many links for its weight (#{weight})
+          || (weight >= 25 && link_density > 0.5) # too many links for its weight (#{weight})
+          || ((embed_len == 1 && content_len < 75) || embed_len > 1) # <embed>s with too short a content length, or too many <embed>s
').gsub(REGEXES[:replaceFontsRe], '<\1span>')
- @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
- @weight_classes = @options[:weight_classes]
- @clean_conditionally = @options[:clean_conditionally]
- @best_candidate_has_image = true
- make_html
- handle_exclusions!(@options[:whitelist], @options[:blacklist])
- end
-
- def images(content=nil, reload=false)
- begin
- require 'fastimage'
- rescue LoadError
- raise "Please install fastimage in order to use the #images feature."
- end
-
- @best_candidate_has_image = false if reload
-
- prepare_candidates
- list_images = []
- tested_images = []
- content = @best_candidate[:elem] unless reload
-
- return list_images if content.nil?
- elements = content.css("img").map(&:attributes)
-
- elements.each do |element|
- next unless element["src"]
-
- url = element["src"].value
- height = element["height"].nil? ? 0 : element["height"].value.to_i
- width = element["width"].nil? ? 0 : element["width"].value.to_i
-
- if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
- image = get_image_size(url)
- next unless image
- else
- image = {:width => width, :height => height}
- end
-
- image[:format] = File.extname(url).gsub(".", "")
-
- if tested_images.include?(url)
- debug("Image was tested: #{url}")
- next
- end
-
- tested_images.push(url)
- if image_meets_criteria?(image)
- list_images << url
- else
- debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
- end
- end
-
- (list_images.empty? and content != @html) ? images(@html, true) : list_images
- end
-
- def images_with_fqdn_uris!(source_uri)
- images_with_fqdn_uris(@html, source_uri)
- end
-
- def images_with_fqdn_uris(document = @html.dup, source_uri)
- uri = URI.parse(source_uri)
- host = uri.host
- scheme = uri.scheme
- port = uri.port # defaults to 80
-
- base = "#{scheme}://#{host}:#{port}/"
-
- images = []
- document.css("img").each do |elem|
- begin
- elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
- images << elem['src'].to_s
- rescue URI::InvalidURIError => exc
- elem.remove
- end
- end
-
- images(document,true)
- end
-
- def get_image_size(url)
- w, h = FastImage.size(url)
- raise "Couldn't get size." if w.nil? || h.nil?
- {:width => w, :height => h}
- rescue => e
- debug("Image error: #{e}")
- nil
- end
-
- def image_meets_criteria?(image)
- return false if options[:ignore_image_format].include?(image[:format].downcase)
- image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
- end
-
- def title
- title = @html.css("title").first
- title ? title.text : nil
- end
-
- # Look through the @html document looking for the author
- # Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted)
- # Returns nil if no author is detected
- def author
- # Let's grab this author:
- #
- author_elements = @html.xpath('//meta[@name = "dc.creator"]')
- unless author_elements.empty?
- author_elements.each do |element|
- return element['content'].strip if element['content']
- end
- end
-
- # Now let's try to grab this
- # ByAustin Fonacier
- #
- author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]')
- unless author_elements.empty?
- author_elements.each do |element|
- return element.text.strip if element.text
- end
- end
-
- # Now let's try to grab this
- # Danny Banks (rel)
- # TODO: strip out the (rel)?
- author_elements = @html.xpath('//a[@rel = "author"]')
- unless author_elements.empty?
- author_elements.each do |element|
- return element.text.strip if element.text
- end
- end
-
- author_elements = @html.xpath('//*[@id = "author"]')
- unless author_elements.empty?
- author_elements.each do |element|
- return element.text.strip if element.text
- end
- end
- end
-
- def content(remove_unlikely_candidates = :default)
- @remove_unlikely_candidates = false if remove_unlikely_candidates == false
-
- prepare_candidates
- article = get_article(@candidates, @best_candidate)
-
- cleaned_article = sanitize(article, @candidates, options)
- if article.text.strip.length < options[:retry_length]
- if @remove_unlikely_candidates
- @remove_unlikely_candidates = false
- elsif @weight_classes
- @weight_classes = false
- elsif @clean_conditionally
- @clean_conditionally = false
- else
- # nothing we can do
- return cleaned_article
- end
-
- make_html
- content
- else
- cleaned_article
- end
- end
-
- def get_article(candidates, best_candidate)
- # Now that we have the top candidate, look through its siblings for content that might also be related.
- # Things like preambles, content split by ads that we removed, etc.
-
- sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
- output = Nokogiri::XML::Node.new('div', @html)
- best_candidate[:elem].parent.children.each do |sibling|
- append = false
- append = true if sibling == best_candidate[:elem]
- append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
-
- if sibling.name.downcase == "p"
- link_density = get_link_density(sibling)
- node_content = sibling.text
- node_length = node_content.length
-
- append = if node_length > 80 && link_density < 0.25
- true
- elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
- true
- end
- end
-
- if append
- sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
- sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
- output << sibling_dup
- end
- end
-
- output
- end
-
- def select_best_candidate(candidates)
- sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
-
- debug("Top 5 candidates:")
- sorted_candidates[0...5].each do |candidate|
- debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
- end
-
- best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
- debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
-
- best_candidate
- end
-
- def get_link_density(elem)
- link_length = elem.css("a").map(&:text).join("").length
- text_length = elem.text.length
- link_length / text_length.to_f
- end
-
- def class_weight(e)
- weight = 0
- return weight unless @weight_classes
-
- if e[:class] && e[:class] != ""
- weight -= 25 if e[:class] =~ REGEXES[:negativeRe]
- weight += 25 if e[:class] =~ REGEXES[:positiveRe]
- end
-
- if e[:id] && e[:id] != ""
- weight -= 25 if e[:id] =~ REGEXES[:negativeRe]
- weight += 25 if e[:id] =~ REGEXES[:positiveRe]
- end
-
- weight
- end
-
- ELEMENT_SCORES = {
- 'div' => 5,
- 'blockquote' => 3,
- 'form' => -3,
- 'th' => -5
- }.freeze
-
- def score_node(elem)
- content_score = class_weight(elem)
- content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0)
- { :content_score => content_score, :elem => elem }
- end
-
- def debug(str)
- puts str if options[:debug]
- end
-
- def sanitize(node, candidates, options = {})
- node.css("h1, h2, h3, h4, h5, h6").each do |header|
- header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
- end
-
- node.css("form, object, iframe, embed").each do |elem|
- elem.remove
- end
-
- if @options[:remove_empty_nodes]
- # remove
tags that have no text content - this will also remove p tags that contain only images.
- node.css("p").each do |elem|
- elem.remove if elem.content.strip.empty?
- end
- end
-
- # Conditionally clean
s,
s, and
s
- clean_conditionally(node, candidates, "table, ul, div")
-
- # We'll sanitize all elements using a whitelist
- base_whitelist = @options[:tags] || %w[div p]
- # We'll add whitespace instead of block elements,
- # so a b will have a nice space between them
- base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
-
- # Use a hash for speed (don't want to make a million calls to include?)
- whitelist = Hash.new
- base_whitelist.each {|tag| whitelist[tag] = true }
- replace_with_whitespace = Hash.new
- base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
-
- ([node] + node.css("*")).each do |el|
- # If element is in whitelist, delete all its attributes
- if whitelist[el.node_name]
- el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
-
- # Otherwise, replace the element with its contents
- else
- # If element is root, replace the node as a text node
- if el.parent.nil?
- node = Nokogiri::XML::Text.new(el.text, el.document)
- break
- else
- if replace_with_whitespace[el.node_name]
- el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document))
- else
- el.swap(Nokogiri::XML::Text.new(el.text, el.document))
- end
- end
- end
-
- end
-
- s = Nokogiri::XML::Node::SaveOptions
- save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML
- html = node.serialize(:save_with => save_opts)
-
- # Get rid of duplicate whitespace
- return html.gsub(/[\r\n\f]+/, "\n" )
- end
-
- def clean_conditionally(node, candidates, selector)
- return unless @clean_conditionally
- node.css(selector).each do |el|
- weight = class_weight(el)
- content_score = candidates[el] ? candidates[el][:content_score] : 0
- name = el.name.downcase
-
- if weight + content_score < 0
- el.remove
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
- elsif el.text.count(",") < 10
- counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
- counts["li"] -= 100
-
- # For every img under a noscript tag discount one from the count to avoid double counting
- counts["img"] -= el.css("noscript").css("img").length
-
- content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
- link_density = get_link_density(el)
-
- reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
- if reason
- debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
- el.remove
- end
- end
- end
- end
-
- def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
- if (counts["img"] > counts["p"]) && (counts["img"] > 1)
- "too many images"
- elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
- "more <li>s than <p>s"
- elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
- "too short a content length without a single image"
- elsif weight < 25 && link_density > 0.2
- "too many links for its weight (#{weight})"
- elsif weight >= 25 && link_density > 0.5
- "too many links for its weight (#{weight})"
- elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
- "<embed>s with too short a content length, or too many <embed>s"
- else
- nil
- end
- end
-
- private
-
- # 제거항목 추가항목을 지정한다.
- def handle_exclusions!(whitelist, blacklist)
- return unless whitelist || blacklist
-
- if blacklist
- elems = @html.css(blacklist)
- if elems
- elems.each do |e|
- e.remove
- end
- end
- end
-
- if whitelist
- elems = @html.css(whitelist).to_s
-
- if body = @html.at_css('body')
- body.inner_html = elems
- end
- end
-
- @input = @html.to_s
- end
-
- # 코멘트가 제거된 기본 html 노드 반환
- def make_html(whitelist=nil, blacklist=nil)
- @html = Nokogiri::HTML(@input, nil, @options[:encoding])
- # In case document has no body, such as from empty string or redirect
- @html = Nokogiri::HTML('', nil, @options[:encoding]) if @html.css('body').length == 0
- # Remove html comment tags
- @html.xpath('//comment()').each { |i| i.remove }
- end
-
-
- def prepare_candidates
- @html.css("script, style").each { |i| i.remove }
- remove_unlikely_candidates! if @remove_unlikely_candidates
- transform_misused_divs_into_paragraphs!
-
- @candidates = score_paragraphs(options[:min_text_length])
- @best_candidate = select_best_candidate(@candidates)
- end
-
- # 가망없는 후보자를 제거한다. (명확한 후보자는 제외하고 제거한다.)
- def remove_unlikely_candidates!
- @html.css("*").each do |elem|
- str = "#{elem[:class]}#{elem[:id]}"
- if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body')
- debug("Removing unlikely candidate - #{str}")
- elem.remove
- end
- end
- end
-
- # 잘못 사용되고 있는 DIV를 p로 변환한다.
- def transform_misused_divs_into_paragraphs!
- @html.css("*").each do |elem|
- if elem.name.downcase == "div"
- # transform <div>s that do not contain other block elements into <p>s
- if elem.inner_html !~ REGEXES[:divToPElementsRe]
- debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
- elem.name = "p"
- end
- else
- # wrap text nodes in p tags
-# elem.children.each do |child|
-# if child.text?
-# debug("wrapping text node with a p")
-# child.swap("<p>#{child.text}</p>")
-# end
-# end
- end
- end
- end
-
- # 가능노드에 점수를 매긴다.
- def score_paragraphs(min_text_length)
- candidates = {}
- @html.css("p,td").each do |elem|
- parent_node = elem.parent
- grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
- inner_text = elem.text
-
- # If this paragraph is less than 25 characters, don't even count it.
- next if inner_text.length < min_text_length
-
- candidates[parent_node] ||= score_node(parent_node)
- candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
-
- content_score = 1
- content_score += inner_text.split(',').length
- content_score += [(inner_text.length / 100).to_i, 3].min
-
- candidates[parent_node][:content_score] += content_score
- candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
- end
-
- # Scale the final candidates score based on link density. Good content should have a
- # relatively small link density (5% or less) and be mostly unaffected by this operation.
- candidates.each do |elem, candidate|
- candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
- end
-
- candidates
- end
- end
-end
diff --git a/mix.exs b/mix.exs
index 2720fad..cd6a2a9 100644
--- a/mix.exs
+++ b/mix.exs
@@ -1,4 +1,7 @@
defmodule Readability.Mixfile do
+ @moduledoc """
+ """
+
use Mix.Project
def project do
@@ -15,7 +18,8 @@ defmodule Readability.Mixfile do
# Type "mix help compile.app" for more information
def application do
[applications: [:logger,
- :floki
+ :floki,
+ :httpoison
]]
end
@@ -29,6 +33,10 @@ defmodule Readability.Mixfile do
#
# Type "mix help deps" for more examples and options
defp deps do
- [{:floki, "~> 0.8.0"}]
+ [{:floki, "~> 0.8.0"},
+ {:httpoison, "~> 0.8.0"},
+ {:credo, "~> 0.3", only: [:dev, :test]},
+ {:dialyxir, "~> 0.3", only: [:dev]}
+ ]
end
end
diff --git a/mix.lock b/mix.lock
index 7874674..9aaf1ae 100644
--- a/mix.lock
+++ b/mix.lock
@@ -1,2 +1,12 @@
-%{"floki": {:hex, :floki, "0.8.0"},
- "mochiweb_html": {:hex, :mochiweb_html, "2.13.0"}}
+%{"bunt": {:hex, :bunt, "0.1.5"},
+ "certifi": {:hex, :certifi, "0.4.0"},
+ "credo": {:hex, :credo, "0.3.12"},
+ "dialyxir": {:hex, :dialyxir, "0.3.3"},
+ "floki": {:hex, :floki, "0.8.0"},
+ "hackney": {:hex, :hackney, "1.6.0"},
+ "httpoison": {:hex, :httpoison, "0.8.3"},
+ "idna": {:hex, :idna, "1.2.0"},
+ "metrics": {:hex, :metrics, "1.0.1"},
+ "mimerl": {:hex, :mimerl, "1.0.2"},
+ "mochiweb_html": {:hex, :mochiweb_html, "2.13.0"},
+ "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.0"}}
diff --git a/test.html b/test.html
new file mode 100644
index 0000000..b20208c
--- /dev/null
+++ b/test.html
@@ -0,0 +1 @@
+
One afternoon in December, the Arts of Korea Gallery at the Metropolitan Museum of Art was abuzz, as dignitaries from the South Korean government inspected the display, while members of the press watched. Then, Oh Seung-je, the director of the Korean Cultural Service of New York, and Daniel H. Weiss, the Met’s president, ceremonially signed a pact establishing a long-term partnership between the Met and South Korea’s Ministry of Culture, Sports and Tourism.
Along with the agreement came a $1 million gift to the Met from South Korea. It will pay for enhancement of the gallery, loans from Korean museums, a major exhibition of Korean art in 2018, new research, and residencies at the Met for curators from the National Museum of Korea.
It was far from the first time the South Korean government or its affiliates, in tandem with Korean corporations, had rained money on American museums to ensure that Korean art was seen here. The Met’s Arts of Korea Gallery was created in 1998 with a gift from the Korea Foundation, an independent organization affiliated with South Korea’s Ministry of Foreign Affairs.
Other countries, including Italy, Japan and the Netherlands, promote their visual culture in the United States — sponsoring museum exhibitions, lending valuable artworks and so on. What is striking about South Korea is that it has systematically invested in building and maintaining permanent places to see Korean art at American museums, and in underwriting exhibitions that promote the country’s ancient and contemporary art in the United States.
“Since the American public only has rather limited opportunities to view and appreciate Korean artworks, concerted efforts are necessary to bring attention to the richness of Korea’s culture and arts,” Yoon Keum-jin, the executive vice president in the Korea Foundation’s Washington office, wrote in an email.
Since its founding 25 years ago, the Korea Foundation alone has midwifed the birth of permanent Korean art galleries at 18 American museums, including the Museum of Fine Arts, Houston; the Cleveland Museum of Art; the Seattle Art Museum; the Los Angeles County Museum of Art; and the Museum of Fine Arts, Boston. It has spent more than $9 million to construct these galleries, Ms. Yoon said. Without such funding, the museums say, many of these galleries would probably not exist.
Reasons for this effort are both political and cultural. “The Koreans have a tradition of centralized planning and the government being involved in long-term economic planning and investment,” said Timothy F. Rub, director of the Philadelphia Museum of Art, which received a grant from the Korea Foundation to upgrade its Korean gallery, along with other aid over the years. “They see the promulgation of culture as an instrument of economic policy.”
They also perceive a real need. “Many are aware of Korea through the rapid development of its technology and electronics sectors, but it is a country equally abundant in the cultural arts,” said Mr. Oh of the Korean Culture Service.
Historically, Mr. Rub added, Korea’s culture has been overshadowed. “Look at the collections in the United States of East Asian art. Japan and China are far more prominent than Korea.” Yet, he said, “look at the history of Korean art; it was very much prized by the Japanese and the Chinese.”
Korea Foundation Day at the Museum of Fine Arts, Boston in 2012. Museum of Fine Arts, Boston
For museums in the Korean fold, the support can be steady. Soon after the Houston museum received $470,000 to build its gallery, the Korea Foundation provided $30,000 for programming. In 2010 it gave the museum $150,000 to organize and present “Your Bright Future: 12 Contemporary Artists From Korea,” the first major American museum exhibition of contemporary Korean art in years. Since then, the Korea Foundation has donated more money for programming of Korean art.
The Los Angeles County Museum of Art, which was a co-organizer of “Your Bright Future,” also received money from the Korea Foundation for the show, which was sponsored by a Korean container company.
The Koreans have also proved receptive to American ideas. Some years ago, Mr. Rub approached the National Museum of Korea with the notion of showcasing the art of the Joseon dynasty, which ruled Korea from 1392 to 1910. That turned into a cultural exchange that included the Museum of Fine Arts, Houston, and the Los Angeles County Museum of Art. They each presented the Korean exhibition and sent an exhibition of American art to South Korea.
South Korea also supported a series of workshops between 1999 and 2013. The Korea Foundation hosted curators from 26 countries, who traveled to South Korea to hear art experts, take field trips to cultural sites and learn technical information about Korean art and its proper display.
Despite the current economic uncertainties that have prompted many governments to cut their funding of the arts, Mr. Oh said that South Korea would continue its overseas largess. “I believe that economic prosperity and cultural wealth go hand in hand,” he said in an email. “This is why it is important to even further promote the cultural arts during times of economic slowdown.”
What’s more, he added, South Korea’s president, Park Geun-hye, has made cultural enrichment one of her major priorities.
Media captionMr Obama told the BBC that gun control was his biggest frustration
President Barack Obama has admitted that his failure to pass "common sense gun safety laws" in the US is the greatest frustration of his presidency.
In an interview with the BBC, Mr Obama said it was "distressing" not to have made progress on the issue "even in the face of repeated mass killings".
He vowed to keep trying, but the BBC's North America editor Jon Sopel said the president did not sound very confident.
However, Mr Obama said race relations had improved during his presidency.
Hours after the interview, a gunman opened fire at a cinema in the US state of Louisiana, killing two people and injuring several others before shooting himself.
Mr Obama lands in Kenya later on Friday for his first visit since becoming president.
But with just 18 months left in power, he said gun control was the area where he has been "most frustrated and most stymied" since coming to power in 2009.
"If you look at the number of Americans killed since 9/11 by terrorism, it's less than 100. If you look at the number that have been killed by gun violence, it's in the tens of thousands," Mr Obama said.
Media captionBarack Obama: "Great Britain has always been our best partner"
+
+
+ The president said he would continue fighting for greater gun control laws
+
+
"For us not to be able to resolve that issue has been something that is distressing," he added.
Mr Obama has pushed for stricter gun control throughout his presidency but has been unable to secure any significant changes to the laws.
After nine African-American churchgoers were killed in South Carolina in June, he admitted "politics in this town" meant there were few options available.
Analysis: Jon Sopel, BBC News, Washington
Nine months ago, the president seemed like a spent force, after taking a beating in the midterm elections, during which members of his own party were reluctant to campaign on his record.
But the man sat before me today was relaxed and confident, buoyed by a string of "wins" on healthcare, Cuba and Iran, after bitter and ongoing battles with his many critics.
There was a momentary flicker across the president's face as if to say "You think you got me?" before his smile returned and he proceeded to talk about how Congress would come round.
But notably, he did not give a direct answer to that question, which leaves me with the impression that he has persuaded precisely zero.
Media captionThe BBC contrasts President Obama's reactions after mass shootings, with the levels of US gun ownership during his terms in office. (Video by David Botti)
On race relations, Mr Obama said recent concerns around policing and mass incarcerations were "legitimate and deserve intense attention" but insisted progress had been made.
Children growing up during the eight years of his presidency "will have a different view of race relations in this country and what's possible," he said.
"There are going to be tensions that arise. But if you look at my daughters' generation, they have an attitude about race that's entirely different than even my generation."
Talking about how he was feeling after his recent successes, he said "every president, every leader has strengths and weaknesses".
"One of my strengths is I have a pretty even temperament. I don't get too high when it's high and I don't get too low when it's low," he said.
+
+
+ Kenya is getting ready to welcome the US president
+
+
Kenya trip
Mr Obama was speaking to the BBC at the White House before departing for Kenya.
His father was Kenyan and the president is expected to meet relatives in Nairobi.
Mr Obama has faced criticism in the country after the US legalised gay marriage. However, in his interview, the president said he would not fall silent on the issue.
Media captionPresident Obama told the BBC he would deliver a blunt message on gay rights when he travelled to Africa
"I am not a fan of discrimination and bullying of anybody on the basis of race, on the basis of religion, on the basis of sexual orientation or gender," he said.
The president also admitted that some African governments, including Kenya's, needed to improve their records on human rights and democracy. However, he defended his decision to engage with and visit those governments.
"Well, they're not ideal institutions. But what we found is, is that when we combined blunt talk with engagement, that gives us the best opportunity to influence and open up space for civil society."
Mr Obama will become the first US president to address the African Union when he travels on to Ethiopia on Sunday.
Seventy years ago, off the Greek island of Kefalonia, the British submarine HMS Perseus hit an Italian mine, sparking one of the greatest and most controversial survival stories of World War II.
-
The clear waters of the Mediterranean were a death trap for British submarines in World War II.
-
Some were bombed from the air, others hunted with sonar and depth charges, and many, perhaps most, collided with mines.
-
Two fifths of the subs that ventured into the Mediterranean were sunk and when a submarine sank it became a communal coffin - everyone on board died. That was the rule.
-
In fact, during the whole of the war there were only four escapes from stricken British submarines. And the most remarkable of these took place on 6 December 1941, when HMS Perseus plummeted to the seabed.
- Enigma
-
When she left the British submarine base at Malta at the end of November 1941, HMS Perseus had on board her 59 crew and two passengers, one of whom was John Capes, a 31-year-old Navy stoker en route to Alexandria.
-
-
-
- John Capes: Stoker on the Perseus
-
-
Tall, dark, handsome and a bit of an enigma, Capes had been educated at Dulwich College, and as the son of a diplomat he would naturally have been officer class rather than one of the lowliest of the mechanics who looked after the engines.
-
On the rough winter night of 6 December, Perseus was on the surface of the sea 3km (two miles) off the coast of Kefalonia, recharging her batteries under cover of darkness in preparation for another day underwater.
-
According to newspaper articles Capes later wrote or contributed to, he was relaxing in a makeshift bunk converted from a spare torpedo tube when, with no warning, there was a devastating explosion.
-
The boat twisted, plunged, and hit the bottom with what Capes described as a "nerve-shattering jolt".
-
His bunk reared up and threw him across the compartment. The lights went out.
Louis de Bernieres returns to Kefalonia to tell the story of John Capes and HMS Perseus
-
Tim Clayton acted as a programme consultant
-
Broadcast on Friday 2 December 2011 at 1100 GMT on BBC Radio 4, or listen again on iPlayer
-
-
-
Capes guessed they had hit a mine. Finding that he could stand, he groped for a torch. In the increasingly foul air and rising water of the engine room he found "the mangled bodies of a dozen dead".
-
But that was as far as he could get. The engine room door was forced shut by the pressure of water on the other side. "It was creaking under the great pressure. Jets and trickles from the rubber joint were seeping through," said Capes.
-
He dragged any stokers who showed signs of life towards the escape hatch and fitted them and himself with Davis Submarine Escape Apparatus, a rubber lung with an oxygen bottle, mouthpiece and goggles.
HMS Umpire sank near Norfolk, England on 19 July 1941. Escapees: 14-15
-
HMS Stratagem sank near Malacca, Malaysia on 22 November 1944. Escapees: 10
-
HMS Perseus sank near Kefalonia, Greece on 6 December 1941. Escapees: 1
-
HMS P32 sank near Tripoli, Libya on 18 August 1941 (but the wreck was discovered only in 1999). Escapees: 2
-
-
-
This equipment had only been tested to a depth of 100ft (30m). The depth gauge showed just over 270ft, and as far as Capes knew, no-one had ever made an escape from such a depth.
-
In fact the gauge was broken, over-estimating the depth by 100ft, but time was running out. It was difficult to breathe now.
-
He flooded the compartment, lowered the canvas trunk beneath the escape hatch and with great difficulty released the damaged bolts on the hatch.
-
He pushed his injured companions into the trunk, up through the hatch and away into the cold sea above. Then he took a last swig of rum from his blitz bottle, ducked under and passed through the hatch himself.
-
"I let go, and the buoyant oxygen lifted me quickly upward. Suddenly I was alone in the middle of the great ocean.
-
"The pain became frantic, my lungs and whole body as fit to burst apart. Agony made me dizzy. How long can I last?
-
"Then, with the suddenness of certainty, I burst to the surface and wallowed in a slight swell with whitecaps here and there."
-
But having made the deepest escape yet recorded, his ordeal was not over.
-
His fellow injured stokers had not made it to the surface with him so he found himself alone in the middle of a cold December sea.
-
In the darkness he spotted a band of white cliffs and realised he had no choice but to strike out for those.
- Story doubted
-
The next morning, Capes was found unconscious by two fishermen on the shore of Kefalonia.
-
For the following 18 months he was passed from house to house, to evade the Italian occupiers. He lost 70lb (32kg) in weight and dyed his hair black in an effort to blend in.
-
He recalled later: "Always, at the moment of despair, some utterly poor but friendly and patriotic islander would risk the lives of all his family for my sake.
-
-
-
- Kostas Thoctarides and his dive team found the wreck of HMS Perseus in 1997
-
-
"They even gave me one of their prize possessions, a donkey called Mareeka. There was one condition attached to her - I had to take a solemn vow not to eat her."
-
He was finally taken off the island on a fishing boat in May 1943, in a clandestine operation organised by the Royal Navy.
-
A dangerous, roundabout journey of 640km took him to Turkey and from there back to the submarine service in Alexandria.
-
Despite being awarded a medal for his escape, Capes's story was so extraordinary that many people, both within and outside the Navy, doubted it.
-
Was he really on the boat at all? After all, he was not on the crew list. And submarine commanders had been ordered to bolt escape hatches shut from the outside to prevent them lifting during depth charge attacks.
-
There were no witnesses, he had a reputation as a great storyteller, and his own written accounts after the war varied in their details.
-
And the depth gauge reading 270ft made his story all the harder to believe.
-
John Capes died in 1985 but it was not until 1997 that his story was finally verified.
-
In a series of dives to the wreck of Perseus, Kostas Thoctarides discovered Capes's empty torpedo tube bunk, the hatch and compartment exactly as he had described it, and finally, his blitz bottle from which he had taken that last fortifying swig of rum.
-
Tim Clayton is the author of Sea Wolves: the Extraordinary Story of Britain's WW2 Submarines.
-
BBC Radio 4's Escape from the Deep is broadcast on Friday 2 December 2011 at 1100 GMT. Or listen again on BBC iPlayer.
We received hundreds of emails in response to our story about the large numbers of British people giving up on life in Australia. Some readers have been sharing their experiences of leaving - and staying - Down Under.
This page is best viewed in an up-to-date web browser with style sheets (CSS) enabled. While you will be able to view the content of this page in your current browser, you will not be able to get the full visual experience. Please consider upgrading your browser software or enabling style sheets (CSS) if you are able to do so.