defmodule Readability.Document do
  @moduledoc """
  Loads an HTML page and prepares it for readability extraction.

  The module exposes the tuning options and regular expressions used by the
  extraction heuristics, plus helpers that normalise the page markup with
  `Floki` (selecting the `html` element, dropping comments, scripts and
  styles).
  """

  @default_options [
    retry_length: 250,
    min_text_length: 25,
    remove_unlikely_candidates: true,
    weight_classes: true,
    clean_conditionally: true,
    remove_empty_nodes: true,
    min_image_width: 130,
    min_image_height: 80,
    ignore_image_format: [],
    blacklist: nil,
    whitelist: nil
  ]

  # The keys keep their camelCase names because callers read them through
  # `regexes/0`; renaming them would break that interface.
  @regexes [
    unlikelyCandidatesRe:
      ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
    okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
    positiveRe:
      ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negativeRe:
      ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
    divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    # Two or more consecutive <br> tags (optionally separated by whitespace).
    replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
    # Opening or closing <font> tag; group 1 captures the optional slash.
    replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
    trimRe: ~r/^\s+|\s+$/,
    normalizeRe: ~r/\s{2,}/,
    # One or more <br> tags, each followed by whitespace / &nbsp; entities.
    killBreaksRe: ~r/(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
    videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i
  ]

  # Hard-coded sample page that `page/0` loads.
  @fixture_path "test/features/nytimes.html"

  @doc """
  Returns the `html` element of the page with comments removed.

  Before handing the markup to `Floki`, runs of two or more `<br>` tags are
  turned into paragraph boundaries (`</p><p>`) and `<font>` tags are rewritten
  as `<span>` tags.
  """
  def html do
    page()
    |> String.replace(@regexes[:replaceBrsRe], "</p><p>")
    # "\\1" reaches String.replace/3 as the backreference \1 (the optional
    # slash), so <font ...> becomes <span> and </font> becomes </span>.
    |> String.replace(@regexes[:replaceFontsRe], "<\\1span>")
    |> Floki.find("html")
    |> Floki.filter_out(:comment)
  end

  @doc """
  Returns the text of the page's `title` element(s), as extracted by `Floki.text/1`.
  """
  def title do
    html()
    |> Floki.find("title")
    |> Floki.text()
  end

  @doc """
  Returns the result of `html/0` with `script` and `style` nodes filtered out.
  """
  def content do
    html()
    |> Floki.filter_out("script")
    |> Floki.filter_out("style")
  end

  @doc """
  Reads and returns the raw contents of the sample page.

  Raises `MatchError` when the file cannot be read.
  """
  @spec page() :: binary()
  def page do
    {:ok, contents} = File.read(@fixture_path)
    contents
  end

  @doc """
  Returns the default extraction options as a keyword list.
  """
  @spec default_options() :: keyword()
  def default_options, do: @default_options

  @doc """
  Returns the keyword list of regular expressions used by the heuristics.
  """
  @spec regexes() :: keyword(Regex.t())
  def regexes, do: @regexes
end