initial commit

2016-04-15 20:51:29 +09:00 · 2016-04-15 20:51:29 +09:00 · d91604a519
commit d91604a519
14 changed files with 3832 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+/_build
+/cover
+/deps
+erl_crash.dump
+*.ez
--- a/README.md
+++ b/README.md
@ -0,0 +1,20 @@
+# Readability
+
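+Readability extracts the title of an HTML page, using [Floki](https://hex.pm/packages/floki) for parsing. Extraction of the main article content is not implemented yet.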
+
+## Installation
+
+If [available in Hex](https://hex.pm/docs/publish), the package can be installed as:
+
+  1. Add readability to your list of dependencies in `mix.exs`:
+
+        def deps do
+          [{:readability, "~> 0.0.1"}]
+        end
+
+  2. Ensure readability is started before your application:
+
+        def application do
+          [applications: [:readability]]
+        end
+
--- a/config/config.exs
+++ b/config/config.exs
@ -0,0 +1,30 @@
+# This file is responsible for configuring your application
+# and its dependencies with the aid of the Mix.Config module.
+use Mix.Config
+
+# This configuration is loaded before any dependency and is restricted
+# to this project. If another project depends on this project, this
+# file won't be loaded nor affect the parent project. For this reason,
+# if you want to provide default values for your application for
+# 3rd-party users, it should be done in your "mix.exs" file.
+
+# You can configure for your application as:
+#
+#     config :readability, key: :value
+#
+# And access this configuration in your application as:
+#
+#     Application.get_env(:readability, :key)
+#
+# Or configure a 3rd-party app:
+#
+#     config :logger, level: :info
+#
+
+# It is also possible to import configuration files, relative to this
+# directory. For example, you can emulate configuration per environment
+# by uncommenting the line below and defining dev.exs, test.exs and such.
+# Configuration from the imported file will override the ones defined
+# here (which is why it is important to import them last).
+#
+#     import_config "#{Mix.env}.exs"
--- a/lib/document.ex
+++ b/lib/document.ex
@ -0,0 +1,58 @@
+defmodule Readability.Document do
+  @moduledoc """
+  Default options, clean-up regular expressions and a few pre-processing
+  helpers for Readability.
+
+  The helpers (`html/0`, `title/0`, `content/0`) currently operate on the
+  fixed sample page returned by `page/0`.
+  """
+
+  @default_options [retry_length: 250,
+                    min_text_length: 25,
+                    remove_unlikely_candidates: true,
+                    weight_classes: true,
+                    clean_conditionally: true,
+                    remove_empty_nodes: true,
+                    min_image_width: 130,
+                    min_image_height: 80,
+                    ignore_image_format: [],
+                    blacklist: nil,
+                    whitelist: nil]
+
+  @regexes [unlikelyCandidatesRe: ~r/combx|comment|community|disqus|extra|foot|header|lightbox|modal|menu|meta|nav|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
+            okMaybeItsACandidateRe: ~r/and|article|body|column|main|shadow/i,
+            positiveRe: ~r/article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
+            negativeRe: ~r/combx|comment|com-|contact|foot|footer|footnote|link|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|utility|widget/i,
+            divToPElementsRe: ~r/<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
+            replaceBrsRe: ~r/(<br[^>]*>[ \n\r\t]*){2,}/i,
+            replaceFontsRe: ~r/<(\/?)font[^>]*>/i,
+            trimRe: ~r/^\s+|\s+$/,
+            normalizeRe: ~r/\s{2,}/,
+            killBreaksRe: ~r/(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
+            videoRe: ~r/http:\/\/(www\.)?(youtube|vimeo)\.com/i]
+
+  @doc """
+  Returns the `html` element(s) of the sample page with comments filtered out.
+
+  Before parsing, runs of two or more `<br>` tags are replaced by a paragraph
+  break (`</p><p>`) and `<font>` / `</font>` tags are rewritten to
+  `<span>` / `</span>`.
+  """
+  def html do
+    page()
+    |> String.replace(@regexes[:replaceBrsRe], "</p><p>")
+    # "\\1" is the back-reference to the optional "/" captured by
+    # replaceFontsRe. A single backslash would be consumed by the string
+    # literal itself and the group would never be substituted.
+    |> String.replace(@regexes[:replaceFontsRe], "<\\1span>")
+    |> Floki.find("html")
+    |> Floki.filter_out(:comment)
+  end
+
+  @doc """
+  Returns the text of the `title` element(s) of the sample page.
+  """
+  def title do
+    html() |> Floki.find("title") |> Floki.text()
+  end
+
+  @doc """
+  Returns the result of `html/0` with `script` and `style` elements filtered out.
+  """
+  def content do
+    html()
+    |> Floki.filter_out("script")
+    |> Floki.filter_out("style")
+  end
+
+  @doc """
+  Reads and returns the contents of the sample page `test/features/nytimes.html`.
+
+  Raises a `MatchError` when the file cannot be read.
+  """
+  def page do
+    {:ok, f} = File.read("test/features/nytimes.html")
+    f
+  end
+
+  @doc """
+  Returns the default options as a keyword list.
+  """
+  def default_options do
+    @default_options
+  end
+
+  @doc """
+  Returns the keyword list of regular expressions used for clean-up and scoring.
+  """
+  def regexes do
+    @regexes
+  end
+end
--- a/lib/readability.ex
+++ b/lib/readability.ex
@ -0,0 +1,10 @@
+defmodule Readability do
+  @moduledoc """
+  Entry point for extracting information from an HTML page.
+  """
+
+  alias Readability.TitleFinder
+
+  @type html_tree :: tuple | list
+
+  @doc """
+  Extracts the title of a page.
+
+  A binary is parsed with Floki first; any other value is handed to
+  `Readability.TitleFinder.title/1` as it is.
+  """
+  def title(html) when is_binary(html) do
+    html
+    |> parse()
+    |> title()
+  end
+
+  def title(html_tree) do
+    TitleFinder.title(html_tree)
+  end
+
+  defp parse(raw_html) do
+    Floki.parse(raw_html)
+  end
+end
--- a/lib/test.js
+++ b/lib/test.js
--- a/lib/test.rb
+++ b/lib/test.rb
@ -0,0 +1,522 @@
+# encoding: utf-8
+
+require 'rubygems'
+require 'nokogiri'
+require 'guess_html_encoding'
+
+module Readability
+  class Document
+    DEFAULT_OPTIONS = {
+      :retry_length               => 250,
+      :min_text_length            => 25,
+      :remove_unlikely_candidates => true,
+      :weight_classes             => true,
+      :clean_conditionally        => true,
+      :remove_empty_nodes         => true,
+      :min_image_width            => 130,
+      :min_image_height           => 80,
+      :ignore_image_format        => [],
+      :blacklist                  => nil,
+      :whitelist                  => nil
+    }.freeze
+
+    REGEXES = {
+        :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
+        :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
+        :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
+        :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
+        :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
+        :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
+        :replaceFontsRe => /<(\/?)font[^>]*>/i,
+        :trimRe => /^\s+|\s+$/,
+        :normalizeRe => /\s{2,}/,
+        :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
+        :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
+    }
+
+    attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
+
+    def initialize(input, options = {})
+      @options = DEFAULT_OPTIONS.merge(options)
+      @input = input
+
+      if RUBY_VERSION =~ /^(1\.9|2)/ && !@options[:encoding]
+        @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
+        @options[:encoding] = @input.encoding.to_s
+      end
+
+      @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
+      @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
+      @weight_classes = @options[:weight_classes]
+      @clean_conditionally = @options[:clean_conditionally]
+      @best_candidate_has_image = true
+      make_html
+      handle_exclusions!(@options[:whitelist], @options[:blacklist])
+    end
+
+    def images(content=nil, reload=false)
+      begin
+        require 'fastimage'
+      rescue LoadError
+        raise "Please install fastimage in order to use the #images feature."
+      end
+
+      @best_candidate_has_image = false if reload
+
+      prepare_candidates
+      list_images   = []
+      tested_images = []
+      content       = @best_candidate[:elem] unless reload
+
+      return list_images if content.nil?
+      elements = content.css("img").map(&:attributes)
+
+        elements.each do |element|
+          next unless element["src"]
+
+          url     = element["src"].value
+          height  = element["height"].nil?  ? 0 : element["height"].value.to_i
+          width   = element["width"].nil?   ? 0 : element["width"].value.to_i
+
+          if url =~ /\Ahttps?:\/\//i && (height.zero? || width.zero?)
+            image   = get_image_size(url)
+            next unless image
+          else
+            image = {:width => width, :height => height}
+          end
+
+          image[:format] = File.extname(url).gsub(".", "")
+
+          if tested_images.include?(url)
+            debug("Image was tested: #{url}")
+            next
+          end
+
+          tested_images.push(url)
+          if image_meets_criteria?(image)
+            list_images << url
+          else
+            debug("Image discarded: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
+          end
+        end
+
+      (list_images.empty? and content != @html) ? images(@html, true) : list_images
+    end
+
+    def images_with_fqdn_uris!(source_uri)
+      images_with_fqdn_uris(@html, source_uri)
+    end
+
+    def images_with_fqdn_uris(document = @html.dup, source_uri)
+      uri = URI.parse(source_uri)
+      host = uri.host
+      scheme = uri.scheme
+      port = uri.port # defaults to 80
+
+      base = "#{scheme}://#{host}:#{port}/"
+
+      images = []
+      document.css("img").each do |elem|
+        begin
+          elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
+          images << elem['src'].to_s
+        rescue URI::InvalidURIError => exc
+          elem.remove
+        end
+      end
+
+      images(document,true)
+    end
+
+    def get_image_size(url)
+      w, h = FastImage.size(url)
+      raise "Couldn't get size." if w.nil? || h.nil?
+      {:width => w, :height => h}
+    rescue => e
+      debug("Image error: #{e}")
+      nil
+    end
+
+    def image_meets_criteria?(image)
+      return false if options[:ignore_image_format].include?(image[:format].downcase)
+      image[:width] >= (options[:min_image_width] || 0) && image[:height] >= (options[:min_image_height] || 0)
+    end
+
+    def title
+      title = @html.css("title").first
+      title ? title.text : nil
+    end
+
+    # Look through the @html document looking for the author
+    # Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted)
+    # Returns nil if no author is detected
+    def author
+      # Let's grab this author:
+      # <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
+      author_elements = @html.xpath('//meta[@name = "dc.creator"]')
+      unless author_elements.empty?
+        author_elements.each do |element|
+          return element['content'].strip if element['content']
+        end
+      end
+
+      # Now let's try to grab this
+      # <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
+      # <div class="author">By</div><div class="author vcard"><a class="url fn" href="http://austinlivesinyoapp.com/">Austin Fonacier</a></div>
+      author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]')
+      unless author_elements.empty?
+        author_elements.each do |element|
+          return element.text.strip if element.text
+        end
+      end
+
+      # Now let's try to grab this
+      # <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
+      # TODO: strip out the (rel)?
+      author_elements = @html.xpath('//a[@rel = "author"]')
+      unless author_elements.empty?
+        author_elements.each do |element|
+          return element.text.strip if element.text
+        end
+      end
+
+      author_elements = @html.xpath('//*[@id = "author"]')
+      unless author_elements.empty?
+        author_elements.each do |element|
+          return element.text.strip if element.text
+        end
+      end
+    end
+
+    def content(remove_unlikely_candidates = :default)
+      @remove_unlikely_candidates = false if remove_unlikely_candidates == false
+
+      prepare_candidates
+      article = get_article(@candidates, @best_candidate)
+
+      cleaned_article = sanitize(article, @candidates, options)
+      if article.text.strip.length < options[:retry_length]
+        if @remove_unlikely_candidates
+          @remove_unlikely_candidates = false
+        elsif @weight_classes
+          @weight_classes = false
+        elsif @clean_conditionally
+          @clean_conditionally = false
+        else
+          # nothing we can do
+          return cleaned_article
+        end
+
+        make_html
+        content
+      else
+        cleaned_article
+      end
+    end
+
+    def get_article(candidates, best_candidate)
+      # Now that we have the top candidate, look through its siblings for content that might also be related.
+      # Things like preambles, content split by ads that we removed, etc.
+
+      sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
+      output = Nokogiri::XML::Node.new('div', @html)
+      best_candidate[:elem].parent.children.each do |sibling|
+        append = false
+        append = true if sibling == best_candidate[:elem]
+        append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
+
+        if sibling.name.downcase == "p"
+          link_density = get_link_density(sibling)
+          node_content = sibling.text
+          node_length = node_content.length
+
+          append = if node_length > 80 && link_density < 0.25
+            true
+          elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
+            true
+          end
+        end
+
+        if append
+          sibling_dup = sibling.dup # otherwise the state of the document in processing will change, thus creating side effects
+          sibling_dup.name = "div" unless %w[div p].include?(sibling.name.downcase)
+          output << sibling_dup
+        end
+      end
+
+      output
+    end
+
+    def select_best_candidate(candidates)
+      sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
+
+      debug("Top 5 candidates:")
+      sorted_candidates[0...5].each do |candidate|
+        debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
+      end
+
+      best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
+      debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
+
+      best_candidate
+    end
+
+    def get_link_density(elem)
+      link_length = elem.css("a").map(&:text).join("").length
+      text_length = elem.text.length
+      link_length / text_length.to_f
+    end
+
+    def class_weight(e)
+      weight = 0
+      return weight unless @weight_classes
+
+      if e[:class] && e[:class] != ""
+        weight -= 25 if e[:class] =~ REGEXES[:negativeRe]
+        weight += 25 if e[:class] =~ REGEXES[:positiveRe]
+      end
+
+      if e[:id] && e[:id] != ""
+        weight -= 25 if e[:id] =~ REGEXES[:negativeRe]
+        weight += 25 if e[:id] =~ REGEXES[:positiveRe]
+      end
+
+      weight
+    end
+
+    ELEMENT_SCORES = {
+      'div' => 5,
+      'blockquote' => 3,
+      'form' => -3,
+      'th' => -5
+    }.freeze
+
+    def score_node(elem)
+      content_score = class_weight(elem)
+      content_score += ELEMENT_SCORES.fetch(elem.name.downcase, 0)
+      { :content_score => content_score, :elem => elem }
+    end
+
+    def debug(str)
+      puts str if options[:debug]
+    end
+
+    def sanitize(node, candidates, options = {})
+      node.css("h1, h2, h3, h4, h5, h6").each do |header|
+        header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
+      end
+
+      node.css("form, object, iframe, embed").each do |elem|
+        elem.remove
+      end
+
+      if @options[:remove_empty_nodes]
+        # remove <p> tags that have no text content - this will also remove p tags that contain only images.
+        node.css("p").each do |elem|
+          elem.remove if elem.content.strip.empty?
+        end
+      end
+
+      # Conditionally clean <table>s, <ul>s, and <div>s
+      clean_conditionally(node, candidates, "table, ul, div")
+
+      # We'll sanitize all elements using a whitelist
+      base_whitelist = @options[:tags] || %w[div p]
+      # We'll add whitespace instead of block elements,
+      # so a<br>b will have a nice space between them
+      base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
+
+      # Use a hash for speed (don't want to make a million calls to include?)
+      whitelist = Hash.new
+      base_whitelist.each {|tag| whitelist[tag] = true }
+      replace_with_whitespace = Hash.new
+      base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
+
+      ([node] + node.css("*")).each do |el|
+        # If element is in whitelist, delete all its attributes
+        if whitelist[el.node_name]
+          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
+
+          # Otherwise, replace the element with its contents
+        else
+          # If element is root, replace the node as a text node
+          if el.parent.nil?
+            node = Nokogiri::XML::Text.new(el.text, el.document)
+            break
+          else
+            if replace_with_whitespace[el.node_name]
+              el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document))
+            else
+              el.swap(Nokogiri::XML::Text.new(el.text, el.document))
+            end
+          end
+        end
+
+      end
+
+      s = Nokogiri::XML::Node::SaveOptions
+      save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML
+      html = node.serialize(:save_with => save_opts)
+
+      # Get rid of duplicate whitespace
+      return html.gsub(/[\r\n\f]+/, "\n" )
+    end
+
+    def clean_conditionally(node, candidates, selector)
+      return unless @clean_conditionally
+      node.css(selector).each do |el|
+        weight = class_weight(el)
+        content_score = candidates[el] ? candidates[el][:content_score] : 0
+        name = el.name.downcase
+
+        if weight + content_score < 0
+          el.remove
+          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
+        elsif el.text.count(",") < 10
+          counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
+          counts["li"] -= 100
+
+          # For every img under a noscript tag discount one from the count to avoid double counting
+          counts["img"] -= el.css("noscript").css("img").length
+
+          content_length = el.text.strip.length  # Count the text length excluding any surrounding whitespace
+          link_density = get_link_density(el)
+
+          reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
+          if reason
+            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
+            el.remove
+          end
+        end
+      end
+    end
+
+    def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
+      if (counts["img"] > counts["p"]) && (counts["img"] > 1)
+        "too many images"
+      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
+        "more <li>s than <p>s"
+      elsif counts["input"] > (counts["p"] / 3).to_i
+        "less than 3x <p>s than <input>s"
+      elsif (content_length < options[:min_text_length]) && (counts["img"] != 1)
+        "too short a content length without a single image"
+      elsif weight < 25 && link_density > 0.2
+        "too many links for its weight (#{weight})"
+      elsif weight >= 25 && link_density > 0.5
+        "too many links for its weight (#{weight})"
+      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
+        "<embed>s with too short a content length, or too many <embed>s"
+      else
+        nil
+      end
+    end
+
+    private
+
+    # 제거항목 추가항목을 지정한다.
+    def handle_exclusions!(whitelist, blacklist)
+      return unless whitelist || blacklist
+
+      if blacklist
+        elems = @html.css(blacklist)
+        if elems
+          elems.each do |e|
+            e.remove
+          end
+        end
+      end
+
+      if whitelist
+        elems = @html.css(whitelist).to_s
+
+        if body = @html.at_css('body')
+          body.inner_html = elems
+        end
+      end
+
+      @input = @html.to_s
+    end
+
+    # Builds the base HTML document (@html) with comment nodes removed.
+    def make_html(whitelist=nil, blacklist=nil)
+      @html = Nokogiri::HTML(@input, nil, @options[:encoding])
+      # In case document has no body, such as from empty string or redirect
+      @html = Nokogiri::HTML('<body />', nil, @options[:encoding]) if @html.css('body').length == 0
+      # Remove html comment tags
+      @html.xpath('//comment()').each { |i| i.remove }
+    end
+
+
+    def prepare_candidates
+      @html.css("script, style").each { |i| i.remove }
+      remove_unlikely_candidates! if @remove_unlikely_candidates
+      transform_misused_divs_into_paragraphs!
+
+      @candidates     = score_paragraphs(options[:min_text_length])
+      @best_candidate = select_best_candidate(@candidates)
+    end
+
+    # 가망없는 후보자를 제거한다. (명확한 후보자는 제외하고 제거한다.)
+    def remove_unlikely_candidates!
+      @html.css("*").each do |elem|
+        str = "#{elem[:class]}#{elem[:id]}"
+        if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body')
+          debug("Removing unlikely candidate - #{str}")
+          elem.remove
+        end
+      end
+    end
+
+    # Converts misused DIVs into p elements.
+    def transform_misused_divs_into_paragraphs!
+      @html.css("*").each do |elem|
+        if elem.name.downcase == "div"
+          # transform <div>s that do not contain other block elements into <p>s
+          if elem.inner_html !~ REGEXES[:divToPElementsRe]
+            debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
+            elem.name = "p"
+          end
+        else
+          # wrap text nodes in p tags
+#          elem.children.each do |child|
+#            if child.text?
+#              debug("wrapping text node with a p")
+#              child.swap("<p>#{child.text}</p>")
+#            end
+#          end
+        end
+      end
+    end
+
+    # Scores the candidate nodes.
+    def score_paragraphs(min_text_length)
+      candidates = {}
+      @html.css("p,td").each do |elem|
+        parent_node = elem.parent
+        grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
+        inner_text = elem.text
+
+        # If this paragraph is less than 25 characters, don't even count it.
+        next if inner_text.length < min_text_length
+
+        candidates[parent_node] ||= score_node(parent_node)
+        candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
+
+        content_score = 1
+        content_score += inner_text.split(',').length
+        content_score += [(inner_text.length / 100).to_i, 3].min
+
+        candidates[parent_node][:content_score] += content_score
+        candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
+      end
+
+      # Scale the final candidates score based on link density. Good content should have a
+      # relatively small link density (5% or less) and be mostly unaffected by this operation.
+      candidates.each do |elem, candidate|
+        candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
+      end
+
+      candidates
+    end
+  end
+end
--- a/lib/title_finder.ex
+++ b/lib/title_finder.ex
@ -0,0 +1,64 @@
+defmodule Readability.TitleFinder do
+  @moduledoc """
+  The TitleFinder engine traverses the HTML tree searching for the page title.
+  """
+
+  @title_suffix ~r/(\-)|(\:\:)|(\|)/
+  @h_tag_selector "h1, h2, h3"
+  # A <title> with this many words or fewer is considered too short to trust.
+  @short_title_words 4
+
+  @type html_tree :: binary | tuple | list
+
+  @doc """
+  Finds the most suitable title.
+
+  The `<title>` text is used when it has more than four words; otherwise the
+  `og:title` meta property is used. When the chosen source is empty, the
+  first heading matched by `h_tag_title/2` is returned instead.
+  """
+
+  @spec title(html_tree) :: binary
+
+  def title(html_tree) do
+    # The cleaned titles are always binaries, and "" is truthy in Elixir, so
+    # the fallback has to match on the empty string explicitly.
+    case preferred_title(html_tree) do
+      "" -> h_tag_title(html_tree)
+      found -> found
+    end
+  end
+
+  @doc """
+  Find title from title tag
+  """
+
+  @spec tag_title(html_tree) :: binary
+
+  def tag_title(html_tree) do
+    html_tree
+    |> Floki.find("title")
+    |> to_clean_text()
+  end
+
+  @doc """
+  Find title from og:title property of meta tag
+  """
+
+  @spec og_title(html_tree) :: binary
+
+  def og_title(html_tree) do
+    html_tree
+    |> Floki.find("meta[property=og:title]")
+    |> Floki.attribute("content")
+    |> to_clean_text()
+  end
+
+  @doc """
+  Find title from h tag
+
+  Uses the first element matched by `selector` (`"h1, h2, h3"` by default) and
+  returns `""` when nothing matches.
+  """
+
+  @spec h_tag_title(html_tree, String.t) :: binary
+
+  def h_tag_title(html_tree, selector \\ @h_tag_selector) do
+    html_tree
+    |> Floki.find(selector)
+    # Enum.take/2 keeps at most the first match and, unlike hd/1, does not
+    # raise when the selector matched nothing.
+    |> Enum.take(1)
+    |> to_clean_text()
+  end
+
+  # Picks between the <title> text and the og:title property. The result of
+  # the `if` is returned rather than rebinding a variable inside it, because
+  # bindings made inside an `if` block are not visible after the block.
+  defp preferred_title(html_tree) do
+    from_tag = tag_title(html_tree)
+
+    if length(String.split(from_tag, " ")) <= @short_title_words do
+      og_title(html_tree)
+    else
+      from_tag
+    end
+  end
+
+  # Extracts the text, keeps what precedes the first "-", "::" or "|"
+  # separator, and strips the surrounding whitespace.
+  defp to_clean_text(html_tree) do
+    html_tree
+    |> Floki.text()
+    |> String.split(@title_suffix)
+    |> hd()
+    |> String.strip()
+  end
+end
--- a/mix.exs
+++ b/mix.exs
@ -0,0 +1,34 @@
+defmodule Readability.Mixfile do
+  use Mix.Project
+
+  # Project definition read by Mix.
+  def project do
+    [app: :readability,
+     version: "0.0.1",
+     elixir: "~> 1.2",
+     build_embedded: Mix.env == :prod,
+     start_permanent: Mix.env == :prod,
+     # Explicit parentheses: a bare `deps` is ambiguous with a variable name.
+     deps: deps()]
+  end
+
+  # Configuration for the OTP application
+  #
+  # Type "mix help compile.app" for more information
+  def application do
+    [applications: [:logger,
+                    :floki
+                   ]]
+  end
+
+  # Dependencies can be Hex packages:
+  #
+  #   {:mydep, "~> 0.3.0"}
+  #
+  # Or git/path repositories:
+  #
+  #   {:mydep, git: "https://github.com/elixir-lang/mydep.git", tag: "0.1.0"}
+  #
+  # Type "mix help deps" for more examples and options
+  defp deps do
+    [{:floki, "~> 0.8.0"}]
+  end
+end
--- a/mix.lock
+++ b/mix.lock
@ -0,0 +1,2 @@
+%{"floki": {:hex, :floki, "0.8.0"},
+  "mochiweb_html": {:hex, :mochiweb_html, "2.13.0"}}
--- a/test/features/nytimes.html
+++ b/test/features/nytimes.html
--- a/test/readability_test.exs
+++ b/test/readability_test.exs
@ -0,0 +1,8 @@
+defmodule ReadabilityTest do
+  use ExUnit.Case
+
+  # Runs any iex> examples found in the documentation of Readability.
+  doctest Readability
+
+  # Placeholder sanity check; it exercises no library code.
+  test "the truth" do
+    sum = 1 + 1
+    assert sum == 2
+  end
+end
--- a/test/test_helper.exs
+++ b/test/test_helper.exs
@ -0,0 +1 @@
+ExUnit.start()
--- a/test/title_finder_test.exs
+++ b/test/title_finder_test.exs
@ -0,0 +1,45 @@
+defmodule Readability.TitleFinderTest do
+  use ExUnit.Case, async: true
+
+  alias Readability.TitleFinder
+
+  # Doctests of the module under test; those of Readability itself are
+  # already run by ReadabilityTest.
+  doctest Readability.TitleFinder
+
+  # Fixture offering three competing title sources: a short <title>, an
+  # og:title meta property and two headings. Each title source carries a
+  # suffix ("- test", "| test") that the finder is expected to drop.
+  @html """
+  <html>
+    <head>
+      <title>Tag title - test</title>
+      <meta property='og:title' content='og title | test'>
+    </head>
+    <body>
+      <p>
+        <h1>h1 title</h1>
+        <h2>h2 title</h2>
+      </p>
+    </body>
+  </html>
+  """
+
+  test "extract og title" do
+    title = TitleFinder.og_title(@html)
+    assert title == "og title"
+  end
+
+  test "extract tag title" do
+    title = TitleFinder.tag_title(@html)
+    assert title == "Tag title"
+  end
+
+  test "extract h1 tag title" do
+    title = TitleFinder.h_tag_title(@html)
+    assert title == "h1 title"
+  end
+
+  test "extract h2 tag title" do
+    title = TitleFinder.h_tag_title(@html, "h2")
+    assert title == "h2 title"
+  end
+
+  # The <title> text has only two words, so the og:title is preferred.
+  test "extract most proper title" do
+    title = TitleFinder.title(@html)
+    assert title == "og title"
+  end
+end