s clean_conditionally(node, candidates, "table, ul, div") # We'll sanitize all elements using a whitelist base_whitelist = @options[:tags] || %w[div p] # We'll add whitespace instead of block elements, # so a
b will have a nice space between them base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center] # Use a hash for speed (don't want to make a million calls to include?) whitelist = Hash.new base_whitelist.each {|tag| whitelist[tag] = true } replace_with_whitespace = Hash.new base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true } ([node] + node.css("*")).each do |el| # If element is in whitelist, delete all its attributes if whitelist[el.node_name] el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } # Otherwise, replace the element with its contents else # If element is root, replace the node as a text node if el.parent.nil? node = Nokogiri::XML::Text.new(el.text, el.document) break else if replace_with_whitespace[el.node_name] el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document)) else el.swap(Nokogiri::XML::Text.new(el.text, el.document)) end end end end s = Nokogiri::XML::Node::SaveOptions save_opts = s::NO_DECLARATION | s::NO_EMPTY_TAGS | s::AS_XHTML html = node.serialize(:save_with => save_opts) # Get rid of duplicate whitespace return html.gsub(/[\r\n\f]+/, "\n" ) end def clean_conditionally(node, candidates, selector) return unless @clean_conditionally node.css(selector).each do |el| weight = class_weight(el) content_score = candidates[el] ? candidates[el][:content_score] : 0 name = el.name.downcase if weight + content_score < 0 el.remove debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.") elsif el.text.count(",") < 10 counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } counts["li"] -= 100 # For every img under a noscript tag discount one from the count to avoid double counting counts["img"] -= el.css("noscript").css("img").length content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace link_density = get_link_density(el) reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density) if reason debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.") el.remove end end end end def clean_conditionally_reason?(name, counts, content_length, options, weight, link_density) if (counts["img"] > counts["p"]) && (counts["img"] > 1) "too many images" elsif counts["li"] > counts["p"] && name != "ul" && name != "ol" "more

s than

s" elsif counts["input"] > (counts["p"] / 3).to_i "less than 3x

s than s" elsif (content_length < options[:min_text_length]) && (counts["img"] != 1) "too short a content length without a single image" elsif weight < 25 && link_density > 0.2 "too many links for its weight (#{weight})" elsif weight >= 25 && link_density > 0.5 "too many links for its weight (#{weight})" elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1 "s with too short a content length, or too many s" else nil end end private # 제거항목 추가항목을 지정한다. def handle_exclusions!(whitelist, blacklist) return unless whitelist || blacklist if blacklist elems = @html.css(blacklist) if elems elems.each do |e| e.remove end end end if whitelist elems = @html.css(whitelist).to_s if body = @html.at_css('body') body.inner_html = elems end end @input = @html.to_s end # 코멘트가 제거된 기본 html 노드 반환 def make_html(whitelist=nil, blacklist=nil) @html = Nokogiri::HTML(@input, nil, @options[:encoding]) # In case document has no body, such as from empty string or redirect @html = Nokogiri::HTML('', nil, @options[:encoding]) if @html.css('body').length == 0 # Remove html comment tags @html.xpath('//comment()').each { |i| i.remove } end def prepare_candidates @html.css("script, style").each { |i| i.remove } remove_unlikely_candidates! if @remove_unlikely_candidates transform_misused_divs_into_paragraphs! @candidates = score_paragraphs(options[:min_text_length]) @best_candidate = select_best_candidate(@candidates) end # 가망없는 후보자를 제거한다. (명확한 후보자는 제외하고 제거한다.) def remove_unlikely_candidates! @html.css("*").each do |elem| str = "#{elem[:class]}#{elem[:id]}" if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && (elem.name.downcase != 'html') && (elem.name.downcase != 'body') debug("Removing unlikely candidate - #{str}") elem.remove end end end # 잘못 사용되고 있는 DIV를 p로 변환한다. def transform_misused_divs_into_paragraphs! @html.css("*").each do |elem| if elem.name.downcase == "div" # transform

s that do not contain other block elements into

s if elem.inner_html !~ REGEXES[:divToPElementsRe] debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p"); elem.name = "p" end else # wrap text nodes in p tags # elem.children.each do |child| # if child.text? # debug("wrapping text node with a p") # child.swap("

#{child.text}

") # end # end end end end # 가능노드에 점수를 매긴다. def score_paragraphs(min_text_length) candidates = {} @html.css("p,td").each do |elem| parent_node = elem.parent grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil inner_text = elem.text # If this paragraph is less than 25 characters, don't even count it. next if inner_text.length < min_text_length candidates[parent_node] ||= score_node(parent_node) candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node content_score = 1 content_score += inner_text.split(',').length content_score += [(inner_text.length / 100).to_i, 3].min candidates[parent_node][:content_score] += content_score candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node end # Scale the final candidates score based on link density. Good content should have a # relatively small link density (5% or less) and be mostly unaffected by this operation. candidates.each do |elem, candidate| candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem)) end candidates end end end