From bd42073e24f0e0725fe84302e225e4ff519f676a Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Fri, 14 Aug 2020 21:55:38 -0400 Subject: [PATCH] Fix whatever.scalzi.com extractor --- lib/frenzy/pipeline/extractor/whatever_scalzi.ex | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/lib/frenzy/pipeline/extractor/whatever_scalzi.ex b/lib/frenzy/pipeline/extractor/whatever_scalzi.ex index c41ddce..c8ec216 100644 --- a/lib/frenzy/pipeline/extractor/whatever_scalzi.ex +++ b/lib/frenzy/pipeline/extractor/whatever_scalzi.ex @@ -18,11 +18,17 @@ defmodule Frenzy.Pipeline.Extractor.WhateverScalzi do end defp get_article_content(html_tree) do - case Floki.find(html_tree, "article.post > div.entry-content") do - [content_elem | _] -> - # remove social media buttons that are included in the .entry-content element - Floki.filter_out(content_elem, "div#jp-post-flair") - + # there's no element that contains only the post content + # .postarea contains the headline, post content, social media buttons, and comments + with [{_tag, _attrs, postarea_children} | _] <- Floki.find(html_tree, ".postarea"), + {_before_headline, [_headline | rest]} <- + Enum.split_while(postarea_children, fn {tag, _attrs, _children} -> tag != "h1" end), + {article_content, _rest} <- + Enum.split_while(rest, fn {tag, attrs, _children} -> + tag != "div" || !({"id", "jp-post-flair"} in attrs) + end) do + article_content + else _ -> nil end