From 26b832b6220228715b5442a2f1d11561384bd4ac Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Wed, 31 Mar 2021 15:30:05 -0400 Subject: [PATCH] Fix whatever.scalzi.com extractor --- .../pipeline/extractor/whatever_scalzi.ex | 58 ++++++++++++++++--- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/lib/frenzy/pipeline/extractor/whatever_scalzi.ex b/lib/frenzy/pipeline/extractor/whatever_scalzi.ex index c8ec216..ae8090e 100644 --- a/lib/frenzy/pipeline/extractor/whatever_scalzi.ex +++ b/lib/frenzy/pipeline/extractor/whatever_scalzi.ex @@ -20,15 +20,55 @@ defmodule Frenzy.Pipeline.Extractor.WhateverScalzi do defp get_article_content(html_tree) do # there's no element that contains only the post content # .postarea contains the headline, post content, social media buttons, and comments - with [{_tag, _attrs, postarea_children} | _] <- Floki.find(html_tree, ".postarea"), - {_before_headline, [_headline | rest]} <- - Enum.split_while(postarea_children, fn {tag, _attrs, _children} -> tag != "h1" end), - {article_content, _rest} <- - Enum.split_while(rest, fn {tag, attrs, _children} -> - tag != "div" || !({"id", "jp-post-flair"} in attrs) - end) do - article_content - else + case Floki.find(html_tree, ".postarea") do + [{_tag, _attrs, postarea_children}] -> + Enum.split_while(postarea_children, fn + {"h1", _, _} -> true + _ -> false + end) + |> case do + {_before_headline, [_headline | rest]} -> + {article_content, _rest} = + Enum.split_while(rest, fn + {"div", attrs, _} = el -> + class = Floki.attribute(el, "class") |> List.first() + + if {"id", "comments"} in attrs do + false + else + is_nil(class) || !String.contains?(class, "sharedaddy") + end + + _ -> + true + end) + + article_content + |> Floki.map(fn + {"img", attrs} = el -> + class = Enum.find(attrs, fn {k, _} -> k == "class" end) + class = if is_nil(class), do: nil, else: elem(class, 1) + + if !is_nil(class) && String.contains?(class, "jetpack-lazy-image") do + { + "img", + Enum.filter(attrs, fn + {"srcset", _} -> false + _ -> true + end) + } + else + el + end + + el -> + el + end) + + _ -> + nil + end + _ -> nil end