defmodule Frenzy.Pipeline.Extractor.WhateverScalzi do
  @moduledoc """
  Extractor for https://whatever.scalzi.com

  There is no element on the page that contains only the post content:
  `.postarea` contains the headline, post content, social media buttons, and
  comments. This extractor therefore keeps the direct children of `.postarea`
  that come after the `h1` headline and before the first sharing or comments
  `div`.
  """

  alias Frenzy.Pipeline.Extractor

  @behaviour Extractor

  @doc """
  Extracts the post content from a parsed page.

  Returns `{:ok, content}` on success, or `{:error, "no matching elements"}`
  when the page does not have exactly one `.postarea` element or that element
  has no `h1` headline among its direct children.
  """
  @impl Extractor
  def extract(html_tree) do
    case get_article_content(html_tree) do
      nil -> {:error, "no matching elements"}
      elem -> {:ok, elem}
    end
  end

  # Returns the post content nodes passed through
  # `Extractor.Util.strip_wp_lazy_loading/1`, or `nil` when the expected page
  # structure (a single `.postarea` containing an `h1`) is not found.
  defp get_article_content(html_tree) do
    with [{_tag, _attrs, postarea_children}] <- Floki.find(html_tree, ".postarea"),
         # Split at the first h1: everything before it and the headline itself
         # are discarded. The predicate must be "not a headline" so that the
         # split point lands ON the h1 rather than on the first non-h1 node.
         {_before_headline, [_headline | after_headline]} <-
           Enum.split_while(postarea_children, &(not headline?(&1))) do
      after_headline
      |> Enum.take_while(&article_node?/1)
      |> Extractor.Util.strip_wp_lazy_loading()
    else
      _ -> nil
    end
  end

  # True for an `h1` element node, false for any other node.
  defp headline?({"h1", _, _}), do: true
  defp headline?(_), do: false

  # True while a node still belongs to the post content. The content ends at
  # the first `div` that is either the comments container (`id="comments"`) or
  # has a class containing "sharedaddy". Every non-div node counts as content.
  defp article_node?({"div", attrs, _} = el) do
    if {"id", "comments"} in attrs do
      false
    else
      class = el |> Floki.attribute("class") |> List.first()
      is_nil(class) || !String.contains?(class, "sharedaddy")
    end
  end

  defp article_node?(_), do: true
end