defmodule Frenzy.Pipeline.Extractor.WhateverScalzi do
  @moduledoc """
  Extractor for https://whatever.scalzi.com
  """

  alias Frenzy.Pipeline.Extractor
  @behaviour Extractor

  # Returns `{:ok, nodes}` with the list of nodes that make up the post body,
  # or `{:error, "no matching elements"}` when the expected page structure
  # (a `.postarea` element containing an `h1`) is not found.
  @impl Extractor
  def extract(html_tree) do
    case get_article_content(html_tree) do
      nil -> {:error, "no matching elements"}
      elem -> {:ok, elem}
    end
  end

  # Collects the children of the first `.postarea` element that sit after the
  # first `h1` and before the `div` with id `jp-post-flair`. Returns `nil` when
  # there is no `.postarea` match or it has no `h1` child. If the flair `div`
  # is absent, everything after the headline is returned.
  defp get_article_content(html_tree) do
    # there's no element that contains only the post content
    # .postarea contains the headline, post content, social media buttons, and comments
    with [{_tag, _attrs, postarea_children} | _] <- Floki.find(html_tree, ".postarea"),
         {_before_headline, [_headline | rest]} <-
           Enum.split_while(postarea_children, &(not headline?(&1))) do
      Enum.take_while(rest, &(not post_flair?(&1)))
    else
      _ -> nil
    end
  end

  # True only for an `h1` element node. The catch-all clause keeps non-element
  # children (text binaries, comment tuples) from raising a FunctionClauseError.
  defp headline?({"h1", _attrs, _children}), do: true
  defp headline?(_node), do: false

  # True only for the `div` whose attributes include `id="jp-post-flair"`, which
  # marks the end of the post body. Any other node shape is treated as content.
  defp post_flair?({"div", attrs, _children}), do: {"id", "jp-post-flair"} in attrs
  defp post_flair?(_node), do: false
end