frenzy/lib/frenzy/pipeline/extractor/verge.ex

defmodule Frenzy.Pipeline.Extractor.TheVerge do
  @moduledoc """
  Extractor for https://theverge.com

  Works against the site's redesigned markup, which is assembled from
  non-semantic `duet--*` classes, so everything here is located by class
  selector rather than by meaningful element names.
  """

  require Logger
  alias Frenzy.Pipeline.Extractor
  @behaviour Extractor

  # Direct children of the article that make up the body (everything but the lede).
  @body_selector "article#content > div:not(.duet--article--lede)"

  # Blocks inside the body that are dropped from the extracted content.
  @discard_selector ".duet--layout--rail, .duet--article--article-pullquote, .duet--article--comments-join-the-conversation"

  # The figure inside the lede, used as the article's header image.
  @lede_figure_selector "article#content > .duet--article--lede figure"

  # Caption element inside a figure.
  @caption_selector ".duet--media--caption"

  @doc """
  Extracts the article from a parsed page.

  Returns `{:ok, tree}` where `tree` is the header image figure (when one is
  found) followed by the article body with the discarded blocks removed. This
  function always returns an `:ok` tuple; when nothing matches, the tree is an
  empty list.
  """
  @impl Extractor
  def extract(html_tree) do
    body =
      html_tree
      |> Floki.find(@body_selector)
      |> Floki.filter_out(@discard_selector)

    {:ok, extract_header_image(html_tree) ++ body}
  end

  # Builds a single-element tree holding a clean `<figure>` for the lede image:
  # the first `<img>` of the first lede figure, plus an optional `<figcaption>`.
  # Yields `[]` when there is no lede figure or the figure contains no `<img>`.
  @spec extract_header_image(Floki.html_tree()) :: Floki.html_tree()
  defp extract_header_image(html_tree) do
    with [lede_figure | _] <- Floki.find(html_tree, @lede_figure_selector),
         [image | _] <- Floki.find(lede_figure, "img") do
      [{"figure", [], [image | build_caption(lede_figure)]}]
    else
      _ -> []
    end
  end

  # Returns a one-element list with a `<figcaption>` containing the plain text
  # of the figure's first caption element, or `[]` when no caption is present.
  defp build_caption(figure) do
    case Floki.find(figure, @caption_selector) do
      [{_tag, _attrs, caption_children} | _] ->
        [{"figcaption", [], [Floki.text(caption_children)]}]

      _ ->
        []
    end
  end
end