defmodule Frenzy.Pipeline.Extractor.TheVerge do
  @moduledoc """
  Extractor for https://theverge.com

  Targets the site's current layout, which is built from non-semantic
  `duet--*` utility classes rather than meaningful element names.
  """

  require Logger
  alias Frenzy.Pipeline.Extractor

  @behaviour Extractor

  # Direct `div` children of the article, excluding the lede block.
  @content_selector "article#content > div:not(.duet--article--lede)"

  # Elements stripped from the article content before it is returned.
  @excluded_selector ".duet--layout--rail, .duet--article--article-pullquote, .duet--article--comments-join-the-conversation"

  # First figure inside the article's lede block.
  @lede_figure_selector "article#content > .duet--article--lede figure"

  @caption_selector ".duet--media--caption"

  @doc """
  Extracts the article from a parsed Verge page.

  Returns `{:ok, nodes}` where `nodes` is the header image figure (when one
  is found) followed by the article content with the excluded elements
  filtered out.
  """
  @impl Extractor
  def extract(html_tree) do
    header = extract_header_image(html_tree)

    body =
      html_tree
      |> Floki.find(@content_selector)
      |> Floki.filter_out(@excluded_selector)

    {:ok, header ++ body}
  end

  # Builds a single-element list holding a `figure` node for the lede image,
  # or `[]` when the lede has no figure or the figure contains no `img`.
  @spec extract_header_image(Floki.html_tree()) :: Floki.html_tree()
  defp extract_header_image(html_tree) do
    with [figure | _] <- Floki.find(html_tree, @lede_figure_selector),
         [img | _] <- Floki.find(figure, "img") do
      [{"figure", [], [img | header_caption(figure)]}]
    else
      _ -> []
    end
  end

  # Returns a one-element list with a `figcaption` node whose only child is
  # the caption's text content, or `[]` when the figure has no caption.
  defp header_caption(figure) do
    case Floki.find(figure, @caption_selector) do
      [{_tag, _attrs, children} | _] -> [{"figcaption", [], [Floki.text(children)]}]
      _ -> []
    end
  end
end