defmodule Frenzy.Pipeline.Extractor.TheVerge do
  @moduledoc """
  Extractor for https://theverge.com

  Handles their bizarro new layout that's a pile of unsemantic classes.

  The extracted tree is an optional header `<figure>` (lede image plus caption)
  followed by the article body with non-content elements filtered out and
  image galleries rewritten into a horizontally scrolling strip.
  """

  require Logger
  alias Frenzy.Pipeline.Extractor

  @behaviour Extractor

  # Direct children of the article that make up the body; the lede is handled
  # separately by extract_header_image/1.
  @body_selector "main article > div:not(.duet--article--lede)"

  # Elements inside the body that are not article content.
  @non_content_selector Enum.join(
                          [
                            ".hidden",
                            ".duet--layout--rail",
                            ".duet--article--article-pullquote",
                            ".duet--article--comments-join-the-conversation",
                            ".duet--recirculation--related-list",
                            ".duet--article--comments-button",
                            ".duet--article--share-buttons"
                          ],
                          ", "
                        )

  @header_figure_selector "article#content > .duet--article--lede figure"

  @gallery_style "display: flex; flex-direction: row; overflow-x: auto; scroll-snap-type: x mandatory;;"
  @gallery_image_style "max-width: 85%; max-height: 30vh; scroll-snap-align: start;"

  @doc """
  Extracts the article content from a parsed Verge page.

  Always returns `{:ok, tree}`, where `tree` is the header figure (if one was
  found) followed by the cleaned-up article body nodes.
  """
  @impl Extractor
  def extract(html_tree) do
    image = extract_header_image(html_tree)

    content =
      html_tree
      |> Floki.find(@body_selector)
      |> Floki.filter_out(@non_content_selector)
      # Runs before rewrite/1, so the inline styles that rewrite/1 adds to
      # galleries are not affected (presumably this strips `style` attributes;
      # confirm against Readability.Helper).
      |> Readability.Helper.remove_attrs("style")
      |> Floki.traverse_and_update(&rewrite/1)

    {:ok, image ++ content}
  end

  # Builds a one-element list holding a `<figure>` with the first lede image
  # and, when present, its caption as a text-only `<figcaption>`.
  # Returns `[]` when the lede has no figure or the figure has no image.
  @spec extract_header_image(Floki.html_tree()) :: Floki.html_tree()
  defp extract_header_image(html_tree) do
    with [figure | _] <- Floki.find(html_tree, @header_figure_selector),
         [img | _] <- Floki.find(figure, "img") do
      [{"figure", [], [img | header_caption(figure)]}]
    else
      _ -> []
    end
  end

  # Returns the figure's first caption flattened to plain text and wrapped in a
  # `<figcaption>`, or `[]` when there is no caption element.
  defp header_caption(figure) do
    case Floki.find(figure, ".duet--media--caption") do
      [{_tag, _attrs, children} | _] -> [{"figcaption", [], [Floki.text(children)]}]
      _ -> []
    end
  end

  # Callback for Floki.traverse_and_update/2. Returning `nil` removes the node
  # from the tree; anything that is not a `{tag, attrs, children}` element is
  # passed through untouched.
  defp rewrite({_tag, _attrs, children} = el) do
    cond do
      empty_gif?(el) -> nil
      gallery?(el) -> gallery(children)
      true -> el
    end
  end

  defp rewrite(other), do: other

  # Replaces a gallery's markup with a flat, horizontally scrolling `<div>` of
  # lazily loaded images.
  defp gallery(children) do
    slides =
      children
      # Skip the <noscript> fallback copies so each image appears only once.
      |> Floki.find(":not(noscript) > img")
      # flat_map skips images without a `src` instead of crashing the whole
      # extraction on a failed `[src] = ...` match.
      |> Enum.flat_map(&Floki.attribute(&1, "src"))
      |> Enum.map(&gallery_image/1)

    {"div", [{"style", @gallery_style}], slides}
  end

  # A single gallery slide for the given image URL.
  defp gallery_image(src) do
    {
      "img",
      [
        {"src", src},
        {"loading", "lazy"},
        {"style", @gallery_image_style}
      ],
      []
    }
  end

  # True when the element's class attribute mentions the gallery component.
  defp gallery?(el) do
    case Floki.attribute(el, "class") do
      [classes] -> String.contains?(classes, "duet--article--gallery")
      _ -> false
    end
  end

  # True when the element's `src` is an inline GIF data URI
  # (presumably a placeholder image; it carries no article content).
  defp empty_gif?(el) do
    case Floki.attribute(el, "src") do
      [src] -> String.starts_with?(src, "data:image/gif;")
      _ -> false
    end
  end
end