From e7184a2535f126a05a656c53456fd24d71fa504b Mon Sep 17 00:00:00 2001
From: Shadowfacts
Date: Wed, 14 Sep 2022 17:47:22 -0400
Subject: [PATCH] Add extractor for The Verge

---
 lib/frenzy/pipeline/extractor/verge.ex        | 57 +++++++++++++++++++
 .../live/configure_stage/scrape_stage_live.ex |  1 +
 2 files changed, 58 insertions(+)
 create mode 100644 lib/frenzy/pipeline/extractor/verge.ex

diff --git a/lib/frenzy/pipeline/extractor/verge.ex b/lib/frenzy/pipeline/extractor/verge.ex
new file mode 100644
index 0000000..32c152e
--- /dev/null
+++ b/lib/frenzy/pipeline/extractor/verge.ex
@@ -0,0 +1,57 @@
+defmodule Frenzy.Pipeline.Extractor.TheVerge do
+  @moduledoc """
+  Extractor for https://theverge.com.
+
+  The site's layout is built from non-semantic `duet--*` utility classes, so the
+  article is located through those classes instead of semantic tags.
+
+  `extract/1` always returns `{:ok, tree}`: a rebuilt lede `<figure>` (when the lede
+  contains an image) followed by the article's body `div`s with the rail, pull
+  quotes, and the comments prompt filtered out.
+  """
+
+  require Logger
+  alias Frenzy.Pipeline.Extractor
+  @behaviour Extractor
+
+  # Lede figure holding the header image and its caption.
+  @lede_figure_selector "article#content > .duet--article--lede figure"
+  # Direct children of the article that make up the body (everything but the lede).
+  @body_selector "article#content > div:not(.duet--article--lede)"
+  # Blocks dropped from the body.
+  @noise_selector ".duet--layout--rail, .duet--article--article-pullquote, .duet--article--comments-join-the-conversation"
+
+  @impl Extractor
+  def extract(html_tree) do
+    header = extract_header_image(html_tree)
+
+    body =
+      html_tree
+      |> Floki.find(@body_selector)
+      |> Floki.filter_out(@noise_selector)
+
+    {:ok, header ++ body}
+  end
+
+  # Builds a one-element list holding a clean `<figure>` (first `<img>` of the lede
+  # figure plus an optional caption), or `[]` when the lede has no figure or the
+  # figure has no image.
+  @spec extract_header_image(Floki.html_tree()) :: Floki.html_tree()
+  defp extract_header_image(html_tree) do
+    with [figure | _] <- Floki.find(html_tree, @lede_figure_selector),
+         [img | _] <- Floki.find(figure, "img") do
+      [{"figure", [], [img | figcaption(figure)]}]
+    else
+      _ -> []
+    end
+  end
+
+  # Returns `[figcaption]` with the caption flattened to plain text, or `[]` when the
+  # figure carries no caption element.
+  defp figcaption(figure) do
+    case Floki.find(figure, ".duet--media--caption") do
+      [{_tag, _attrs, children} | _] -> [{"figcaption", [], [Floki.text(children)]}]
+      _ -> []
+    end
+  end
+end
diff --git a/lib/frenzy_web/live/configure_stage/scrape_stage_live.ex b/lib/frenzy_web/live/configure_stage/scrape_stage_live.ex
index 46c6856..4245928 100644
--- a/lib/frenzy_web/live/configure_stage/scrape_stage_live.ex
+++ b/lib/frenzy_web/live/configure_stage/scrape_stage_live.ex
@@ -13,6 +13,7 @@ defmodule FrenzyWeb.ConfigureStage.ScrapeStageLive do
                 {"macstories.net", Frenzy.Pipeline.Extractor.MacStories},
                 {"om.co", Frenzy.Pipeline.Extractor.OmMalik},
                 {"slate.com", Frenzy.Pipeline.Extractor.Slate},
+                {"The Verge", Frenzy.Pipeline.Extractor.TheVerge},
                 {"whatever.scalzi.com", Frenzy.Pipeline.Extractor.WhateverScalzi}
               ]
               |> Enum.map(fn {pretty_name, module} ->