Add extractor for The Verge

2022-09-14 17:47:22 -04:00 · 2022-09-14 17:47:22 -04:00 · e7184a2535
commit e7184a2535
parent b29c75d7d6
2 changed files with 58 additions and 0 deletions
--- a/lib/frenzy/pipeline/extractor/verge.ex
+++ b/lib/frenzy/pipeline/extractor/verge.ex
@ -0,0 +1,57 @@
+defmodule Frenzy.Pipeline.Extractor.TheVerge do
+  @moduledoc """
+  Extractor for articles published on https://theverge.com.
+
+  The site's new layout is built from non-semantic `duet--*` classes, so both the
+  lede image and the article body have to be located by class name.
+  """
+
+  require Logger
+  alias Frenzy.Pipeline.Extractor
+  @behaviour Extractor
+
+  # Direct children of the article, minus the lede (its image is rebuilt separately).
+  @body_selector "article#content > div:not(.duet--article--lede)"
+
+  # Stripped from the body; going by the class names these are the side rail,
+  # pull quotes and the comments prompt.
+  @clutter_selector ".duet--layout--rail, .duet--article--article-pullquote, .duet--article--comments-join-the-conversation"
+
+  # The lede's figure and the caption element nested inside it.
+  @lede_figure_selector "article#content > .duet--article--lede figure"
+  @caption_selector ".duet--media--caption"
+
+  # Returns `{:ok, nodes}`: the rebuilt lede figure (if any) followed by the body nodes.
+  @impl Extractor
+  def extract(html_tree) do
+    body =
+      html_tree
+      |> Floki.find(@body_selector)
+      |> Floki.filter_out(@clutter_selector)
+
+    {:ok, lede_figure(html_tree) ++ body}
+  end
+
+  # Builds a bare `<figure>` from the first lede figure, or returns `[]` when the
+  # page has no lede figure or that figure contains no `<img>`.
+  @spec lede_figure(Floki.html_tree()) :: Floki.html_tree()
+  defp lede_figure(html_tree) do
+    with [figure | _] <- Floki.find(html_tree, @lede_figure_selector),
+         [img | _] <- Floki.find(figure, "img") do
+      [{"figure", [], [img | figcaption(figure)]}]
+    else
+      _ -> []
+    end
+  end
+
+  # Reduces the first caption element to a text-only `<figcaption>`; `[]` if none.
+  defp figcaption(figure) do
+    case Floki.find(figure, @caption_selector) do
+      [{_tag, _attrs, children} | _] ->
+        [{"figcaption", [], [Floki.text(children)]}]
+
+      _ ->
+        []
+    end
+  end
+end
--- a/lib/frenzy_web/live/configure_stage/scrape_stage_live.ex
+++ b/lib/frenzy_web/live/configure_stage/scrape_stage_live.ex
@ -13,6 +13,7 @@ defmodule FrenzyWeb.ConfigureStage.ScrapeStageLive do
                {"macstories.net", Frenzy.Pipeline.Extractor.MacStories},
                {"om.co", Frenzy.Pipeline.Extractor.OmMalik},
                {"slate.com", Frenzy.Pipeline.Extractor.Slate},
+                {"The Verge", Frenzy.Pipeline.Extractor.TheVerge},
                {"whatever.scalzi.com", Frenzy.Pipeline.Extractor.WhateverScalzi}
              ]
              |> Enum.map(fn {pretty_name, module} ->