frenzy/lib/frenzy/pipeline/extractor/verge.ex

defmodule Frenzy.Pipeline.Extractor.TheVerge do
  @moduledoc """
  Extractor for https://theverge.com

  Copes with the site's redesigned markup, which exposes almost no semantic
  structure and has to be navigated through its `duet--*` utility classes.
  """

  require Logger
  alias Frenzy.Pipeline.Extractor
  @behaviour Extractor

  # Direct `div` children of the article, skipping the lede (handled separately
  # so the header image can be rebuilt as a clean <figure>).
  @body_selector "article#content > div:not(.duet--article--lede)"

  # Elements stripped from the article body before it is returned.
  @excluded_selector ".duet--layout--rail, .duet--article--article-pullquote, .duet--article--comments-join-the-conversation"

  # Figure elements inside the article lede; only the first match is used.
  @lede_figure_selector "article#content > .duet--article--lede figure"

  @doc """
  Extracts the article from a parsed Verge page.

  Always returns `{:ok, tree}`, where `tree` is the rebuilt header figure (if
  the lede contains one with an image) followed by the article's body `div`s
  with the excluded elements filtered out.
  """
  @impl Extractor
  def extract(html_tree) do
    body =
      html_tree
      |> Floki.find(@body_selector)
      |> Floki.filter_out(@excluded_selector)

    {:ok, extract_header_image(html_tree) ++ body}
  end

  # Builds a minimal `<figure>` from the first figure found in the lede: its
  # first `<img>` node (kept as-is) plus an optional plain-text caption.
  # Returns `[]` when the lede has no figure or the figure has no image.
  @spec extract_header_image(Floki.html_tree()) :: Floki.html_tree()
  defp extract_header_image(html_tree) do
    with [figure | _] <- Floki.find(html_tree, @lede_figure_selector),
         [img | _] <- Floki.find(figure, "img") do
      [{"figure", [], [img | figure_caption(figure)]}]
    else
      _ -> []
    end
  end

  # Returns a one-element list holding a `<figcaption>` whose only child is the
  # text of the figure's first caption element, or `[]` when there is none.
  @spec figure_caption(tuple()) :: Floki.html_tree()
  defp figure_caption(figure) do
    case Floki.find(figure, ".duet--media--caption") do
      [{_tag, _attrs, children} | _] ->
        [{"figcaption", [], [Floki.text(children)]}]

      _ ->
        []
    end
  end
end
Add extractor for The Verge 2022-09-14 21:47:22 +00:00			`defmodule Frenzy.Pipeline.Extractor.TheVerge do`
			`@moduledoc """`
			`Extractor for https://theverge.com`
			`Handles their bizarro new layout that's a pile of unsemantic classes`
			`"""`

			`require Logger`
			`alias Frenzy.Pipeline.Extractor`
			`@behaviour Extractor`

			`@impl Extractor`
			`def extract(html_tree) do`
			`image = extract_header_image(html_tree)`

			`content =`
			`html_tree`
			`\|> Floki.find("article#content > div:not(.duet--article--lede)")`
			`\|> Floki.filter_out(`
			`".duet--layout--rail, .duet--article--article-pullquote, .duet--article--comments-join-the-conversation"`
			`)`

			`{:ok, image ++ content}`
			`end`

			`@spec extract_header_image(Floki.html_tree()) :: Floki.html_tree()`
			`defp extract_header_image(html_tree) do`
			`case Floki.find(html_tree, "article#content > .duet--article--lede figure") do`
			`[figure \| _] ->`
			`img =`
			`case Floki.find(figure, "img") do`
			`[img \| _] ->`
			`[img]`

			`_ ->`
			`nil`
			`end`

			`caption =`
			`case Floki.find(figure, ".duet--media--caption") do`
			`[{_tag, _attrs, children} \| _] ->`
			`[{"figcaption", [], [Floki.text(children)]}]`

			`_ ->`
			`[]`
			`end`

			`if img do`
			`[{"figure", [], img ++ caption}]`
			`else`
			`[]`
			`end`

			`_ ->`
			`[]`
			`end`
			`end`
			`end`