defmodule Frenzy.Pipeline.Extractor.ArsTechnica do
  @moduledoc """
  Extractor for https://arstechnica.com

  Handles multi-page articles
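
  Example (a minimal sketch; obtaining `article_html` is assumed and not
  shown here):

      html_tree = Floki.parse(article_html)

      case Frenzy.Pipeline.Extractor.ArsTechnica.extract(html_tree) do
        {:ok, pages} -> pages
        {:error, reason} -> reason
      end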
"""
|
|
|
|
  require Logger
  alias Frenzy.Network
  alias Frenzy.Pipeline.Extractor
  @behaviour Extractor

  @impl Extractor
  def extract(html_tree) do
    case get_pages_from_tree(html_tree) do
      {:error, _} = err -> err
      content -> {:ok, content}
    end
  end

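  # Walks the article tree page by page. Returns a list of Floki content
  # nodes (one per page) or {:error, reason} when the expected schema.org
  # NewsArticle markup is missing.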
defp get_pages_from_tree(tree) do
|
|
with [article | _] <- Floki.find(tree, ~s([itemtype="http://schema.org/NewsArticle"])),
|
|
[content | _] <- Floki.find(article, ~s([itemprop=articleBody])) do
|
|
content = clean_content(content)
|
|
|
|
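      # Ars Technica splits long articles across several pages; the last link
      # in the page-numbers footer is the "Next" link when more pages remain.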
      next_page_url =
        with [next | _] <- Floki.find(article, ".page-numbers a:last-of-type"),
             "Next" <> _ <- Floki.text(next),
             [href] <- Floki.attribute(next, "href") do
          href
        else
          _ ->
            nil
        end

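      # Fetch and recurse into the next page; if fetching or parsing fails,
      # append a placeholder paragraph noting the article was truncated.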
      if next_page_url != nil do
        with body when not is_nil(body) <- fetch_page(next_page_url),
             next_pages when is_list(next_pages) <- get_pages_from_tree(Floki.parse(body)) do
          [content | next_pages]
        else
          _ ->
            [
              content,
              {"p", [], [{"em", [], ["Article truncated, unable to scrape subsequent pages"]}]}
            ]
        end
      else
        [content]
      end
    else
      _ -> {:error, "no matching elements"}
    end
  end

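  # Removes share buttons, the story sidebar, ad wrappers, and "enlarge"
  # links on figure captions from the article body.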
  defp clean_content(tree) do
    Floki.filter_out(tree, ".social-left, .story-sidebar, .ad_wrapper, figcaption .enlarge-link")
  end

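  # Fetches a single page, returning the response body on a 2xx status and
  # nil otherwise; failures are logged rather than raised.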
  defp fetch_page(url) do
    Logger.debug("Getting Ars Technica page from #{url}")

    case Network.http_get(url) do
      {:ok, %Tesla.Env{status: code, body: body}} when code in 200..299 ->
        body

      {:ok, %Tesla.Env{status: code}} ->
        Logger.warn("Unexpected HTTP code #{code} getting Ars Technica page #{url}")
        nil

      {:error, reason} ->
        Logger.error("Couldn't get Ars Technica page #{url}: #{inspect(reason)}")
        nil
    end
  end
end