defmodule Frenzy.Pipeline.Extractor.ArsTechnica do
  @moduledoc """
  Extractor for https://arstechnica.com

  Handles multi-page articles.
  """

  require Logger

  alias Frenzy.Network
  alias Frenzy.Pipeline.Extractor

  @behaviour Extractor

  @impl Extractor
  def extract(html_tree) do
    case get_pages_from_tree(html_tree) do
      {:error, _} = err -> err
      content -> {:ok, content}
    end
  end

  # Extracts the article body from the given tree and, when a "Next" pagination
  # link is present, recursively fetches and extracts the remaining pages.
  defp get_pages_from_tree(tree) do
    with [article | _] <- Floki.find(tree, ~s([itemtype="http://schema.org/NewsArticle"])),
         [content | _] <- Floki.find(article, ~s([itemprop=articleBody])) do
      content = clean_content(content)

      # The last pagination link is only a next-page link if its text starts
      # with "Next"; otherwise we're on the final page.
      next_page_url =
        with [next | _] <- Floki.find(article, ".page-numbers a:last-of-type"),
             "Next" <> _ <- Floki.text(next),
             [href] <- Floki.attribute(next, "href") do
          href
        else
          _ -> nil
        end

      if next_page_url != nil do
        with body when not is_nil(body) <- fetch_page(next_page_url),
             next_pages when is_list(next_pages) <- get_pages_from_tree(Floki.parse(body)) do
          [content] ++ next_pages
        else
          # If a subsequent page can't be fetched or parsed, keep what we have
          # and append a visible truncation notice.
          _ ->
            [
              content,
              {"p", [], [{"em", [], ["Article truncated, unable to scrape subsequent pages"]}]}
            ]
        end
      else
        [content]
      end
    else
      _ -> {:error, "no matching elements"}
    end
  end

  # Strips social buttons, sidebars, ad wrappers, and image-enlarge links from
  # the article body.
  defp clean_content(tree) do
    Floki.filter_out(tree, ".social-left, .story-sidebar, .ad_wrapper, figcaption .enlarge-link")
  end

  defp fetch_page(url) do
    Logger.debug("Getting Ars Technica page from #{url}")

    case Network.http_get(url) do
      {:ok, %Tesla.Env{status: code, body: body}} when code in 200..299 ->
        body

      {:ok, %Tesla.Env{status: code}} ->
        Logger.warn("Unexpected HTTP code #{code} getting Ars Technica page #{url}")
        nil

      {:error, reason} ->
        Logger.error("Couldn't get Ars Technica page #{url}: #{inspect(reason)}")
        nil
    end
  end
end
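
# Usage sketch (an illustration, not part of the module): assuming
# Frenzy.Network.http_get/1 returns {:ok, %Tesla.Env{}} tuples as matched in
# fetch_page/1 above, and that Floki.parse/1 returns an HTML tree as it does
# for the recursive call in get_pages_from_tree/1, a caller might do:
#
#     with {:ok, %Tesla.Env{status: 200, body: body}} <- Frenzy.Network.http_get(url),
#          {:ok, pages} <- Frenzy.Pipeline.Extractor.ArsTechnica.extract(Floki.parse(body)) do
#       # `pages` is a list of cleaned Floki content trees, one per article page
#     end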