# lib/frenzy/pipeline/extractor/ars_technica.ex
#
# Companion change in lib/frenzy_web/live/configure_stage/scrape_stage_live.ex:
# this module is registered in FrenzyWeb.ConfigureStage.ScrapeStageLive's
# `@extractors` list as {"Ars Technica", Frenzy.Pipeline.Extractor.ArsTechnica},
# between the "512 Pixels" and "beckyhansmeyer.com" entries.
defmodule Frenzy.Pipeline.Extractor.ArsTechnica do
  @moduledoc """
  Extractor for https://arstechnica.com.

  Handles multi-page articles: when a page's pagination ends with a "Next"
  link, that page is fetched and its article body is appended to the result.
  If a following page cannot be fetched or does not contain an article body,
  the content gathered so far is returned, followed by a short
  "Article truncated" notice.
  """

  require Logger
  alias Frenzy.Network
  alias Frenzy.Pipeline.Extractor
  @behaviour Extractor

  # Upper bound on how many "Next" links are followed for one article. Together
  # with the visited-URL set this guarantees that extraction terminates even if
  # the site's pagination is broken or loops back on itself.
  @max_followed_pages 50

  @doc """
  Extracts the article body from a parsed Ars Technica page.

  Returns `{:ok, nodes}`, where `nodes` is a list holding the cleaned article
  body element of every page that could be scraped, or an `{:error, reason}`
  tuple when the given tree has no article body.
  """
  @impl Extractor
  def extract(html_tree) do
    case get_pages_from_tree(html_tree, MapSet.new()) do
      {:error, _} = err -> err
      content -> {:ok, content}
    end
  end

  # Returns the list of article-body nodes for `tree` and every page reachable
  # through its "Next" link, or `{:error, "no matching elements"}` when `tree`
  # itself has no article body. `visited` is the set of page URLs that have
  # already been fetched for this article.
  defp get_pages_from_tree(tree, visited) do
    with [article | _] <- Floki.find(tree, ~s([itemtype="http://schema.org/NewsArticle"])),
         [content | _] <- Floki.find(article, ~s([itemprop=articleBody])) do
      content = clean_content(content)

      case next_page_url(article) do
        nil -> [content]
        url -> [content | get_following_pages(url, visited)]
      end
    else
      _ -> {:error, "no matching elements"}
    end
  end

  # Returns the href of the pagination's last link when its text starts with
  # "Next", otherwise `nil`. Leading whitespace in the link text is ignored so
  # that markup formatting does not defeat the prefix match.
  defp next_page_url(article) do
    with [next | _] <- Floki.find(article, ".page-numbers a:last-of-type"),
         "Next" <> _ <- next |> Floki.text() |> String.trim_leading(),
         [href] <- Floki.attribute(next, "href") do
      href
    else
      _ -> nil
    end
  end

  # Fetches `url` and returns the article-body nodes of that page and of all
  # pages after it. Whenever the remaining pages cannot be obtained (pagination
  # loop, page limit reached, HTTP failure, or no article body on the fetched
  # page) a single truncation notice is returned in their place.
  defp get_following_pages(url, visited) do
    cond do
      MapSet.member?(visited, url) ->
        Logger.warn("Ars Technica pagination loops back to #{url}, not following it again")
        [truncation_notice()]

      MapSet.size(visited) >= @max_followed_pages ->
        Logger.warn(
          "Stopped following Ars Technica pagination after #{@max_followed_pages} pages, next was #{url}"
        )

        [truncation_notice()]

      true ->
        # NOTE(review): Floki.parse/1 is kept as in the original; newer Floki
        # releases favour parse_document/1 — confirm the project's Floki
        # version before migrating.
        with body when not is_nil(body) <- fetch_page(url),
             pages when is_list(pages) <-
               get_pages_from_tree(Floki.parse(body), MapSet.put(visited, url)) do
          pages
        else
          _ -> [truncation_notice()]
        end
    end
  end

  # Paragraph node appended when later pages of an article could not be scraped.
  defp truncation_notice do
    {"p", [], [{"em", [], ["Article truncated, unable to scrape subsequent pages"]}]}
  end

  # Removes the elements matching the listed selectors (social/sidebar/ad
  # wrappers and the "enlarge" links inside figure captions) from the body.
  defp clean_content(tree) do
    Floki.filter_out(tree, ".social-left, .story-sidebar, .ad_wrapper, figcaption .enlarge-link")
  end

  # Fetches `url` and returns the response body for a 2xx response. Any other
  # status code or a transport error is logged and `nil` is returned.
  defp fetch_page(url) do
    Logger.debug("Getting Ars Technica page from #{url}")

    case Network.http_get(url) do
      {:ok, %Tesla.Env{status: code, body: body}} when code in 200..299 ->
        body

      {:ok, %Tesla.Env{status: code}} ->
        Logger.warn("Unexpected HTTP code #{code} getting Ars Technica page #{url}")
        nil

      {:error, reason} ->
        Logger.error("Couldn't get Ars Technica page #{url}: #{inspect(reason)}")
        nil
    end
  end
end