From 5990d0e4c260727226dc5ea464794e51ff9dbd8e Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Fri, 3 Sep 2021 16:24:35 -0400 Subject: [PATCH] Add Slate extractor --- lib/frenzy/pipeline/extractor/slate.ex | 39 +++++++++++++++++++ lib/frenzy/pipeline/scrape_stage.ex | 8 +--- .../live/configure_stage/scrape_stage_live.ex | 1 + 3 files changed, 41 insertions(+), 7 deletions(-) create mode 100644 lib/frenzy/pipeline/extractor/slate.ex diff --git a/lib/frenzy/pipeline/extractor/slate.ex b/lib/frenzy/pipeline/extractor/slate.ex new file mode 100644 index 0000000..ba69e3f --- /dev/null +++ b/lib/frenzy/pipeline/extractor/slate.ex @@ -0,0 +1,39 @@ +defmodule Frenzy.Pipeline.Extractor.Slate do + @moduledoc """ + Extractor for https://slate.com + """ + + alias Frenzy.Pipeline.Extractor + @behaviour Extractor + + @impl Extractor + def extract(html_tree) do + case get_article_content(html_tree) do + nil -> + {:error, "no matching elements"} + + elem -> + {:ok, elem} + end + end + + defp get_article_content(html_tree) do + case Floki.find(html_tree, ".article__content") do + [el] -> + article_content = Floki.filter_out(el, ".slate-ad, .in-article-recirc, .social-share") + + image = Floki.find(html_tree, ".article__top-image img") + + case image do + [] -> + article_content + + [image | _] -> + [image, article_content] + end + + _ -> + nil + end + end +end diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex index a5830d3..a15434d 100644 --- a/lib/frenzy/pipeline/scrape_stage.ex +++ b/lib/frenzy/pipeline/scrape_stage.ex @@ -113,13 +113,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url))) - case opts["extractor"] do - "builtin" -> - {:ok, Readability.readable_html(html)} - - _ -> - {:ok, Floki.raw_html(html)} - end + {:ok, Readability.readable_html(html)} res -> res diff --git a/lib/frenzy_web/live/configure_stage/scrape_stage_live.ex b/lib/frenzy_web/live/configure_stage/scrape_stage_live.ex index 0b68275..93012de 100644 --- a/lib/frenzy_web/live/configure_stage/scrape_stage_live.ex +++ b/lib/frenzy_web/live/configure_stage/scrape_stage_live.ex @@ -9,6 +9,7 @@ defmodule FrenzyWeb.ConfigureStage.ScrapeStageLive do {"finertech.com", Frenzy.Pipeline.Extractor.FinerTech}, {"macstories.net", Frenzy.Pipeline.Extractor.MacStories}, {"om.co", Frenzy.Pipeline.Extractor.OmMalik}, + {"slate.com", Frenzy.Pipeline.Extractor.Slate}, {"whatever.scalzi.com", Frenzy.Pipeline.Extractor.WhateverScalzi} ] |> Enum.map(fn {pretty_name, module} ->