Pass parsed HTML trees through the extractor pipeline.

Extractor.extract/1 now takes and returns a Floki.html_tree() instead of an
HTML string. The DaringFireball, EricaSadun and WhateverScale extractors stop
calling Floki.parse/1 and Floki.raw_html/1 themselves. ScrapeStage parses the
body with Floki.parse/1 before dispatching to the extractor module, then
serialises the returned tree with Floki.raw_html/1 and returns {:ok, html},
the same shape as the Readability clause of the same case expression.

NOTE(review): whitespace in this patch was reconstructed assuming standard
two-space `mix format` indentation; confirm against the target files before
applying. The post-image blob id on the scrape_stage.ex index line (0caadcd)
was not recomputed for the amended hunk.

diff --git a/lib/frenzy/pipeline/extractor.ex b/lib/frenzy/pipeline/extractor.ex
index c160c8b..ec15fa9 100644
--- a/lib/frenzy/pipeline/extractor.ex
+++ b/lib/frenzy/pipeline/extractor.ex
@@ -1,3 +1,3 @@
 defmodule Frenzy.Pipeline.Extractor do
-  @callback extract(String.t()) :: {:ok, String.t()} | {:error, String.t()}
+  @callback extract(Floki.html_tree()) :: {:ok, Floki.html_tree()} | {:error, String.t()}
 end
diff --git a/lib/frenzy/pipeline/extractor/daring_fireball.ex b/lib/frenzy/pipeline/extractor/daring_fireball.ex
index 5daf6f5..b5dad1f 100644
--- a/lib/frenzy/pipeline/extractor/daring_fireball.ex
+++ b/lib/frenzy/pipeline/extractor/daring_fireball.ex
@@ -7,15 +7,13 @@ defmodule Frenzy.Pipeline.Extractor.DaringFireball do
   @behaviour Extractor
 
   @impl Extractor
-  def extract(body) do
-    html_tree = Floki.parse(body)
-
+  def extract(html_tree) do
     case get_article_element(html_tree) || get_link_element(html_tree) do
       nil ->
         {:error, "no matching elements"}
 
       elem ->
-        {:ok, Floki.raw_html(elem)}
+        {:ok, elem}
     end
   end
 
diff --git a/lib/frenzy/pipeline/extractor/erica_sadun.ex b/lib/frenzy/pipeline/extractor/erica_sadun.ex
index 3ef44d2..c11b02a 100644
--- a/lib/frenzy/pipeline/extractor/erica_sadun.ex
+++ b/lib/frenzy/pipeline/extractor/erica_sadun.ex
@@ -7,18 +7,14 @@ defmodule Frenzy.Pipeline.Extractor.EricaSadun do
   @behaviour Extractor
 
   @impl Extractor
-  def extract(body) do
-    html_tree = Floki.parse(body)
-
+  def extract(html_tree) do
     case Floki.find(html_tree, ".post-content") do
       [content_elem | _] ->
         # content element includes social media buttons and related posts
-        content =
-          content_elem
-          |> Floki.filter_out("div.sharedaddy, div#jp-relatedposts")
-          |> Floki.raw_html()
-
-        {:ok, content}
+        {
+          :ok,
+          Floki.filter_out(content_elem, "div.sharedaddy, div#jp-relatedposts")
+        }
 
       _ ->
         {:error, "no matching elements"}
diff --git a/lib/frenzy/pipeline/extractor/whatever_scalzi.ex b/lib/frenzy/pipeline/extractor/whatever_scalzi.ex
index 279124a..b600b7d 100644
--- a/lib/frenzy/pipeline/extractor/whatever_scalzi.ex
+++ b/lib/frenzy/pipeline/extractor/whatever_scalzi.ex
@@ -7,15 +7,13 @@ defmodule Frenzy.Pipeline.Extractor.WhateverScale do
   @behaviour Extractor
 
   @impl Extractor
-  def extract(body) do
-    html_tree = Floki.parse(body)
-
+  def extract(html_tree) do
     case get_article_content(html_tree) do
       nil ->
         {:error, "no matching elements"}
 
       elem ->
-        {:ok, Floki.raw_html(elem)}
+        {:ok, elem}
     end
   end
 
diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex
index 61834e0..0caadcd 100644
--- a/lib/frenzy/pipeline/scrape_stage.ex
+++ b/lib/frenzy/pipeline/scrape_stage.ex
@@ -68,7 +68,17 @@ defmodule Frenzy.Pipeline.ScrapeStage do
         {:ok, Readability.readable_html(article)}
 
       module_name ->
-        apply(String.to_existing_atom("Elixir." <> module_name), :extract, [body])
+        html_tree = Floki.parse(body)
+
+        # Extractors take and return a parsed tree; serialise it here and keep the
+        # {:ok, html} shape that the Readability clause above returns.
+        case apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree]) do
+          {:error, _} = err ->
+            err
+
+          {:ok, extracted} ->
+            {:ok, Floki.raw_html(extracted)}
+        end
     end
   end
 