diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex
index 4e85cc7..73301db 100644
--- a/lib/frenzy/pipeline/scrape_stage.ex
+++ b/lib/frenzy/pipeline/scrape_stage.ex
@@ -10,7 +10,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
         {:ok, %{item_params | content: content}}
 
       {:error, reason} ->
-        Logger.warn("Unable to get article content: #{reason}")
+        Logger.warn("Unable to get article content for #{url}: #{reason}")
         item_params
     end
   end
@@ -47,8 +47,14 @@ defmodule Frenzy.Pipeline.ScrapeStage do
        when status_code in [301, 302] do
-    {"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end)
-
-    Logger.debug("Got 301 redirect from #{url} to #{new_url}")
-    get_article_content(new_url)
+    headers
+    |> Enum.find(fn {name, _value} -> name == "Location" end)
+    |> case do
+      {"Location", new_url} ->
+        Logger.debug("Got 301 redirect from #{url} to #{new_url}")
+        get_article_content(new_url)
+
+      _ ->
+        {:error, "Missing Location header for redirect"}
+    end
   end
 
   defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
diff --git a/lib/frenzy/pipeline/site/daring_fireball_scrape_stage.ex b/lib/frenzy/pipeline/site/daring_fireball_scrape_stage.ex
new file mode 100644
index 0000000..050d77a
--- /dev/null
+++ b/lib/frenzy/pipeline/site/daring_fireball_scrape_stage.ex
@@ -0,0 +1,119 @@
+defmodule Frenzy.Pipeline.Site.DaringFireballScrapeStage do
+  @moduledoc """
+  Pipeline stage that downloads the page an item links to and replaces the item's
+  content with readable HTML extracted from it.
+
+  The content is taken from the first `div.article` element (with everything matching
+  `h1, .dateline, #PreviousNext` filtered out) or, when the page has none, from the
+  first `dl.linkedlist dd` element. On failure a warning is logged.
+  """
+
+  require Logger
+  alias Frenzy.Pipeline.Stage
+  @behaviour Stage
+
+  @impl Stage
+  def apply(_opts, %{url: url} = item_params) do
+    case get_article_content(url) do
+      {:ok, content} ->
+        {:ok, %{item_params | content: content}}
+
+      {:error, reason} ->
+        # NOTE(review): this branch returns the bare params while the success branch
+        # returns an {:ok, params} tuple; confirm which shape the Stage contract expects.
+        Logger.warn("Unable to get Daring Fireball article content for #{url}: #{reason}")
+        item_params
+    end
+  end
+
+  @impl Stage
+  def validate_opts(opts), do: {:ok, opts}
+
+  # Fetches `url` and returns `{:ok, readable_html}` or `{:error, message}`.
+  defp get_article_content(url) when is_binary(url) and url != "" do
+    Logger.debug("Get Daring Fireball article from #{url}")
+
+    url
+    |> HTTPoison.get()
+    |> case do
+      {:ok, response} ->
+        handle_response(url, response)
+
+      {:error, %HTTPoison.Error{reason: reason}} ->
+        {:error, "HTTPoison error: #{reason}"}
+    end
+  end
+
+  defp get_article_content(_url), do: {:error, "URL must be a non-empty string"}
+
+  # Turns an HTTP response into `{:ok, readable_html}` or `{:error, message}`,
+  # following 301/302 redirects.
+  defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}) do
+    html_tree = Floki.parse(body)
+
+    case get_article_element(html_tree) || get_link_element(html_tree) do
+      nil ->
+        {:error, "no matching element"}
+
+      elem ->
+        readable_html =
+          elem
+          |> Floki.filter_out(:comment)
+          |> Readability.readable_html()
+
+        {:ok, readable_html}
+    end
+  end
+
+  defp handle_response(_url, %HTTPoison.Response{status_code: 404}) do
+    {:error, "404 not found"}
+  end
+
+  defp handle_response(url, %HTTPoison.Response{status_code: status_code, headers: headers})
+       when status_code in [301, 302] do
+    # Look the header up with a non-raising match so that a redirect without a
+    # Location header becomes an error tuple instead of a MatchError.
+    headers
+    |> Enum.find(fn {name, _value} -> name == "Location" end)
+    |> case do
+      {"Location", new_url} ->
+        Logger.debug("Got 301 redirect from #{url} to #{new_url}")
+        get_article_content(new_url)
+
+      _ ->
+        {:error, "Missing Location header for redirect"}
+    end
+  end
+
+  defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
+    {:error, "403 Forbidden"}
+  end
+
+  defp handle_response(_url, %HTTPoison.Response{} = response) do
+    {:error, "No handler for response #{inspect(response)}"}
+  end
+
+  # Returns the first `div.article` element with its non-content parts filtered out,
+  # or nil when the page has none.
+  defp get_article_element(html_tree) do
+    case Floki.find(html_tree, "div.article") do
+      [article_elem | _] ->
+        # articles include extra information in the div.article element
+        Floki.filter_out(article_elem, "h1, .dateline, #PreviousNext")
+
+      _ ->
+        nil
+    end
+  end
+
+  # Returns the first `dl.linkedlist dd` element, or nil when the page has none.
+  defp get_link_element(html_tree) do
+    case Floki.find(html_tree, "dl.linkedlist dd") do
+      [dd_elem | _] ->
+        dd_elem
+
+      _ ->
+        nil
+    end
+  end
+end