From cfd9f7505a91f82dee8cbad730e5c0a87bd705c1 Mon Sep 17 00:00:00 2001 From: Shadowfacts Date: Thu, 31 Oct 2019 17:38:16 -0400 Subject: [PATCH] Rewrite image URLs without hosts to use the host of the article URL --- lib/frenzy/pipeline/scrape_stage.ex | 45 +++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/lib/frenzy/pipeline/scrape_stage.ex b/lib/frenzy/pipeline/scrape_stage.ex index 0caadcd..c6d2363 100644 --- a/lib/frenzy/pipeline/scrape_stage.ex +++ b/lib/frenzy/pipeline/scrape_stage.ex @@ -61,23 +61,29 @@ defmodule Frenzy.Pipeline.ScrapeStage do @spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) :: {:ok, String.t()} | {:error, String.t()} - defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do + defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do case extractor do "builtin" -> - article = Readability.article(body) - {:ok, Readability.readable_html(article)} + {:ok, Readability.article(body)} module_name -> html_tree = Floki.parse(body) + apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree]) + end + |> case do + {:ok, html} -> + html = Floki.map(html, rewrite_image_urls(URI.parse(url))) - case apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree]) do - {:error, _} = err -> - err + case extractor do + "builtin" -> + {:ok, Readability.readable_html(html)} - {:ok, html_tree} -> - html_tree - |> Floki.raw_html() + _ -> + {:ok, Floki.raw_html(html)} end + + res -> + res end end @@ -110,4 +116,25 @@ defmodule Frenzy.Pipeline.ScrapeStage do defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do {:error, "No handler for response #{inspect(response)}"} end + + # Generates a helper function for the article with the given URI that takes an HTML element and, + # if it's an element whose src attribute does not have a hostname, adds the hostname and + # scheme to the element. + defp rewrite_image_urls(%URI{host: host, scheme: scheme}) do + fn + {"img", [{"src", src} | attrs]} = elem -> + case URI.parse(src) do + %URI{host: nil, path: path} -> + new_src = URI.to_string(%URI{path: path, host: host, scheme: scheme}) + + {"img", [{"src", new_src} | attrs]} + + _ -> + elem + end + + elem -> + elem + end + end end