Compare commits

..

4 Commits

8 changed files with 155 additions and 64 deletions

View File

@ -74,4 +74,6 @@ config :frenzy, Frenzy.Repo,
hostname: "localhost",
pool_size: 10
config :tesla, Tesla.Middleware.Logger, debug: false
import_config "dev.secret.exs"

View File

@ -1,51 +1,76 @@
defmodule Frenzy.Network do
require Logger
@http_redirect_codes [301, 302]
defmodule HTTP do
use Tesla
@spec http_get(String.t()) :: {:ok, HTTPoison.Response.t()} | {:error, term()}
def http_get(url) do
case HTTPoison.get(url) do
{:ok, %HTTPoison.Response{status_code: 200} = response} ->
{:ok, response}
adapter(Tesla.Adapter.Hackney)
{:ok, %HTTPoison.Response{status_code: status_code, headers: headers}}
when status_code in @http_redirect_codes ->
headers
|> Enum.find(fn {name, _value} -> name == "location" end)
|> case do
{"location", new_url} ->
new_url =
case URI.parse(new_url) do
%URI{host: nil, path: path} ->
# relative path
%URI{URI.parse(url) | path: path} |> URI.to_string()
plug Tesla.Middleware.Logger, log_level: &log_level/1
plug Tesla.Middleware.FollowRedirects
uri ->
uri
end
# can't use JSON middleware currently, because feed_parser expects to parse the raw body data itself
# plug Tesla.Middleware.JSON
plug Tesla.Middleware.Timeout, timeout: 10_000
Logger.debug("Got 301 redirect from #{url} to #{new_url}")
http_get(new_url)
_ ->
{:error, "Missing Location header for redirect"}
end
{:ok, %HTTPoison.Response{status_code: 403}} ->
{:error, "403 Forbidden"}
{:ok, %HTTPoison.Response{status_code: 404}} ->
{:error, "404 Not Found"}
{:ok, %HTTPoison.Response{status_code: status_code}} ->
{:error, "HTTP #{status_code}"}
{:error, error} ->
{:error, error}
@doc """
Chooses the log level for a request based on the response status.

Given to `Tesla.Middleware.Logger` through its `log_level:` option.
Returns `:warn` when `env.status` is 400 or above, and `:debug` otherwise.
"""
def log_level(env) do
  if env.status >= 400, do: :warn, else: :debug
end
end
@doc """
Issues an HTTP GET request for `url` by delegating to the nested `HTTP`
client module, and returns that call's result unchanged.
"""
@spec http_get(String.t()) :: Tesla.Env.result()
def http_get(url), do: HTTP.get(url)
# @http_redirect_codes [301, 302]
# @spec http_get(String.t()) :: {:ok, HTTPoison.Response.t()} | {:error, term()}
# def http_get(url) do
# case HTTPoison.get(url) do
# {:ok, %HTTPoison.Response{status_code: 200} = response} ->
# {:ok, response}
# {:ok, %HTTPoison.Response{status_code: status_code, headers: headers}}
# when status_code in @http_redirect_codes ->
# headers
# |> Enum.find(fn {name, _value} -> String.downcase(name) == "location" end)
# |> case do
# {_, new_url} ->
# new_url =
# case URI.parse(new_url) do
# %URI{host: nil, path: path} ->
# # relative path
# %URI{URI.parse(url) | path: path} |> URI.to_string()
# uri ->
# uri
# end
# Logger.debug("Got 301 redirect from #{url} to #{new_url}")
# http_get(new_url)
# _ ->
# {:error, "Missing Location header for redirect"}
# end
# {:ok, %HTTPoison.Response{status_code: 403}} ->
# {:error, "403 Forbidden"}
# {:ok, %HTTPoison.Response{status_code: 404}} ->
# {:error, "404 Not Found"}
# {:ok, %HTTPoison.Response{status_code: status_code}} ->
# {:error, "HTTP #{status_code}"}
# {:error, error} ->
# {:error, error}
# end
# end
@gemini_success_codes 20..29
@gemini_redirect_codes 30..39

View File

@ -20,15 +20,55 @@ defmodule Frenzy.Pipeline.Extractor.WhateverScalzi do
defp get_article_content(html_tree) do
# there's no element that contains only the post content
# .postarea contains the headline, post content, social media buttons, and comments
with [{_tag, _attrs, postarea_children} | _] <- Floki.find(html_tree, ".postarea"),
{_before_headline, [_headline | rest]} <-
Enum.split_while(postarea_children, fn {tag, _attrs, _children} -> tag != "h1" end),
{article_content, _rest} <-
Enum.split_while(rest, fn {tag, attrs, _children} ->
tag != "div" || !({"id", "jp-post-flair"} in attrs)
end) do
article_content
else
case Floki.find(html_tree, ".postarea") do
[{_tag, _attrs, postarea_children}] ->
Enum.split_while(postarea_children, fn
{"h1", _, _} -> true
_ -> false
end)
|> case do
{_before_headline, [_headline | rest]} ->
{article_content, _rest} =
Enum.split_while(rest, fn
{"div", attrs, _} = el ->
class = Floki.attribute(el, "class") |> List.first()
if {"id", "comments"} in attrs do
false
else
is_nil(class) || !String.contains?(class, "sharedaddy")
end
_ ->
true
end)
article_content
|> Floki.map(fn
{"img", attrs} = el ->
class = Enum.find(attrs, fn {k, _} -> k == "class" end)
class = if is_nil(class), do: nil, else: elem(class, 1)
if !is_nil(class) && String.contains?(class, "jetpack-lazy-image") do
{
"img",
Enum.filter(attrs, fn
{"srcset", _} -> false
_ -> true
end)
}
else
el
end
el ->
el
end)
_ ->
nil
end
_ ->
nil
end

View File

@ -70,7 +70,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
url
|> Network.http_get()
|> case do
{:ok, response} ->
{:ok, %Tesla.Env{status: code} = response} when code in 200..299 ->
handle_response(url, response, opts)
{:error, reason} ->
@ -80,16 +80,28 @@ defmodule Frenzy.Pipeline.ScrapeStage do
defp get_article_content(_url, _opts), do: {:error, "URL must be a non-empty string"}
@spec handle_response(String.t(), HTTPoison.Response.t(), map()) ::
@spec handle_response(String.t(), Tesla.Env.t(), map()) ::
{:ok, String.t()} | {:error, String.t()}
defp handle_response(url, %HTTPoison.Response{body: body}, opts) do
defp handle_response(url, %Tesla.Env{body: body}, opts) do
case opts["extractor"] do
"builtin" ->
{:ok, Readability.article(body)}
module_name ->
html_tree = Floki.parse(body)
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
try do
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
rescue
e ->
Logger.error(
"Encountered error extracting article content from '#{url}' with #{module_name}, falling back to default"
)
Logger.error(Exception.format(:error, e, __STACKTRACE__))
{:ok, Readability.article(body)}
end
end
|> case do
{:ok, html} ->
@ -143,10 +155,10 @@ defmodule Frenzy.Pipeline.ScrapeStage do
absolute_url = URI.merge(site_uri, src) |> to_string()
case Network.http_get(absolute_url) do
{:ok, %HTTPoison.Response{body: body, headers: headers}} ->
Enum.find(headers, fn {header, _value} -> header == "Content-Type" end)
{:ok, %Tesla.Env{body: body, headers: headers}} ->
Enum.find(headers, fn {header, _value} -> String.downcase(header) == "content-type" end)
|> case do
{"Content-Type", content_type} when content_type in @content_type_allowlist ->
{_, content_type} when content_type in @content_type_allowlist ->
"data:#{content_type};base64,#{Base.encode64(body)}"
_ ->

View File

@ -46,7 +46,7 @@ defmodule Frenzy.Task.FetchFavicon do
defp fetch_favicon_url_from_webpage(url) when is_binary(url) do
case Network.http_get(url) do
{:ok, %HTTPoison.Response{body: body, status_code: code}} when code in 200..299 ->
{:ok, %Tesla.Env{body: body, status: code}} when code in 200..299 ->
extract_favicon_url(url, body)
{:error, reason} ->
@ -109,7 +109,7 @@ defmodule Frenzy.Task.FetchFavicon do
Logger.debug("Fetching favicon from: '#{favicon_url}'")
case Network.http_get(favicon_url) do
{:ok, %HTTPoison.Response{body: body, status_code: code}} when code in 200..299 ->
{:ok, %Tesla.Env{body: body, status: code}} when code in 200..299 ->
{:ok, "data:image/png;base64,#{Base.encode64(body)}"}
{:error, reason} ->

View File

@ -104,8 +104,8 @@ defmodule Frenzy.UpdateFeeds do
defp update_feed_http(feed) do
case Network.http_get(feed.feed_url) do
{:ok,
%HTTPoison.Response{
status_code: 200,
%Tesla.Env{
status: 200,
body: body,
headers: headers
}} ->

View File

@ -43,8 +43,8 @@ defmodule Frenzy.MixProject do
{:gettext, "~> 0.11"},
{:jason, "~> 1.0"},
{:plug_cowboy, "~> 2.3"},
{:httpoison, "~> 1.8.0"},
{:hackney, "1.17.4"},
{:tesla, "~> 1.4.0"},
{:feed_parser,
git: "https://git.shadowfacts.net/shadowfacts/feed_parser.git", branch: "master"},
{:timex, "~> 3.6"},

View File

@ -110,11 +110,6 @@
{:hex, :html_entities, "0.4.0",
"f2fee876858cf6aaa9db608820a3209e45a087c5177332799592142b50e89a6b", [:mix], [], "hexpm",
"3e3d7156a272950373ce5a4018b1490bea26676f8d6a7d409f6fac8568b8cb9a"},
httpoison:
{:hex, :httpoison, "1.8.0",
"6b85dea15820b7804ef607ff78406ab449dd78bed923a49c7160e1886e987a3d", [:mix],
[{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm",
"28089eaa98cf90c66265b6b5ad87c59a3729bea2e74e9d08f9b51eb9729b3c3a"},
idna:
{:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d",
[:rebar3],
@ -236,6 +231,23 @@
{:hex, :telemetry, "0.4.2",
"2808c992455e08d6177322f14d3bdb6b625fbcfd233a73505870d8738a2f4599", [:rebar3], [], "hexpm",
"2d1419bd9dda6a206d7b5852179511722e2b18812310d304620c7bd92a13fcef"},
tesla:
{:hex, :tesla, "1.4.0", "1081bef0124b8bdec1c3d330bbe91956648fb008cf0d3950a369cda466a31a87",
[:mix],
[
{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: true]},
{:exjsx, ">= 3.0.0", [hex: :exjsx, repo: "hexpm", optional: true]},
{:finch, "~> 0.3", [hex: :finch, repo: "hexpm", optional: true]},
{:fuse, "~> 2.4", [hex: :fuse, repo: "hexpm", optional: true]},
{:gun, "~> 1.3", [hex: :gun, repo: "hexpm", optional: true]},
{:hackney, "~> 1.6", [hex: :hackney, repo: "hexpm", optional: true]},
{:ibrowse, "~> 4.4.0", [hex: :ibrowse, repo: "hexpm", optional: true]},
{:jason, ">= 1.0.0", [hex: :jason, repo: "hexpm", optional: true]},
{:mime, "~> 1.0", [hex: :mime, repo: "hexpm", optional: false]},
{:mint, "~> 1.0", [hex: :mint, repo: "hexpm", optional: true]},
{:poison, ">= 1.0.0", [hex: :poison, repo: "hexpm", optional: true]},
{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: true]}
], "hexpm", "bf1374a5569f5fca8e641363b63f7347d680d91388880979a33bc12a6eb3e0aa"},
timex:
{:hex, :timex, "3.6.1", "efdf56d0e67a6b956cc57774353b0329c8ab7726766a11547e529357ffdc1d56",
[:mix],