Compare commits
4 Commits
e10a614f3e
...
40984b419d
Author | SHA1 | Date |
---|---|---|
Shadowfacts | 40984b419d | |
Shadowfacts | 33d1cac5e1 | |
Shadowfacts | 26b832b622 | |
Shadowfacts | 0ded09a65d |
|
@ -74,4 +74,6 @@ config :frenzy, Frenzy.Repo,
|
||||||
hostname: "localhost",
|
hostname: "localhost",
|
||||||
pool_size: 10
|
pool_size: 10
|
||||||
|
|
||||||
|
config :tesla, Tesla.Middleware.Logger, debug: false
|
||||||
|
|
||||||
import_config "dev.secret.exs"
|
import_config "dev.secret.exs"
|
||||||
|
|
|
@ -1,50 +1,75 @@
|
||||||
defmodule Frenzy.Network do
|
defmodule Frenzy.Network do
|
||||||
require Logger
|
require Logger
|
||||||
|
|
||||||
@http_redirect_codes [301, 302]
|
defmodule HTTP do
|
||||||
|
use Tesla
|
||||||
|
|
||||||
@spec http_get(String.t()) :: {:ok, HTTPoison.Response.t()} | {:error, term()}
|
adapter(Tesla.Adapter.Hackney)
|
||||||
|
|
||||||
|
plug Tesla.Middleware.Logger, log_level: &log_level/1
|
||||||
|
plug Tesla.Middleware.FollowRedirects
|
||||||
|
|
||||||
|
# can't use JSON middleware currently, because feed_parser expects to parse the raw body data itself
|
||||||
|
# plug Tesla.Middleware.JSON
|
||||||
|
plug Tesla.Middleware.Timeout, timeout: 10_000
|
||||||
|
|
||||||
|
def log_level(env) do
|
||||||
|
case env.status do
|
||||||
|
code when code >= 400 -> :warn
|
||||||
|
_ -> :debug
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
@spec http_get(String.t()) :: Tesla.Env.result()
|
||||||
def http_get(url) do
|
def http_get(url) do
|
||||||
case HTTPoison.get(url) do
|
HTTP.get(url)
|
||||||
{:ok, %HTTPoison.Response{status_code: 200} = response} ->
|
|
||||||
{:ok, response}
|
|
||||||
|
|
||||||
{:ok, %HTTPoison.Response{status_code: status_code, headers: headers}}
|
|
||||||
when status_code in @http_redirect_codes ->
|
|
||||||
headers
|
|
||||||
|> Enum.find(fn {name, _value} -> name == "location" end)
|
|
||||||
|> case do
|
|
||||||
{"location", new_url} ->
|
|
||||||
new_url =
|
|
||||||
case URI.parse(new_url) do
|
|
||||||
%URI{host: nil, path: path} ->
|
|
||||||
# relative path
|
|
||||||
%URI{URI.parse(url) | path: path} |> URI.to_string()
|
|
||||||
|
|
||||||
uri ->
|
|
||||||
uri
|
|
||||||
end
|
end
|
||||||
|
|
||||||
Logger.debug("Got 301 redirect from #{url} to #{new_url}")
|
# @http_redirect_codes [301, 302]
|
||||||
http_get(new_url)
|
|
||||||
|
|
||||||
_ ->
|
# @spec http_get(String.t()) :: {:ok, HTTPoison.Response.t()} | {:error, term()}
|
||||||
{:error, "Missing Location header for redirect"}
|
# def http_get(url) do
|
||||||
end
|
# case HTTPoison.get(url) do
|
||||||
|
# {:ok, %HTTPoison.Response{status_code: 200} = response} ->
|
||||||
|
# {:ok, response}
|
||||||
|
|
||||||
{:ok, %HTTPoison.Response{status_code: 403}} ->
|
# {:ok, %HTTPoison.Response{status_code: status_code, headers: headers}}
|
||||||
{:error, "403 Forbidden"}
|
# when status_code in @http_redirect_codes ->
|
||||||
|
# headers
|
||||||
|
# |> Enum.find(fn {name, _value} -> String.downcase(name) == "location" end)
|
||||||
|
# |> case do
|
||||||
|
# {_, new_url} ->
|
||||||
|
# new_url =
|
||||||
|
# case URI.parse(new_url) do
|
||||||
|
# %URI{host: nil, path: path} ->
|
||||||
|
# # relative path
|
||||||
|
# %URI{URI.parse(url) | path: path} |> URI.to_string()
|
||||||
|
|
||||||
{:ok, %HTTPoison.Response{status_code: 404}} ->
|
# uri ->
|
||||||
{:error, "404 Not Found"}
|
# uri
|
||||||
|
# end
|
||||||
|
|
||||||
{:ok, %HTTPoison.Response{status_code: status_code}} ->
|
# Logger.debug("Got 301 redirect from #{url} to #{new_url}")
|
||||||
{:error, "HTTP #{status_code}"}
|
# http_get(new_url)
|
||||||
|
|
||||||
{:error, error} ->
|
# _ ->
|
||||||
{:error, error}
|
# {:error, "Missing Location header for redirect"}
|
||||||
end
|
# end
|
||||||
end
|
|
||||||
|
# {:ok, %HTTPoison.Response{status_code: 403}} ->
|
||||||
|
# {:error, "403 Forbidden"}
|
||||||
|
|
||||||
|
# {:ok, %HTTPoison.Response{status_code: 404}} ->
|
||||||
|
# {:error, "404 Not Found"}
|
||||||
|
|
||||||
|
# {:ok, %HTTPoison.Response{status_code: status_code}} ->
|
||||||
|
# {:error, "HTTP #{status_code}"}
|
||||||
|
|
||||||
|
# {:error, error} ->
|
||||||
|
# {:error, error}
|
||||||
|
# end
|
||||||
|
# end
|
||||||
|
|
||||||
@gemini_success_codes 20..29
|
@gemini_success_codes 20..29
|
||||||
@gemini_redirect_codes 30..39
|
@gemini_redirect_codes 30..39
|
||||||
|
|
|
@ -20,15 +20,55 @@ defmodule Frenzy.Pipeline.Extractor.WhateverScalzi do
|
||||||
defp get_article_content(html_tree) do
|
defp get_article_content(html_tree) do
|
||||||
# there's no element that contains only the post content
|
# there's no element that contains only the post content
|
||||||
# .postarea contains the headline, post content, social media buttons, and comments
|
# .postarea contains the headline, post content, social media buttons, and comments
|
||||||
with [{_tag, _attrs, postarea_children} | _] <- Floki.find(html_tree, ".postarea"),
|
case Floki.find(html_tree, ".postarea") do
|
||||||
{_before_headline, [_headline | rest]} <-
|
[{_tag, _attrs, postarea_children}] ->
|
||||||
Enum.split_while(postarea_children, fn {tag, _attrs, _children} -> tag != "h1" end),
|
Enum.split_while(postarea_children, fn
|
||||||
{article_content, _rest} <-
|
{"h1", _, _} -> true
|
||||||
Enum.split_while(rest, fn {tag, attrs, _children} ->
|
_ -> false
|
||||||
tag != "div" || !({"id", "jp-post-flair"} in attrs)
|
end)
|
||||||
end) do
|
|> case do
|
||||||
article_content
|
{_before_headline, [_headline | rest]} ->
|
||||||
|
{article_content, _rest} =
|
||||||
|
Enum.split_while(rest, fn
|
||||||
|
{"div", attrs, _} = el ->
|
||||||
|
class = Floki.attribute(el, "class") |> List.first()
|
||||||
|
|
||||||
|
if {"id", "comments"} in attrs do
|
||||||
|
false
|
||||||
else
|
else
|
||||||
|
is_nil(class) || !String.contains?(class, "sharedaddy")
|
||||||
|
end
|
||||||
|
|
||||||
|
_ ->
|
||||||
|
true
|
||||||
|
end)
|
||||||
|
|
||||||
|
article_content
|
||||||
|
|> Floki.map(fn
|
||||||
|
{"img", attrs} = el ->
|
||||||
|
class = Enum.find(attrs, fn {k, _} -> k == "class" end)
|
||||||
|
class = if is_nil(class), do: nil, else: elem(class, 1)
|
||||||
|
|
||||||
|
if !is_nil(class) && String.contains?(class, "jetpack-lazy-image") do
|
||||||
|
{
|
||||||
|
"img",
|
||||||
|
Enum.filter(attrs, fn
|
||||||
|
{"srcset", _} -> false
|
||||||
|
_ -> true
|
||||||
|
end)
|
||||||
|
}
|
||||||
|
else
|
||||||
|
el
|
||||||
|
end
|
||||||
|
|
||||||
|
el ->
|
||||||
|
el
|
||||||
|
end)
|
||||||
|
|
||||||
|
_ ->
|
||||||
|
nil
|
||||||
|
end
|
||||||
|
|
||||||
_ ->
|
_ ->
|
||||||
nil
|
nil
|
||||||
end
|
end
|
||||||
|
|
|
@ -70,7 +70,7 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
url
|
url
|
||||||
|> Network.http_get()
|
|> Network.http_get()
|
||||||
|> case do
|
|> case do
|
||||||
{:ok, response} ->
|
{:ok, %Tesla.Env{status: code} = response} when code in 200..299 ->
|
||||||
handle_response(url, response, opts)
|
handle_response(url, response, opts)
|
||||||
|
|
||||||
{:error, reason} ->
|
{:error, reason} ->
|
||||||
|
@ -80,16 +80,28 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
|
|
||||||
defp get_article_content(_url, _opts), do: {:error, "URL must be a non-empty string"}
|
defp get_article_content(_url, _opts), do: {:error, "URL must be a non-empty string"}
|
||||||
|
|
||||||
@spec handle_response(String.t(), HTTPoison.Response.t(), map()) ::
|
@spec handle_response(String.t(), Tesla.Env.t(), map()) ::
|
||||||
{:ok, String.t()} | {:error, String.t()}
|
{:ok, String.t()} | {:error, String.t()}
|
||||||
defp handle_response(url, %HTTPoison.Response{body: body}, opts) do
|
defp handle_response(url, %Tesla.Env{body: body}, opts) do
|
||||||
case opts["extractor"] do
|
case opts["extractor"] do
|
||||||
"builtin" ->
|
"builtin" ->
|
||||||
{:ok, Readability.article(body)}
|
{:ok, Readability.article(body)}
|
||||||
|
|
||||||
module_name ->
|
module_name ->
|
||||||
html_tree = Floki.parse(body)
|
html_tree = Floki.parse(body)
|
||||||
|
|
||||||
|
try do
|
||||||
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
|
apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
|
||||||
|
rescue
|
||||||
|
e ->
|
||||||
|
Logger.error(
|
||||||
|
"Encountered error extracting article content from '#{url}' with #{module_name}, falling back to default"
|
||||||
|
)
|
||||||
|
|
||||||
|
Logger.error(Exception.format(:error, e, __STACKTRACE__))
|
||||||
|
|
||||||
|
{:ok, Readability.article(body)}
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|> case do
|
|> case do
|
||||||
{:ok, html} ->
|
{:ok, html} ->
|
||||||
|
@ -143,10 +155,10 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
absolute_url = URI.merge(site_uri, src) |> to_string()
|
absolute_url = URI.merge(site_uri, src) |> to_string()
|
||||||
|
|
||||||
case Network.http_get(absolute_url) do
|
case Network.http_get(absolute_url) do
|
||||||
{:ok, %HTTPoison.Response{body: body, headers: headers}} ->
|
{:ok, %Tesla.Env{body: body, headers: headers}} ->
|
||||||
Enum.find(headers, fn {header, _value} -> header == "Content-Type" end)
|
Enum.find(headers, fn {header, _value} -> String.downcase(header) == "content-type" end)
|
||||||
|> case do
|
|> case do
|
||||||
{"Content-Type", content_type} when content_type in @content_type_allowlist ->
|
{_, content_type} when content_type in @content_type_allowlist ->
|
||||||
"data:#{content_type};base64,#{Base.encode64(body)}"
|
"data:#{content_type};base64,#{Base.encode64(body)}"
|
||||||
|
|
||||||
_ ->
|
_ ->
|
||||||
|
|
|
@ -46,7 +46,7 @@ defmodule Frenzy.Task.FetchFavicon do
|
||||||
|
|
||||||
defp fetch_favicon_url_from_webpage(url) when is_binary(url) do
|
defp fetch_favicon_url_from_webpage(url) when is_binary(url) do
|
||||||
case Network.http_get(url) do
|
case Network.http_get(url) do
|
||||||
{:ok, %HTTPoison.Response{body: body, status_code: code}} when code in 200..299 ->
|
{:ok, %Tesla.Env{body: body, status: code}} when code in 200..299 ->
|
||||||
extract_favicon_url(url, body)
|
extract_favicon_url(url, body)
|
||||||
|
|
||||||
{:error, reason} ->
|
{:error, reason} ->
|
||||||
|
@ -109,7 +109,7 @@ defmodule Frenzy.Task.FetchFavicon do
|
||||||
Logger.debug("Fetching favicon from: '#{favicon_url}'")
|
Logger.debug("Fetching favicon from: '#{favicon_url}'")
|
||||||
|
|
||||||
case Network.http_get(favicon_url) do
|
case Network.http_get(favicon_url) do
|
||||||
{:ok, %HTTPoison.Response{body: body, status_code: code}} when code in 200..299 ->
|
{:ok, %Tesla.Env{body: body, status: code}} when code in 200..299 ->
|
||||||
{:ok, "data:image/png;base64,#{Base.encode64(body)}"}
|
{:ok, "data:image/png;base64,#{Base.encode64(body)}"}
|
||||||
|
|
||||||
{:error, reason} ->
|
{:error, reason} ->
|
||||||
|
|
|
@ -104,8 +104,8 @@ defmodule Frenzy.UpdateFeeds do
|
||||||
defp update_feed_http(feed) do
|
defp update_feed_http(feed) do
|
||||||
case Network.http_get(feed.feed_url) do
|
case Network.http_get(feed.feed_url) do
|
||||||
{:ok,
|
{:ok,
|
||||||
%HTTPoison.Response{
|
%Tesla.Env{
|
||||||
status_code: 200,
|
status: 200,
|
||||||
body: body,
|
body: body,
|
||||||
headers: headers
|
headers: headers
|
||||||
}} ->
|
}} ->
|
||||||
|
|
2
mix.exs
2
mix.exs
|
@ -43,8 +43,8 @@ defmodule Frenzy.MixProject do
|
||||||
{:gettext, "~> 0.11"},
|
{:gettext, "~> 0.11"},
|
||||||
{:jason, "~> 1.0"},
|
{:jason, "~> 1.0"},
|
||||||
{:plug_cowboy, "~> 2.3"},
|
{:plug_cowboy, "~> 2.3"},
|
||||||
{:httpoison, "~> 1.8.0"},
|
|
||||||
{:hackney, "1.17.4"},
|
{:hackney, "1.17.4"},
|
||||||
|
{:tesla, "~> 1.4.0"},
|
||||||
{:feed_parser,
|
{:feed_parser,
|
||||||
git: "https://git.shadowfacts.net/shadowfacts/feed_parser.git", branch: "master"},
|
git: "https://git.shadowfacts.net/shadowfacts/feed_parser.git", branch: "master"},
|
||||||
{:timex, "~> 3.6"},
|
{:timex, "~> 3.6"},
|
||||||
|
|
22
mix.lock
22
mix.lock
|
@ -110,11 +110,6 @@
|
||||||
{:hex, :html_entities, "0.4.0",
|
{:hex, :html_entities, "0.4.0",
|
||||||
"f2fee876858cf6aaa9db608820a3209e45a087c5177332799592142b50e89a6b", [:mix], [], "hexpm",
|
"f2fee876858cf6aaa9db608820a3209e45a087c5177332799592142b50e89a6b", [:mix], [], "hexpm",
|
||||||
"3e3d7156a272950373ce5a4018b1490bea26676f8d6a7d409f6fac8568b8cb9a"},
|
"3e3d7156a272950373ce5a4018b1490bea26676f8d6a7d409f6fac8568b8cb9a"},
|
||||||
httpoison:
|
|
||||||
{:hex, :httpoison, "1.8.0",
|
|
||||||
"6b85dea15820b7804ef607ff78406ab449dd78bed923a49c7160e1886e987a3d", [:mix],
|
|
||||||
[{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm",
|
|
||||||
"28089eaa98cf90c66265b6b5ad87c59a3729bea2e74e9d08f9b51eb9729b3c3a"},
|
|
||||||
idna:
|
idna:
|
||||||
{:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d",
|
{:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d",
|
||||||
[:rebar3],
|
[:rebar3],
|
||||||
|
@ -236,6 +231,23 @@
|
||||||
{:hex, :telemetry, "0.4.2",
|
{:hex, :telemetry, "0.4.2",
|
||||||
"2808c992455e08d6177322f14d3bdb6b625fbcfd233a73505870d8738a2f4599", [:rebar3], [], "hexpm",
|
"2808c992455e08d6177322f14d3bdb6b625fbcfd233a73505870d8738a2f4599", [:rebar3], [], "hexpm",
|
||||||
"2d1419bd9dda6a206d7b5852179511722e2b18812310d304620c7bd92a13fcef"},
|
"2d1419bd9dda6a206d7b5852179511722e2b18812310d304620c7bd92a13fcef"},
|
||||||
|
tesla:
|
||||||
|
{:hex, :tesla, "1.4.0", "1081bef0124b8bdec1c3d330bbe91956648fb008cf0d3950a369cda466a31a87",
|
||||||
|
[:mix],
|
||||||
|
[
|
||||||
|
{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: true]},
|
||||||
|
{:exjsx, ">= 3.0.0", [hex: :exjsx, repo: "hexpm", optional: true]},
|
||||||
|
{:finch, "~> 0.3", [hex: :finch, repo: "hexpm", optional: true]},
|
||||||
|
{:fuse, "~> 2.4", [hex: :fuse, repo: "hexpm", optional: true]},
|
||||||
|
{:gun, "~> 1.3", [hex: :gun, repo: "hexpm", optional: true]},
|
||||||
|
{:hackney, "~> 1.6", [hex: :hackney, repo: "hexpm", optional: true]},
|
||||||
|
{:ibrowse, "~> 4.4.0", [hex: :ibrowse, repo: "hexpm", optional: true]},
|
||||||
|
{:jason, ">= 1.0.0", [hex: :jason, repo: "hexpm", optional: true]},
|
||||||
|
{:mime, "~> 1.0", [hex: :mime, repo: "hexpm", optional: false]},
|
||||||
|
{:mint, "~> 1.0", [hex: :mint, repo: "hexpm", optional: true]},
|
||||||
|
{:poison, ">= 1.0.0", [hex: :poison, repo: "hexpm", optional: true]},
|
||||||
|
{:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: true]}
|
||||||
|
], "hexpm", "bf1374a5569f5fca8e641363b63f7347d680d91388880979a33bc12a6eb3e0aa"},
|
||||||
timex:
|
timex:
|
||||||
{:hex, :timex, "3.6.1", "efdf56d0e67a6b956cc57774353b0329c8ab7726766a11547e529357ffdc1d56",
|
{:hex, :timex, "3.6.1", "efdf56d0e67a6b956cc57774353b0329c8ab7726766a11547e529357ffdc1d56",
|
||||||
[:mix],
|
[:mix],
|
||||||
|
|
Loading…
Reference in New Issue