Compare commits

..

No commits in common. "162ba74dde2b165f3da8e55aa4519416ea202292" and "56d7579e2c449b522692cf4e644a989fd51a459a" have entirely different histories.

5 changed files with 10 additions and 48 deletions

View File

@ -1,39 +0,0 @@
defmodule Frenzy.Pipeline.Extractor.Slate do
  @moduledoc """
  Content extractor for articles published on https://slate.com.

  The article body is taken from the single `.article__content` element of the
  page, with ad, recirculation and social-share elements filtered out. When a
  top image (`.article__top-image img`) is present, it is placed before the body.
  """

  alias Frenzy.Pipeline.Extractor

  @behaviour Extractor

  @doc """
  Extracts the article content from a parsed Slate page.

  Returns `{:ok, content}` where `content` is either the filtered article
  element or a two-element list `[top_image, article]`. Returns
  `{:error, "no matching elements"}` when the page does not contain exactly one
  `.article__content` element.
  """
  @impl Extractor
  def extract(html_tree) do
    html_tree
    |> article_content()
    |> to_result()
  end

  # Converts the nil-or-content value produced by article_content/1 into the
  # tagged tuple returned from extract/1.
  defp to_result(nil), do: {:error, "no matching elements"}
  defp to_result(content), do: {:ok, content}

  # Looks up the article container and hands the matches to assemble/2.
  defp article_content(html_tree) do
    html_tree
    |> Floki.find(".article__content")
    |> assemble(html_tree)
  end

  # Exactly one container matched: filter out the unwanted elements, then put
  # the first top image (if any) in front of the body.
  defp assemble([container], html_tree) do
    body = Floki.filter_out(container, ".slate-ad, .in-article-recirc, .social-share")

    case Floki.find(html_tree, ".article__top-image img") do
      [] ->
        body

      [top_image | _] ->
        [top_image, body]
    end
  end

  # Zero or several containers matched: signal "nothing usable" with nil.
  defp assemble(_matches, _html_tree), do: nil
end

View File

@ -113,7 +113,13 @@ defmodule Frenzy.Pipeline.ScrapeStage do
html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url))) html = Floki.map(html, rewrite_image_urls(convert_to_data_uris, URI.parse(url)))
{:ok, Readability.readable_html(html)} case opts["extractor"] do
"builtin" ->
{:ok, Readability.readable_html(html)}
_ ->
{:ok, Floki.raw_html(html)}
end
res -> res ->
res res

View File

@ -3,12 +3,10 @@ defmodule Frenzy.Task.CreateItem do
use Task use Task
alias Frenzy.Repo alias Frenzy.Repo
@spec start_link(Frenzy.Feed.t(), FeedParser.Item.t()) :: {:ok, pid()}
def start_link(feed, entry) do def start_link(feed, entry) do
Task.start_link(__MODULE__, :run, [feed, entry]) Task.start_link(__MODULE__, :run, [feed, entry])
end end
@spec run(Frenzy.Feed.t(), FeedParser.Item.t()) :: :ok
def run(feed, entry) do def run(feed, entry) do
Logger.metadata(item_task_id: generate_task_id()) Logger.metadata(item_task_id: generate_task_id())
@ -33,7 +31,7 @@ defmodule Frenzy.Task.CreateItem do
title: entry.title, title: entry.title,
url: url, url: url,
date: date, date: date,
creator: entry.creator, creator: "",
content: entry.content, content: entry.content,
# we assume text/html in the feed itself, other stages may alter this # we assume text/html in the feed itself, other stages may alter this
content_type: "text/html" content_type: "text/html"
@ -93,8 +91,6 @@ defmodule Frenzy.Task.CreateItem do
Logger.error(changeset) Logger.error(changeset)
end end
end end
:ok
end end
defp get_real_url(entry) do defp get_real_url(entry) do

View File

@ -9,7 +9,6 @@ defmodule FrenzyWeb.ConfigureStage.ScrapeStageLive do
{"finertech.com", Frenzy.Pipeline.Extractor.FinerTech}, {"finertech.com", Frenzy.Pipeline.Extractor.FinerTech},
{"macstories.net", Frenzy.Pipeline.Extractor.MacStories}, {"macstories.net", Frenzy.Pipeline.Extractor.MacStories},
{"om.co", Frenzy.Pipeline.Extractor.OmMalik}, {"om.co", Frenzy.Pipeline.Extractor.OmMalik},
{"slate.com", Frenzy.Pipeline.Extractor.Slate},
{"whatever.scalzi.com", Frenzy.Pipeline.Extractor.WhateverScalzi} {"whatever.scalzi.com", Frenzy.Pipeline.Extractor.WhateverScalzi}
] ]
|> Enum.map(fn {pretty_name, module} -> |> Enum.map(fn {pretty_name, module} ->

View File

@ -15,7 +15,7 @@
"ecto_sql": {:hex, :ecto_sql, "3.4.4", "d28bac2d420f708993baed522054870086fd45016a9d09bb2cd521b9c48d32ea", [:mix], [{:db_connection, "~> 2.2", [hex: :db_connection, repo: "hexpm", optional: false]}, {:ecto, "~> 3.4.3", [hex: :ecto, repo: "hexpm", optional: false]}, {:myxql, "~> 0.3.0 or ~> 0.4.0", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.15.0", [hex: :postgrex, repo: "hexpm", optional: true]}, {:tds, "~> 2.1.0", [hex: :tds, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "edb49af715dd72f213b66adfd0f668a43c17ed510b5d9ac7528569b23af57fe8"}, "ecto_sql": {:hex, :ecto_sql, "3.4.4", "d28bac2d420f708993baed522054870086fd45016a9d09bb2cd521b9c48d32ea", [:mix], [{:db_connection, "~> 2.2", [hex: :db_connection, repo: "hexpm", optional: false]}, {:ecto, "~> 3.4.3", [hex: :ecto, repo: "hexpm", optional: false]}, {:myxql, "~> 0.3.0 or ~> 0.4.0", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.15.0", [hex: :postgrex, repo: "hexpm", optional: true]}, {:tds, "~> 2.1.0", [hex: :tds, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "edb49af715dd72f213b66adfd0f668a43c17ed510b5d9ac7528569b23af57fe8"},
"elixir_make": {:hex, :elixir_make, "0.5.2", "96a28c79f5b8d34879cd95ebc04d2a0d678cfbbd3e74c43cb63a76adf0ee8054", [:mix], [], "hexpm", "382eeea8e02dfe6c468f6729b6cf20fe5b14390671d38c7363e59621c7ab4efc"}, "elixir_make": {:hex, :elixir_make, "0.5.2", "96a28c79f5b8d34879cd95ebc04d2a0d678cfbbd3e74c43cb63a76adf0ee8054", [:mix], [], "hexpm", "382eeea8e02dfe6c468f6729b6cf20fe5b14390671d38c7363e59621c7ab4efc"},
"erlex": {:hex, :erlex, "0.2.4", "23791959df45fe8f01f388c6f7eb733cc361668cbeedd801bf491c55a029917b", [:mix], [], "hexpm", "4a12ebc7cd8f24f2d0fce93d279fa34eb5068e0e885bb841d558c4d83c52c439"}, "erlex": {:hex, :erlex, "0.2.4", "23791959df45fe8f01f388c6f7eb733cc361668cbeedd801bf491c55a029917b", [:mix], [], "hexpm", "4a12ebc7cd8f24f2d0fce93d279fa34eb5068e0e885bb841d558c4d83c52c439"},
"feed_parser": {:git, "https://git.shadowfacts.net/shadowfacts/feed_parser.git", "13394e38f6cf378e0e5789ea5b471d63d5b3794b", [branch: "master"]}, "feed_parser": {:git, "https://git.shadowfacts.net/shadowfacts/feed_parser.git", "b8de34c436855b453b1e4ced1fa3659c5a35ac95", [branch: "master"]},
"file_system": {:hex, :file_system, "0.2.6", "fd4dc3af89b9ab1dc8ccbcc214a0e60c41f34be251d9307920748a14bf41f1d3", [:mix], [], "hexpm", "0d50da6b04c58e101a3793b1600f9a03b86e3a8057b192ac1766013d35706fa6"}, "file_system": {:hex, :file_system, "0.2.6", "fd4dc3af89b9ab1dc8ccbcc214a0e60c41f34be251d9307920748a14bf41f1d3", [:mix], [], "hexpm", "0d50da6b04c58e101a3793b1600f9a03b86e3a8057b192ac1766013d35706fa6"},
"floki": {:hex, :floki, "0.23.0", "956ab6dba828c96e732454809fb0bd8d43ce0979b75f34de6322e73d4c917829", [:mix], [{:html_entities, "~> 0.4.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "e680b5ef0b61ce02faa7137db8d1714903a5552be4c89fb57293b8770e7f49c2"}, "floki": {:hex, :floki, "0.23.0", "956ab6dba828c96e732454809fb0bd8d43ce0979b75f34de6322e73d4c917829", [:mix], [{:html_entities, "~> 0.4.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "e680b5ef0b61ce02faa7137db8d1714903a5552be4c89fb57293b8770e7f49c2"},
"gemini": {:git, "https://git.shadowfacts.net/shadowfacts/gemini-ex.git", "cc6f4e04374d163438faae1b12b54809bdfb7f4d", [branch: "main"]}, "gemini": {:git, "https://git.shadowfacts.net/shadowfacts/gemini-ex.git", "cc6f4e04374d163438faae1b12b54809bdfb7f4d", [branch: "main"]},
@ -37,7 +37,7 @@
"phoenix_pubsub": {:hex, :phoenix_pubsub, "2.0.0", "a1ae76717bb168cdeb10ec9d92d1480fec99e3080f011402c0a2d68d47395ffb", [:mix], [], "hexpm", "c52d948c4f261577b9c6fa804be91884b381a7f8f18450c5045975435350f771"}, "phoenix_pubsub": {:hex, :phoenix_pubsub, "2.0.0", "a1ae76717bb168cdeb10ec9d92d1480fec99e3080f011402c0a2d68d47395ffb", [:mix], [], "hexpm", "c52d948c4f261577b9c6fa804be91884b381a7f8f18450c5045975435350f771"},
"plug": {:hex, :plug, "1.10.4", "41eba7d1a2d671faaf531fa867645bd5a3dce0957d8e2a3f398ccff7d2ef017f", [:mix], [{:mime, "~> 1.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "ad1e233fe73d2eec56616568d260777b67f53148a999dc2d048f4eb9778fe4a0"}, "plug": {:hex, :plug, "1.10.4", "41eba7d1a2d671faaf531fa867645bd5a3dce0957d8e2a3f398ccff7d2ef017f", [:mix], [{:mime, "~> 1.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "ad1e233fe73d2eec56616568d260777b67f53148a999dc2d048f4eb9778fe4a0"},
"plug_cowboy": {:hex, :plug_cowboy, "2.3.0", "149a50e05cb73c12aad6506a371cd75750c0b19a32f81866e1a323dda9e0e99d", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "bc595a1870cef13f9c1e03df56d96804db7f702175e4ccacdb8fc75c02a7b97e"}, "plug_cowboy": {:hex, :plug_cowboy, "2.3.0", "149a50e05cb73c12aad6506a371cd75750c0b19a32f81866e1a323dda9e0e99d", [:mix], [{:cowboy, "~> 2.7", [hex: :cowboy, repo: "hexpm", optional: false]}, {:plug, "~> 1.7", [hex: :plug, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "bc595a1870cef13f9c1e03df56d96804db7f702175e4ccacdb8fc75c02a7b97e"},
"plug_crypto": {:hex, :plug_crypto, "1.2.2", "05654514ac717ff3a1843204b424477d9e60c143406aa94daf2274fdd280794d", [:mix], [], "hexpm", "87631c7ad914a5a445f0a3809f99b079113ae4ed4b867348dd9eec288cecb6db"}, "plug_crypto": {:hex, :plug_crypto, "1.1.2", "bdd187572cc26dbd95b87136290425f2b580a116d3fb1f564216918c9730d227", [:mix], [], "hexpm", "6b8b608f895b6ffcfad49c37c7883e8df98ae19c6a28113b02aa1e9c5b22d6b5"},
"poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm", "fec8660eb7733ee4117b85f55799fd3833eb769a6df71ccf8903e8dc5447cfce"}, "poison": {:hex, :poison, "3.1.0", "d9eb636610e096f86f25d9a46f35a9facac35609a7591b3be3326e99a0484665", [:mix], [], "hexpm", "fec8660eb7733ee4117b85f55799fd3833eb769a6df71ccf8903e8dc5447cfce"},
"poolboy": {:hex, :poolboy, "1.5.2", "392b007a1693a64540cead79830443abf5762f5d30cf50bc95cb2c1aaafa006b", [:rebar3], [], "hexpm", "dad79704ce5440f3d5a3681c8590b9dc25d1a561e8f5a9c995281012860901e3"}, "poolboy": {:hex, :poolboy, "1.5.2", "392b007a1693a64540cead79830443abf5762f5d30cf50bc95cb2c1aaafa006b", [:rebar3], [], "hexpm", "dad79704ce5440f3d5a3681c8590b9dc25d1a561e8f5a9c995281012860901e3"},
"postgrex": {:hex, :postgrex, "0.15.5", "aec40306a622d459b01bff890fa42f1430dac61593b122754144ad9033a2152f", [:mix], [{:connection, "~> 1.0", [hex: :connection, repo: "hexpm", optional: false]}, {:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}], "hexpm", "ed90c81e1525f65a2ba2279dbcebf030d6d13328daa2f8088b9661eb9143af7f"}, "postgrex": {:hex, :postgrex, "0.15.5", "aec40306a622d459b01bff890fa42f1430dac61593b122754144ad9033a2152f", [:mix], [{:connection, "~> 1.0", [hex: :connection, repo: "hexpm", optional: false]}, {:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}], "hexpm", "ed90c81e1525f65a2ba2279dbcebf030d6d13328daa2f8088b9661eb9143af7f"},