Compare commits
No commits in common. "e55a6941943eb55829c9e27b1f2b378d7e2e4460" and "d0c6831d45b1b2bec94e5161ce012def324e235e" have entirely different histories.
e55a694194
...
d0c6831d45
|
@ -32,72 +32,6 @@ defmodule Frenzy.Pipeline.FilterStage do
|
||||||
{:ok, item_params}
|
{:ok, item_params}
|
||||||
end
|
end
|
||||||
|
|
||||||
@impl Stage
|
|
||||||
def validate_opts(opts) when is_map(opts) do
|
|
||||||
cond do
|
|
||||||
not (Map.has_key?(opts, "mode") and is_binary(opts["mode"]) and
|
|
||||||
opts["mode"] in ["accept", "reject"]) ->
|
|
||||||
{:error, "mode must be a string, either 'accept' or 'reject'"}
|
|
||||||
|
|
||||||
not (Map.has_key?(opts, "score") and is_integer(opts["score"])) ->
|
|
||||||
{:error, "score must be an integer"}
|
|
||||||
|
|
||||||
not (Map.has_key?(opts, "rules") and is_list(opts["rules"])) ->
|
|
||||||
{:error, "rules must be a list of rules"}
|
|
||||||
|
|
||||||
true ->
|
|
||||||
validate_rules(opts)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
@impl Stage
|
|
||||||
def validate_opts(_opts), do: {:error, "options must be a map"}
|
|
||||||
|
|
||||||
defp validate_rules(%{"rules" => rules} = opts) do
|
|
||||||
rules
|
|
||||||
|> Enum.with_index()
|
|
||||||
|> Enum.reduce_while(:ok, fn {rule, index}, :ok ->
|
|
||||||
case validate_rule(rule) do
|
|
||||||
:ok ->
|
|
||||||
{:cont, :ok}
|
|
||||||
|
|
||||||
{:error, reason} ->
|
|
||||||
{:halt, {:error, "invalid rule #{index}: #{reason}"}}
|
|
||||||
end
|
|
||||||
end)
|
|
||||||
|> case do
|
|
||||||
:ok ->
|
|
||||||
{:ok, opts}
|
|
||||||
|
|
||||||
{:error, _reason} = err ->
|
|
||||||
err
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
defp validate_rule(rule) do
|
|
||||||
cond do
|
|
||||||
not is_map(rule) ->
|
|
||||||
{:error, "rule must be a map"}
|
|
||||||
|
|
||||||
not (Map.has_key?(rule, "mode") and is_binary(rule["mode"]) and
|
|
||||||
rule["mode"] in ["contains_string", "matches_regex"]) ->
|
|
||||||
{:error, "mode property must be a string, either 'contains_string' or 'matches_regex'"}
|
|
||||||
|
|
||||||
not (Map.has_key?(rule, "property") and is_binary(rule["property"]) and
|
|
||||||
rule["property"] in ["url", "title", "author"]) ->
|
|
||||||
{:error, "property property must be a string, either 'url', 'title', or 'author'"}
|
|
||||||
|
|
||||||
not (Map.has_key?(rule, "param") and is_binary(rule["param"])) ->
|
|
||||||
{:error, "param property must be a string"}
|
|
||||||
|
|
||||||
not (Map.has_key?(rule, "weight") and is_integer(rule["weight"])) ->
|
|
||||||
{:error, "weight property must be an integer"}
|
|
||||||
|
|
||||||
true ->
|
|
||||||
:ok
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
defp test(
|
defp test(
|
||||||
%{"mode" => mode, "property" => property, "param" => param, "weight" => weight},
|
%{"mode" => mode, "property" => property, "param" => param, "weight" => weight},
|
||||||
item_params
|
item_params
|
||||||
|
|
|
@ -10,14 +10,11 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
{:ok, %{item_params | content: content}}
|
{:ok, %{item_params | content: content}}
|
||||||
|
|
||||||
{:error, reason} ->
|
{:error, reason} ->
|
||||||
Logger.warn("Unable to get article content for #{url}: #{reason}")
|
Logger.warn("Unable to get article content: #{reason}")
|
||||||
item_params
|
item_params
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@impl Stage
|
|
||||||
def validate_opts(opts), do: {:ok, opts}
|
|
||||||
|
|
||||||
defp get_article_content(url) when is_binary(url) and url != "" do
|
defp get_article_content(url) when is_binary(url) and url != "" do
|
||||||
Logger.debug("Getting article from #{url}")
|
Logger.debug("Getting article from #{url}")
|
||||||
|
|
||||||
|
@ -47,16 +44,8 @@ defmodule Frenzy.Pipeline.ScrapeStage do
|
||||||
when status_code in [301, 302] do
|
when status_code in [301, 302] do
|
||||||
{"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end)
|
{"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end)
|
||||||
|
|
||||||
headers
|
|
||||||
|> Enum.find(fn {name, _value} -> name == "Location" end)
|
|
||||||
|> case do
|
|
||||||
{"Location", new_url} ->
|
|
||||||
Logger.debug("Got 301 redirect from #{url} to #{new_url}")
|
Logger.debug("Got 301 redirect from #{url} to #{new_url}")
|
||||||
get_article_content(new_url)
|
get_article_content(new_url)
|
||||||
|
|
||||||
_ ->
|
|
||||||
{:error, "Missing Location header for redirect"}
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
|
||||||
defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
|
defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
|
||||||
|
|
|
@ -1,102 +0,0 @@
|
||||||
defmodule Frenzy.Pipeline.Site.DaringFireballScrapeStage do
|
|
||||||
require Logger
|
|
||||||
alias Frenzy.Pipeline.Stage
|
|
||||||
@behaviour Stage
|
|
||||||
|
|
||||||
@impl Stage
|
|
||||||
def apply(_opts, %{url: url} = item_params) do
|
|
||||||
case get_article_content(url) do
|
|
||||||
{:ok, content} ->
|
|
||||||
{:ok, %{item_params | content: content}}
|
|
||||||
|
|
||||||
{:error, reason} ->
|
|
||||||
Logger.warn("Unable to get Daring Fireball article content for #{url}: #{reason}")
|
|
||||||
item_params
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
@impl Stage
|
|
||||||
def validate_opts(opts), do: {:ok, opts}
|
|
||||||
|
|
||||||
defp get_article_content(url) when is_binary(url) and url != "" do
|
|
||||||
Logger.debug("Get Daring Fireball article from #{url}")
|
|
||||||
|
|
||||||
url
|
|
||||||
|> HTTPoison.get()
|
|
||||||
|> case do
|
|
||||||
{:ok, response} ->
|
|
||||||
handle_response(url, response)
|
|
||||||
|
|
||||||
{:error, %HTTPoison.Error{reason: reason}} ->
|
|
||||||
{:error, "HTTPoison error: #{reason}"}
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
defp get_article_content(_url), do: {:error, "URL must be a non-empty string"}
|
|
||||||
|
|
||||||
defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}) do
|
|
||||||
html_tree = Floki.parse(body)
|
|
||||||
|
|
||||||
case get_article_element(html_tree) || get_link_element(html_tree) do
|
|
||||||
nil ->
|
|
||||||
{:error, "no matching element"}
|
|
||||||
|
|
||||||
elem ->
|
|
||||||
readable_html =
|
|
||||||
elem
|
|
||||||
|> Floki.filter_out(:comment)
|
|
||||||
|> Readability.readable_html()
|
|
||||||
|
|
||||||
{:ok, readable_html}
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
defp handle_response(_url, %HTTPoison.Response{status_code: 404}) do
|
|
||||||
{:error, "404 not found"}
|
|
||||||
end
|
|
||||||
|
|
||||||
defp handle_response(url, %HTTPoison.Response{status_code: status_code, headers: headers})
|
|
||||||
when status_code in [301, 302] do
|
|
||||||
{"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end)
|
|
||||||
|
|
||||||
headers
|
|
||||||
|> Enum.find(fn {name, _value} -> name == "Location" end)
|
|
||||||
|> case do
|
|
||||||
{"Location", new_url} ->
|
|
||||||
Logger.debug("Got 301 redirect from #{url} to #{new_url}")
|
|
||||||
get_article_content(new_url)
|
|
||||||
|
|
||||||
_ ->
|
|
||||||
{:error, "Missing Location header for redirect"}
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
|
|
||||||
{:error, "403 Forbidden"}
|
|
||||||
end
|
|
||||||
|
|
||||||
defp handle_response(_url, %HTTPoison.Response{status_code: status_code} = response) do
|
|
||||||
{:error, "No handler for response #{inspect(response)}"}
|
|
||||||
end
|
|
||||||
|
|
||||||
defp get_article_element(html_tree) do
|
|
||||||
case Floki.find(html_tree, "div.article") do
|
|
||||||
[article_elem | _] ->
|
|
||||||
# articles include extra information in the div.article element
|
|
||||||
Floki.filter_out(article_elem, "h1, .dateline, #PreviousNext")
|
|
||||||
|
|
||||||
_ ->
|
|
||||||
nil
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
defp get_link_element(html_tree) do
|
|
||||||
case Floki.find(html_tree, "dl.linkedlist dd") do
|
|
||||||
[dd_elem | _] ->
|
|
||||||
dd_elem
|
|
||||||
|
|
||||||
_ ->
|
|
||||||
nil
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
|
@ -1,5 +1,3 @@
|
||||||
defmodule Frenzy.Pipeline.Stage do
|
defmodule Frenzy.Pipeline.Stage do
|
||||||
@callback apply(Map.t(), Map.t()) :: {:ok, Map.t()} | :tombstone | {:error, String.t()}
|
@callback apply(Map.t(), Map.t()) :: {:ok, Map.t()} | :tombstone | {:error, String.t()}
|
||||||
|
|
||||||
@callback validate_opts(Map.t()) :: {:ok, Map.t()} | {:error, String.t()}
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -65,11 +65,7 @@ defmodule FrenzyWeb.PipelineController do
|
||||||
feed = conn.assigns[:feed]
|
feed = conn.assigns[:feed]
|
||||||
stage = conn.assigns[:stage]
|
stage = conn.assigns[:stage]
|
||||||
|
|
||||||
with {:ok, options} <- Jason.decode(options_json),
|
with {:ok, options} <- Jason.decode(options_json) do
|
||||||
{:ok, options} <-
|
|
||||||
apply(String.to_existing_atom("Elixir." <> stage.module_name), :validate_opts, [
|
|
||||||
options
|
|
||||||
]) do
|
|
||||||
changeset = PipelineStage.changeset(stage, %{options: options})
|
changeset = PipelineStage.changeset(stage, %{options: options})
|
||||||
{:ok, _stage} = Repo.update(changeset)
|
{:ok, _stage} = Repo.update(changeset)
|
||||||
|
|
||||||
|
@ -116,9 +112,7 @@ defmodule FrenzyWeb.PipelineController do
|
||||||
feed = conn.assigns[:feed]
|
feed = conn.assigns[:feed]
|
||||||
|
|
||||||
with {index, _} <- Integer.parse(index),
|
with {index, _} <- Integer.parse(index),
|
||||||
module_atom <- String.to_existing_atom("Elixir." <> module_name),
|
{:ok, options} <- Jason.decode(options_json) do
|
||||||
{:ok, options} <- Jason.decode(options_json),
|
|
||||||
{:ok, options} <- apply(module_atom, :validate_opts, [options]) do
|
|
||||||
changeset =
|
changeset =
|
||||||
Ecto.build_assoc(feed, :pipeline_stages, %{
|
Ecto.build_assoc(feed, :pipeline_stages, %{
|
||||||
index: index,
|
index: index,
|
||||||
|
|
Loading…
Reference in New Issue