Compare commits

8 Commits: 9e6b185cfd ... cfd9f7505a

Author | SHA1 | Date
---|---|---
Shadowfacts | cfd9f7505a |
Shadowfacts | eec0b918e7 |
Shadowfacts | d476839fce |
Shadowfacts | 6f568a03e1 |
Shadowfacts | 3192969889 |
Shadowfacts | dbef262779 |
Shadowfacts | c113ee08ee |
Shadowfacts | 048e2b755e |
@@ -0,0 +1,3 @@ (new file: Frenzy.Pipeline.Extractor)
+defmodule Frenzy.Pipeline.Extractor do
+  @callback extract(Floki.html_tree()) :: {:ok, Floki.html_tree()} | {:error, String.t()}
+end
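An extractor is any module that implements this callback: it takes a parsed Floki tree and returns either the extracted content subtree or an error. A minimal sketch of an implementation, for a hypothetical site whose article body lives in a div.content element (the module name and selector below are illustrative assumptions, not part of this changeset):

defmodule Frenzy.Pipeline.Extractor.ExampleSite do
  alias Frenzy.Pipeline.Extractor
  @behaviour Extractor

  @impl Extractor
  def extract(html_tree) do
    # "div.content" is an assumed selector for this hypothetical site
    case Floki.find(html_tree, "div.content") do
      [content_elem | _] -> {:ok, content_elem}
      _ -> {:error, "no matching elements"}
    end
  end
end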
@@ -0,0 +1,40 @@ (new file: Frenzy.Pipeline.Extractor.DaringFireball)
+defmodule Frenzy.Pipeline.Extractor.DaringFireball do
+  @moduledoc """
+  Extractor for https://daringfireball.net
+  """
+
+  alias Frenzy.Pipeline.Extractor
+  @behaviour Extractor
+
+  @impl Extractor
+  def extract(html_tree) do
+    case get_article_element(html_tree) || get_link_element(html_tree) do
+      nil ->
+        {:error, "no matching elements"}
+
+      elem ->
+        {:ok, elem}
+    end
+  end
+
+  defp get_article_element(html_tree) do
+    case Floki.find(html_tree, "div.article") do
+      [article_elem | _] ->
+        # articles include extra information in the div.article element
+        Floki.filter_out(article_elem, "h1, .dateline, #PreviousNext")
+
+      _ ->
+        nil
+    end
+  end
+
+  defp get_link_element(html_tree) do
+    case Floki.find(html_tree, "dl.linkedlist dd") do
+      [dd_elem | _] ->
+        dd_elem
+
+      _ ->
+        nil
+    end
+  end
+end
@@ -0,0 +1,23 @@ (new file: Frenzy.Pipeline.Extractor.EricaSadun)
+defmodule Frenzy.Pipeline.Extractor.EricaSadun do
+  @moduledoc """
+  Extractor for https://ericasadun.com
+  """
+
+  alias Frenzy.Pipeline.Extractor
+  @behaviour Extractor
+
+  @impl Extractor
+  def extract(html_tree) do
+    case Floki.find(html_tree, ".post-content") do
+      [content_elem | _] ->
+        # content element includes social media buttons and related posts
+        {
+          :ok,
+          Floki.filter_out(content_elem, "div.sharedaddy, div#jp-relatedposts")
+        }
+
+      _ ->
+        {:error, "no matching elements"}
+    end
+  end
+end
@@ -0,0 +1,30 @@ (new file: Frenzy.Pipeline.Extractor.WhateverScale)
+defmodule Frenzy.Pipeline.Extractor.WhateverScale do
+  @moduledoc """
+  Extractor for https://whatever.scalzi.com
+  """
+
+  alias Frenzy.Pipeline.Extractor
+  @behaviour Extractor
+
+  @impl Extractor
+  def extract(html_tree) do
+    case get_article_content(html_tree) do
+      nil ->
+        {:error, "no matching elements"}
+
+      elem ->
+        {:ok, elem}
+    end
+  end
+
+  defp get_article_content(html_tree) do
+    case Floki.find(html_tree, "article.post > div.entry-content") do
+      [content_elem | _] ->
+        # remove social media buttons that are included in the .entry-content element
+        Floki.filter_out(content_elem, "div#jp-post-flair")
+
+      _ ->
+        nil
+    end
+  end
+end
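All three extractors share the callback's contract, so each can be exercised directly against fetched HTML. A rough sanity check, assuming a variable body holding an HTML string fetched elsewhere (Floki.parse/1 returning the tree directly matches its use in ScrapeStage below):

html_tree = Floki.parse(body)

case Frenzy.Pipeline.Extractor.DaringFireball.extract(html_tree) do
  {:ok, content} -> Floki.raw_html(content)
  {:error, reason} -> reason
end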
@@ -4,8 +4,8 @@ defmodule Frenzy.Pipeline.ScrapeStage do
   @behaviour Stage

   @impl Stage
-  def apply(_opts, %{url: url} = item_params) do
-    case get_article_content(url) do
+  def apply(opts, %{url: url} = item_params) do
+    case get_article_content(url, opts["extractor"]) do
       {:ok, content} ->
         {:ok, %{item_params | content: content}}
@@ -16,17 +16,41 @@ defmodule Frenzy.Pipeline.ScrapeStage do
   end

   @impl Stage
-  def validate_opts(opts), do: {:ok, opts}
+  def validate_opts(opts) when is_map(opts) do
+    # todo: figure out why this errors when an empty map is provided
+    case opts["extractor"] do
+      nil ->
+        {:ok, %{opts | extractor: "builtin"}}
+
+      extractor when not is_binary(extractor) ->
+        {:error, "extractor must be a string"}
+
+      "builtin" ->
+        {:ok, opts}
+
+      extractor ->
+        try do
+          String.to_existing_atom("Elixir." <> extractor)
+          {:ok, opts}
+        rescue
+          ArgumentError ->
+            {:error, "extractor must be \"builtin\" or a module that exists"}
+        end
+    end
+  end
+
+  @impl Stage
+  def validate_opts(_), do: {:error, "options must be a map"}

-  @spec get_article_content(String.t()) :: {:ok, String.t()} | {:error, String.t()}
-  defp get_article_content(url) when is_binary(url) and url != "" do
+  @spec get_article_content(String.t(), String.t()) :: {:ok, String.t()} | {:error, String.t()}
+  defp get_article_content(url, extractor) when is_binary(url) and url != "" do
     Logger.debug("Getting article from #{url}")

     url
     |> HTTPoison.get()
     |> case do
       {:ok, response} ->
-        handle_response(url, response)
+        handle_response(url, response, extractor)

       {:error, %HTTPoison.Error{reason: reason}} ->
         {:error, "HTTPoison error: #{reason}"}
@@ -35,38 +59,82 @@ defmodule Frenzy.Pipeline.ScrapeStage do

   defp get_article_content(_url), do: {:error, "URL must be a non-empty string"}

-  @spec handle_response(String.t(), HTTPoison.Response.t()) ::
+  @spec handle_response(String.t(), HTTPoison.Response.t(), String.t()) ::
           {:ok, String.t()} | {:error, String.t()}
-  defp handle_response(_url, %HTTPoison.Response{status_code: 200, body: body}) do
-    article = Readability.article(body)
-    {:ok, Readability.readable_html(article)}
+  defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}, extractor) do
+    case extractor do
+      "builtin" ->
+        {:ok, Readability.article(body)}
+
+      module_name ->
+        html_tree = Floki.parse(body)
+        apply(String.to_existing_atom("Elixir." <> module_name), :extract, [html_tree])
+    end
+    |> case do
+      {:ok, html} ->
+        html = Floki.map(html, rewrite_image_urls(URI.parse(url)))
+
+        case extractor do
+          "builtin" ->
+            {:ok, Readability.readable_html(html)}
+
+          _ ->
+            {:ok, Floki.raw_html(html)}
+        end
+
+      res ->
+        res
+    end
   end

-  defp handle_response(_url, %HTTPoison.Response{status_code: 404}) do
+  defp handle_response(_url, %HTTPoison.Response{status_code: 404}, _extractor) do
     {:error, "404 not found"}
   end

-  defp handle_response(url, %HTTPoison.Response{status_code: status_code, headers: headers})
+  defp handle_response(
+         url,
+         %HTTPoison.Response{status_code: status_code, headers: headers},
+         extractor
+       )
        when status_code in [301, 302] do
-    {"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end)
-
     headers
     |> Enum.find(fn {name, _value} -> name == "Location" end)
     |> case do
       {"Location", new_url} ->
         Logger.debug("Got 301 redirect from #{url} to #{new_url}")
-        get_article_content(new_url)
+        get_article_content(new_url, extractor)

       _ ->
         {:error, "Missing Location header for redirect"}
     end
   end

-  defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
+  defp handle_response(_url, %HTTPoison.Response{status_code: 403}, _extractor) do
     {:error, "403 Forbidden"}
   end

-  defp handle_response(_url, %HTTPoison.Response{status_code: status_code} = response) do
+  defp handle_response(_url, %HTTPoison.Response{} = response, _extractor) do
     {:error, "No handler for response #{inspect(response)}"}
   end
+
+  # Generates a helper function for the article with the given URI that takes an HTML element and,
+  # if it's an <img> element whose src attribute does not have a hostname, adds the hostname and
+  # scheme to the element.
+  defp rewrite_image_urls(%URI{host: host, scheme: scheme}) do
+    fn
+      {"img", [{"src", src} | attrs]} = elem ->
+        case URI.parse(src) do
+          %URI{host: nil, path: path} ->
+            new_src = URI.to_string(%URI{path: path, host: host, scheme: scheme})
+
+            {"img", [{"src", new_src} | attrs]}
+
+          _ ->
+            elem
+        end
+
+      elem ->
+        elem
+    end
+  end
 end
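For reference, a sketch of how the new "extractor" option is supplied; the value is the extractor's module name without the "Elixir." prefix, matching the String.to_existing_atom("Elixir." <> extractor) calls above:

opts = %{"extractor" => "Frenzy.Pipeline.Extractor.DaringFireball"}
{:ok, _opts} = Frenzy.Pipeline.ScrapeStage.validate_opts(opts)

On the todo in validate_opts/1: %{opts | extractor: "builtin"} uses Elixir's map-update syntax, which raises a KeyError unless the map already contains an :extractor key, so it likely fails for any opts map lacking that atom key, not only an empty one; Map.put(opts, "extractor", "builtin") would be one way around it. This is a suggested diagnosis, not something this changeset addresses.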
@@ -1,105 +0,0 @@ (deleted file: Frenzy.Pipeline.Site.DaringFireballScrapeStage)
-defmodule Frenzy.Pipeline.Site.DaringFireballScrapeStage do
-  require Logger
-  alias Frenzy.Pipeline.Stage
-  @behaviour Stage
-
-  @impl Stage
-  def apply(_opts, %{url: url} = item_params) do
-    case get_article_content(url) do
-      {:ok, content} ->
-        {:ok, %{item_params | content: content}}
-
-      {:error, reason} ->
-        Logger.warn("Unable to get Daring Fireball article content for #{url}: #{reason}")
-        {:ok, item_params}
-    end
-  end
-
-  @impl Stage
-  def validate_opts(opts), do: {:ok, opts}
-
-  @spec get_article_content(String.t()) :: {:ok, String.t()} | {:error, String.t()}
-  defp get_article_content(url) when is_binary(url) and url != "" do
-    Logger.debug("Get Daring Fireball article from #{url}")
-
-    url
-    |> HTTPoison.get()
-    |> case do
-      {:ok, response} ->
-        handle_response(url, response)
-
-      {:error, %HTTPoison.Error{reason: reason}} ->
-        {:error, "HTTPoison error: #{reason}"}
-    end
-  end
-
-  defp get_article_content(_url), do: {:error, "URL must be a non-empty string"}
-
-  @spec handle_response(String.t(), HTTPoison.Response.t()) ::
-          {:ok, String.t()} | {:error, String.t()}
-  defp handle_response(url, %HTTPoison.Response{status_code: 200, body: body}) do
-    html_tree = Floki.parse(body)
-
-    case get_article_element(html_tree) || get_link_element(html_tree) do
-      nil ->
-        {:error, "no matching element"}
-
-      elem ->
-        readable_html =
-          elem
-          |> Floki.filter_out(:comment)
-          |> Readability.readable_html()
-
-        {:ok, readable_html}
-    end
-  end
-
-  defp handle_response(_url, %HTTPoison.Response{status_code: 404}) do
-    {:error, "404 not found"}
-  end
-
-  defp handle_response(url, %HTTPoison.Response{status_code: status_code, headers: headers})
-       when status_code in [301, 302] do
-    {"Location", new_url} = Enum.find(headers, fn {name, _value} -> name == "Location" end)
-
-    headers
-    |> Enum.find(fn {name, _value} -> name == "Location" end)
-    |> case do
-      {"Location", new_url} ->
-        Logger.debug("Got 301 redirect from #{url} to #{new_url}")
-        get_article_content(new_url)
-
-      _ ->
-        {:error, "Missing Location header for redirect"}
-    end
-  end
-
-  defp handle_response(_url, %HTTPoison.Response{status_code: 403}) do
-    {:error, "403 Forbidden"}
-  end
-
-  defp handle_response(_url, %HTTPoison.Response{status_code: status_code} = response) do
-    {:error, "No handler for response #{inspect(response)}"}
-  end
-
-  defp get_article_element(html_tree) do
-    case Floki.find(html_tree, "div.article") do
-      [article_elem | _] ->
-        # articles include extra information in the div.article element
-        Floki.filter_out(article_elem, "h1, .dateline, #PreviousNext")
-
-      _ ->
-        nil
-    end
-  end
-
-  defp get_link_element(html_tree) do
-    case Floki.find(html_tree, "dl.linkedlist dd") do
-      [dd_elem | _] ->
-        dd_elem
-
-      _ ->
-        nil
-    end
-  end
-end
@@ -1,6 +1,6 @@ (FrenzyWeb.AccountController)
 defmodule FrenzyWeb.AccountController do
   use FrenzyWeb, :controller
-  alias Frenzy.{Repo, User, FervorClient, Filter}
+  alias Frenzy.{Repo, User, FervorClient}
   alias FrenzyWeb.Router.Helpers, as: Routes
   alias FrenzyWeb.Endpoint

@@ -125,11 +125,7 @@ defmodule FrenzyWeb.AccountController do
     Enum.each(feeds, fn feed_url ->
       feed_changeset =
         Ecto.build_assoc(group, :feeds, %{
-          feed_url: feed_url,
-          filter: %Filter{
-            mode: "reject",
-            score: 0
-          }
+          feed_url: feed_url
         })

       {:ok, _feed} = Repo.insert(feed_changeset)
@@ -67,7 +67,7 @@ defmodule FrenzyWeb.FeedController do
   end

   def edit(conn, _params) do
-    feed = conn.assigns[:feed]
+    feed = conn.assigns[:feed] |> Repo.preload(:pipeline_stages)
     stages = Enum.sort_by(feed.pipeline_stages, fn stage -> stage.index end)

     render(conn, "edit.html", %{
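The preload added in edit/2 matters because Ecto does not load associations automatically: without it, feed.pipeline_stages is an %Ecto.Association.NotLoaded{} placeholder and the Enum.sort_by/2 call fails. A sketch of the difference (Feed and the feed_id variable are stand-ins here):

feed = Repo.get!(Feed, feed_id)
feed.pipeline_stages
#=> %Ecto.Association.NotLoaded{}

feed = Repo.preload(feed, :pipeline_stages)
feed.pipeline_stages
#=> a list of pipeline stage structs, each with an index field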
@@ -1,6 +1,6 @@ (FrenzyWeb.Fervor.FeedsController)
 defmodule FrenzyWeb.Fervor.FeedsController do
   use FrenzyWeb, :controller
-  alias Frenzy.{Repo, Feed, Filter, Item}
+  alias Frenzy.{Repo, Feed, Item}
   import Ecto.Query
   alias FrenzyWeb.Fervor.Paginator

@@ -85,11 +85,7 @@ defmodule FrenzyWeb.Fervor.FeedsController do
       group ->
         changeset =
           Ecto.build_assoc(group, :feeds, %{
-            feed_url: feed_url,
-            filter: %Filter{
-              mode: "reject",
-              score: 0
-            }
+            feed_url: feed_url
           })

         {:ok, feed} = Repo.insert(changeset)
@@ -17,7 +17,7 @@ (item list template, EEx)
 <%= for item <- @items do %>
   <tr <%= if item.read do %>class="item-read"<% end %>>
     <td>
-      <a href="<%= Routes.item_path(@conn, :show, item.id) %>"><%= item.title %></a>
+      <a href="<%= Routes.item_path(@conn, :show, item.id) %>"><%= item.title || "(Untitled)" %></a>
     </td>
     <td>
       <% {:ok, date} = Timex.format(item.date, "{YYYY}-{M}-{D} {h12}:{m} {AM}") %>
@@ -2,4 +2,9 @@ (stylesheet)
 .item-read a {
   color: #606c76;
 }
+
+blockquote {
+  padding-left: 2rem;
+  border-left: 4px solid lightgray;
+}