From 1e984ba30edaa81245a8c7721b14db98068ac11d Mon Sep 17 00:00:00 2001
From: Shadowfacts
Date: Wed, 20 May 2020 22:10:50 -0400
Subject: [PATCH] Add Webmention endpoint discovery

---
 config/config.exs                 |   2 +
 lib/clacks/http.ex                |  10 +--
 lib/clacks/webmention/endpoint.ex | 122 ++++++++++++++++++++++++++++++
 mix.exs                           |   4 +-
 mix.lock                          |   2 +
 5 files changed, 130 insertions(+), 10 deletions(-)
 create mode 100644 lib/clacks/webmention/endpoint.ex

diff --git a/config/config.exs b/config/config.exs
index 180c84d..a4567e5 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -40,6 +40,8 @@ config :clacks, Oban,
   prune: {:maxlen, 10_000},
   queues: [federate: 10]
 
+config :floki, :html_parser, Floki.HTMLParser.FastHtml
+
 # Import environment specific config. This must remain at the bottom
 # of this file so it overrides the configuration defined above.
 import_config "#{Mix.env()}.exs"
diff --git a/lib/clacks/http.ex b/lib/clacks/http.ex
index da7c807..1acf9c6 100644
--- a/lib/clacks/http.ex
+++ b/lib/clacks/http.ex
@@ -27,15 +27,7 @@ defmodule Clacks.HTTP do
         |> Enum.find(fn {name, _value} -> String.downcase(name) == "location" end)
         |> case do
           {_, new_url} ->
-            new_url =
-              case URI.parse(new_url) do
-                %URI{host: nil, path: path} ->
-                  # relative path
-                  %URI{URI.parse(url) | path: path} |> URI.to_string()
-
-                uri ->
-                  uri
-              end
+            new_url = URI.merge(URI.parse(url), URI.parse(new_url))
 
             Logger.debug("Got 301 redirect from #{url} to #{new_url}")
             fetch(method, new_url, headers)
diff --git a/lib/clacks/webmention/endpoint.ex b/lib/clacks/webmention/endpoint.ex
new file mode 100644
index 0000000..7395e47
--- /dev/null
+++ b/lib/clacks/webmention/endpoint.ex
@@ -0,0 +1,122 @@
+defmodule Clacks.Webmention.Endpoint do
+  @moduledoc """
+  Webmention endpoint discovery.
+
+  The `Link` headers returned by `Clacks.HTTP.head/1` are searched first for an
+  entry whose `rel` contains `webmention`. If there is none, the document is
+  fetched and the first `<link>` or `<a>` element matching `rel~=webmention`
+  that carries a single `href` is used. Endpoint references are resolved
+  against the URL of the request that produced the response.
+  """
+
+  require Logger
+
+  @doc """
+  Finds the Webmention endpoint advertised by `url`.
+
+  Returns the endpoint as a `URI`, `nil` if none is advertised, or `:error` if
+  the resource could not be fetched or parsed.
+  """
+  @spec find_endpoint(url :: String.t()) :: URI.t() | nil | :error
+  def find_endpoint(url) do
+    # Fall back to the document body only when the headers had no endpoint;
+    # a found endpoint and :error are both returned unchanged.
+    with nil <- find_endpoint_by_header(url) do
+      find_endpoint_by_html(url)
+    end
+  end
+
+  defp find_endpoint_by_header(url) do
+    case Clacks.HTTP.head(url) do
+      {:ok, %HTTPoison.Response{headers: headers, request: %HTTPoison.Request{url: final_url}}} ->
+        headers
+        |> Enum.filter(fn {name, _} -> String.downcase(name) == "link" end)
+        |> webmention_link()
+        |> resolve(final_url)
+
+      {:error, reason} ->
+        log_failure(url, reason)
+        :error
+    end
+  end
+
+  # Returns the URI reference of the first Link header entry whose rel
+  # contains "webmention", or nil if no header has one.
+  defp webmention_link([]), do: nil
+
+  defp webmention_link([{_, value} | rest]) do
+    # Split only on commas that start a new `<...>` entry: a comma is a legal
+    # URI character, so a plain split could cut a target in half.
+    value
+    |> String.split(~r/,\s*(?=<)/)
+    |> Enum.map(&parse_link_header/1)
+    |> Enum.find(fn {rels, _} -> "webmention" in rels end)
+    |> case do
+      nil -> webmention_link(rest)
+      {_, uri_reference} -> uri_reference
+    end
+  end
+
+  # Parses one `<uri-reference>; name=value; ...` entry into
+  # `{rels, uri_reference}`. Malformed entries yield no rels.
+  defp parse_link_header(value) do
+    case Regex.run(~r/^\s*<([^>]*)>(.*)$/, value) do
+      [_, uri_reference, params] ->
+        {link_rels(params), uri_reference}
+
+      nil ->
+        {[], ""}
+    end
+  end
+
+  # Extracts the lower-cased relation types from the parameters of an entry.
+  defp link_rels(params) do
+    params
+    |> String.split(";")
+    |> Enum.map(&parse_link_param/1)
+    |> Enum.find(fn {name, _} -> String.downcase(name) == "rel" end)
+    |> case do
+      # An entry without a rel parameter has no relation types.
+      nil -> []
+      {_, rel} -> rel |> String.downcase() |> String.split(~r/\s+/, trim: true)
+    end
+  end
+
+  # Splits `name=value` into a pair, dropping optional quotes around the value.
+  defp parse_link_param(param) do
+    case String.split(param, "=", parts: 2) do
+      [name, value] -> {String.trim(name), value |> String.trim() |> String.trim("\"")}
+      [name] -> {String.trim(name), ""}
+    end
+  end
+
+  defp find_endpoint_by_html(url) do
+    with {:ok, %HTTPoison.Response{body: body, request: %HTTPoison.Request{url: final_url}}} <-
+           Clacks.HTTP.get(url),
+         {:ok, doc} <- Floki.parse_document(body) do
+      doc
+      |> Floki.find("link[rel~=webmention], a[rel~=webmention]")
+      |> Enum.find_value(fn el ->
+        case Floki.attribute(el, "href") do
+          [href] when is_binary(href) -> href
+          _ -> nil
+        end
+      end)
+      |> resolve(final_url)
+    else
+      {:error, reason} ->
+        log_failure(url, reason)
+        :error
+    end
+  end
+
+  # Resolves a possibly relative endpoint reference against the request URL.
+  defp resolve(nil, _base), do: nil
+  defp resolve(reference, base) when is_binary(reference), do: URI.merge(base, reference)
+
+  defp log_failure(url, reason) do
+    # inspect/1 keeps the log call from raising when reason is a term that
+    # does not implement String.Chars (a tuple or an error struct).
+    Logger.warn("Unable to find Webmention endpoint for '#{url}': #{inspect(reason)}")
+  end
+end
diff --git a/mix.exs b/mix.exs
index 5ea6360..466e1e0 100644
--- a/mix.exs
+++ b/mix.exs
@@ -52,7 +52,9 @@ defmodule Clacks.MixProject do
       {:bcrypt_elixir, "~> 2.0"},
       {:oban, "~> 1.2.0"},
       {:fast_sanitize, "~> 0.1.7"},
-      {:dialyxir, "~> 1.0", only: [:dev], runtime: false}
+      {:fast_html, "~> 1.0.3"},
+      {:dialyxir, "~> 1.0", only: [:dev], runtime: false},
+      {:floki, "~> 0.26.0"}
     ]
   end
 
diff --git a/mix.lock b/mix.lock
index 991c9be..b21e7f6 100644
--- a/mix.lock
+++ b/mix.lock
@@ -19,8 +19,10 @@
   "fast_sanitize": {:hex, :fast_sanitize, "0.1.7", "2a7cd8734c88a2de6de55022104f8a3b87f1fdbe8bbf131d9049764b53d50d0d", [:mix], [{:fast_html, "~> 1.0", [hex: :fast_html, repo: "hexpm", optional: false]}, {:plug, "~> 1.8", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm", "f39fe8ea08fbac17487c30bf09b7d9f3e12472e51fb07a88ffeb8fd17da8ab67"},
   "file_system": {:hex, :file_system, "0.2.7", "e6f7f155970975789f26e77b8b8d8ab084c59844d8ecfaf58cbda31c494d14aa", [:mix], [], "hexpm", "b4cfa2d69c7f0b18fd06db222b2398abeef743a72504e6bd7df9c52f171b047f"},
   "flake_id": {:hex, :flake_id, "0.1.0", "7716b086d2e405d09b647121a166498a0d93d1a623bead243e1f74216079ccb3", [:mix], [{:base62, "~> 1.2", [hex: :base62, repo: "hexpm", optional: false]}, {:ecto, ">= 2.0.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm", "31fc8090fde1acd267c07c36ea7365b8604055f897d3a53dd967658c691bd827"},
+  "floki": {:hex, :floki, "0.26.0", "4df88977e2e357c6720e1b650f613444bfb48c5acfc6a0c646ab007d08ad13bf", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "e7b66ce7feef5518a9cd9fc7b52dd62a64028bd9cb6d6ad282a0f0fc90a4ae52"},
   "gettext": {:hex, :gettext, "0.17.0", "abe21542c831887a2b16f4c94556db9c421ab301aee417b7c4fbde7fbdbe01ec", [:mix], [], "hexpm", "e0b8598e802676c81e66b061a2148c37c03886b24a3ca86a1f98ed40693b94b3"},
   "hackney": {:hex, :hackney, "1.15.2", "07e33c794f8f8964ee86cebec1a8ed88db5070e52e904b8f12209773c1036085", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.5", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm", "e0100f8ef7d1124222c11ad362c857d3df7cb5f4204054f9f0f4a728666591fc"},
+  "html_entities": {:hex, :html_entities, "0.5.1", "1c9715058b42c35a2ab65edc5b36d0ea66dd083767bef6e3edb57870ef556549", [:mix], [], "hexpm", "30efab070904eb897ff05cd52fa61c1025d7f8ef3a9ca250bc4e6513d16c32de"},
   "http_signatures": {:git, "https://git.pleroma.social/pleroma/http_signatures.git", "293d77bb6f4a67ac8bde1428735c3b42f22cbb30", [ref: "293d77bb6f4a67ac8bde1428735c3b42f22cbb30"]},
   "httpoison": {:hex, :httpoison, "1.5.1", "0f55b5b673b03c5c327dac7015a67cb571b99b631acc0bc1b0b98dcd6b9f2104", [:mix], [{:hackney, "~> 1.8", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "191a3b6329c917de4e7ca68431919a59bf19e60694b313a69bc1f56a4cb160bf"},
   "idna": {:hex, :idna, "6.0.0", "689c46cbcdf3524c44d5f3dde8001f364cd7608a99556d8fbd8239a5798d4c10", [:rebar3], [{:unicode_util_compat, "0.4.1", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "4bdd305eb64e18b0273864920695cb18d7a2021f31a11b9c5fbcd9a253f936e2"},