Filter more things out of Slate and The Verge

This commit is contained in:
Shadowfacts 2023-06-25 14:06:23 -07:00
parent 6dd4f3ca82
commit 1f94e9080d
2 changed files with 5 additions and 2 deletions

View File

@@ -21,7 +21,10 @@ defmodule Frenzy.Pipeline.Extractor.Slate do
case Floki.find(html_tree, ".article__content") do
[el] ->
article_content =
Floki.filter_out(el, ".slate-ad, .in-article-recirc, .social-share, .newsletter-signup")
Floki.filter_out(
el,
".slate-ad, .in-article-recirc, .social-share, .newsletter-signup, .recirc-line, .product"
)
image = Floki.find(html_tree, ".article__top-image img")

View File

@@ -16,7 +16,7 @@ defmodule Frenzy.Pipeline.Extractor.TheVerge do
html_tree
|> Floki.find("article#content > div:not(.duet--article--lede)")
|> Floki.filter_out(
".duet--layout--rail, .duet--article--article-pullquote, .duet--article--comments-join-the-conversation"
".duet--layout--rail, .duet--article--article-pullquote, .duet--article--comments-join-the-conversation, .duet--recirculation--related-list, .duet--article--comments-button"
)
{:ok, image ++ content}