From 6fbda7dc7839262441b3383c7c78e1030a03734d Mon Sep 17 00:00:00 2001
From: Shadowfacts
Date: Sat, 15 Jan 2022 19:31:04 -0500
Subject: [PATCH] Fix excerpt generator getting confused by massive img srcs

---
Reviewer notes (placed after the three-dash separator, so they are not part
of the commit message):
 * textHandler now reads and appends the text chunk only while
   UserData.isInParagraph is set; previously every chunk was appended.
 * Once paragraphText.underestimatedCount reaches 1024, the accumulated text
   is cleared and isInParagraph is reset to false, so the over-long run is
   discarded instead of kept.
 * NOTE(review): the threshold reads underestimatedCount rather than count,
   and the String(bytesNoCopy:...) result is still force-unwrapped; confirm
   both are intended.

 Reader/ExcerptGenerator.swift | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/Reader/ExcerptGenerator.swift b/Reader/ExcerptGenerator.swift
index 9aa681a..96eb4e9 100644
--- a/Reader/ExcerptGenerator.swift
+++ b/Reader/ExcerptGenerator.swift
@@ -120,9 +120,19 @@ private func elementHandler(element: OpaquePointer!, userData: UnsafeMutableRawP
 
 private func textHandler(chunk: OpaquePointer!, userData: UnsafeMutableRawPointer!) -> lol_html_rewriter_directive_t {
     let userDataPtr = userData.assumingMemoryBound(to: UserData.self)
-    let s = lol_html_text_chunk_content_get(chunk)
-    let content = String(bytesNoCopy: UnsafeMutableRawPointer(mutating: s.data), length: s.len, encoding: .utf8, freeWhenDone: false)!
-    userDataPtr.pointee.paragraphText += content
+    if userDataPtr.pointee.isInParagraph {
+        let s = lol_html_text_chunk_content_get(chunk)
+        let content = String(bytesNoCopy: UnsafeMutableRawPointer(mutating: s.data), length: s.len, encoding: .utf8, freeWhenDone: false)!
+        userDataPtr.pointee.paragraphText += content
+        if userDataPtr.pointee.paragraphText.underestimatedCount >= 1024 {
+            // lol-html seems to get confused by img tags with hundreds of kilobytes of data in their src attributes
+            // and returns that data as text even though it's a tag
+            // if the text is over 1024 characters so far, we assume that's what's happened
+            // and abandon this attempt and try again at the next paragraph
+            userDataPtr.pointee.paragraphText = ""
+            userDataPtr.pointee.isInParagraph = false
+        }
+    }
     return LOL_HTML_CONTINUE
 }
 