/// lol-html text-chunk callback that accumulates the text of the paragraph currently being read.
///
/// Chunks are ignored unless `isInParagraph` is set on the `UserData` behind `userData`.
/// If the accumulated text grows unreasonably large, the current paragraph is abandoned
/// (text cleared, `isInParagraph` reset) so a later paragraph can be tried instead.
///
/// - Parameters:
///   - chunk: The lol-html text chunk handed to this callback.
///   - userData: Pointer to the `UserData` instance shared with the other handlers.
/// - Returns: Always `LOL_HTML_CONTINUE`, so rewriting proceeds.
private func textHandler(chunk: OpaquePointer!, userData: UnsafeMutableRawPointer!) -> lol_html_rewriter_directive_t {
    // Upper bound (in characters) on the text collected for a single paragraph.
    let runawayTextLimit = 1024

    let state = userData.assumingMemoryBound(to: UserData.self)
    guard state.pointee.isInParagraph else {
        return LOL_HTML_CONTINUE
    }

    // Copy the chunk's bytes into a native Swift string. The buffer belongs to lol-html,
    // so a no-copy string could keep referring to memory we do not own after this
    // callback returns. Decoding also repairs invalid UTF-8 (U+FFFD) instead of trapping
    // on a force-unwrap.
    let raw = lol_html_text_chunk_content_get(chunk)
    let bytes = UnsafeRawBufferPointer(start: raw.data, count: raw.len)
    state.pointee.paragraphText += String(decoding: bytes, as: UTF8.self)

    if state.pointee.paragraphText.count >= runawayTextLimit {
        // lol-html seems to get confused by img tags with hundreds of kilobytes of data
        // in their src attributes and returns that data as text even though it's a tag.
        // If the text has reached the limit, we assume that's what happened: abandon
        // this attempt and try again at the next paragraph.
        state.pointee.paragraphText = ""
        state.pointee.isInParagraph = false
    }

    return LOL_HTML_CONTINUE
}