//
//  ExcerptGenerator.swift
//  Reader
//
//  Created by Shadowfacts on 1/13/22.
//

import Foundation
import OSLog
import CoreData
import Persistence

/// Generates plain-text excerpts for items from the `<p>` elements of their HTML content.
// public so that it can be imported in ReaderTests even when Reader is compiled in release mode (w/ testing disabled)
public struct ExcerptGenerator {
    private init() {}

    // Fall back to a fixed subsystem name instead of crashing when the main bundle has no identifier.
    private static let logger = Logger(subsystem: Bundle.main.bundleIdentifier ?? "Reader", category: "ExcerptGenerator")

    /// Generates excerpts for every item whose `generatedExcerpt` flag is not yet set.
    ///
    /// The work runs inside `perform` on a new background context. Items are fetched newest-first
    /// (by `published`) in batches of 50. Every fetched item is marked `generatedExcerpt = true`,
    /// whether or not an excerpt could be produced for it. When the context has changes it is saved
    /// and the updates are merged into the container's view context.
    ///
    /// - Parameters:
    ///   - fervorController: Supplies the persistent container to read from and write to.
    ///   - setProgress: Invoked from inside the background context's `perform` block each time
    ///     another 50 excerpts have been generated. `current` is the number of excerpts generated
    ///     so far (not the number of items visited) and `total` is the number of fetched items.
    static func generateAll(_ fervorController: FervorController, setProgress: @escaping (_ current: Int, _ total: Int) -> Void) async {
        let req = Item.fetchRequest()
        req.predicate = NSPredicate(format: "generatedExcerpt = NO")
        req.sortDescriptors = [NSSortDescriptor(key: "published", ascending: false)]
        req.fetchBatchSize = 50

        let ctx = fervorController.persistentContainer.newBackgroundContext()
        await ctx.perform {
            let items: [Item]
            do {
                items = try ctx.fetch(req)
            } catch {
                // Surface the failure instead of silently doing nothing.
                logger.error("Unable to fetch items: \(String(describing: error), privacy: .public)")
                return
            }

            var count = 0
            for item in items {
                if let text = excerpt(for: item) {
                    item.excerpt = text
                    count += 1
                    if count % 50 == 0 {
                        logger.debug("Generated \(count, privacy: .public) excerpts")
                        setProgress(count, items.count)
                    }
                }
                // Mark the item as processed even when no excerpt was found, so it is not fetched again.
                item.generatedExcerpt = true
            }

            logger.log("Generated excerpts for \(count, privacy: .public) items")

            guard ctx.hasChanges else {
                return
            }
            do {
                // get the updated objects now, because this set is empty after .save is called
                // mergeChanges(fromRemoteContextSave:into:) is documented to take object IDs, so collect those.
                let updatedIDs = ctx.updatedObjects.map(\.objectID)
                try ctx.save()
                // make sure the view context has the newly added excerpts
                NSManagedObjectContext.mergeChanges(fromRemoteContextSave: [
                    NSUpdatedObjectsKey: updatedIDs
                ], into: [fervorController.persistentContainer.viewContext])
            } catch {
                logger.error("Unable to save context: \(String(describing: error), privacy: .public)")
            }
        }
    }

    /// Returns an excerpt generated from the item's `content`.
    ///
    /// - Returns: `nil` when the item has no content or no excerpt could be extracted from it.
    public static func excerpt(for item: Item) -> String? {
        guard let content = item.content else {
            return nil
        }
        return excerpt(from: content)
    }

    /// Extracts the text of a `<p>` element from `html` using lol-html.
    ///
    /// Text is accumulated from the first `<p>` element. If the accumulated text reaches 1024
    /// characters, it is discarded and collection restarts at the next `<p>` element.
    ///
    /// - Parameter html: The HTML source to scan.
    /// - Returns: The collected paragraph text, HTML-unescaped and trimmed of surrounding whitespace
    ///   and newlines, or `nil` when no paragraph was being collected at the time parsing finished.
    public static func excerpt(from html: String) -> String? {
        // Nothing to parse; this also avoids relying on a non-nil base address for an empty buffer.
        guard !html.isEmpty else {
            return nil
        }
        // withUTF8 is mutating, so a local mutable copy is required.
        var html = html

        let builder = lol_html_rewriter_builder_new()!
        let pSelector = lol_html_selector_parse("p", 1)!
        var userData = UserData()
        withUnsafeMutablePointer(to: &userData) { userDataPtr in
            // The same state pointer is handed to both the element handler and the text handler.
            let rawPtr = UnsafeMutableRawPointer(userDataPtr)
            let res = lol_html_rewriter_builder_add_element_content_handlers(builder, pSelector, elementHandler, rawPtr, nil, nil, textHandler, rawPtr)
            guard res == 0 else {
                lolHtmlError()
            }

            let memSettings = lol_html_memory_settings_t(preallocated_parsing_buffer_size: 1024, max_allowed_memory_usage: .max)
            let rewriter = lol_html_rewriter_build(builder, "utf-8", 5, memSettings, outputSink, nil, true)
            lol_html_rewriter_builder_free(builder)
            lol_html_selector_free(pSelector)
            guard let rewriter = rewriter else {
                lolHtmlError()
            }
            // Release the rewriter on every exit from this closure.
            defer { lol_html_rewriter_free(rewriter) }

            // The write result is deliberately discarded.
            // NOTE(review): presumably because stopping early via LOL_HTML_STOP is reported as a failed write - confirm.
            _ = html.withUTF8 { utf8 in
                utf8.withMemoryRebound(to: CChar.self) { bytes in
                    lol_html_rewriter_write(rewriter, bytes.baseAddress!, bytes.count)
                }
            }
        }

        guard userData.isInParagraph else {
            return nil
        }
        // todo: steal css whitespace collapsing from tusker
        return userData.paragraphText.htmlUnescape().trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Takes the last error recorded by lol-html and terminates the process with its message.
    private static func lolHtmlError() -> Never {
        let lastError = lol_html_take_last_error()
        // If no message is available, fall back to a generic one rather than crashing on the unwrap.
        guard let data = lastError.data else {
            fatalError("Unknown lol-html error")
        }
        let message = String(bytesNoCopy: UnsafeMutableRawPointer(mutating: data), length: lastError.len, encoding: .utf8, freeWhenDone: false)
        fatalError(message ?? "Unknown lol-html error")
    }
}

/// Mutable state shared between the lol-html callbacks through their user-data pointer.
private struct UserData {
    /// Whether text chunks are currently being collected for a paragraph.
    var isInParagraph = false
    /// The text collected so far for the current paragraph.
    var paragraphText = ""
}

/// Element handler for the `p` selector.
///
/// Returns `LOL_HTML_STOP` when a paragraph is already being collected. Otherwise it records
/// whether the matched element's tag name is `p`/`P` and lets the rewriter continue.
private func elementHandler(element: OpaquePointer!, userData: UnsafeMutableRawPointer!) -> lol_html_rewriter_directive_t {
    let state = userData.assumingMemoryBound(to: UserData.self)
    guard !state.pointee.isInParagraph else {
        return LOL_HTML_STOP
    }

    let s = lol_html_element_tag_name_get(element)
    // The tag name string must be freed once it has been inspected.
    defer { lol_html_str_free(s) }
    let tagName = String(bytesNoCopy: UnsafeMutableRawPointer(mutating: s.data), length: s.len, encoding: .utf8, freeWhenDone: false)!
    state.pointee.isInParagraph = tagName == "p" || tagName == "P"
    return LOL_HTML_CONTINUE
}

/// Text handler for the `p` selector: appends each text chunk to the paragraph being collected.
private func textHandler(chunk: OpaquePointer!, userData: UnsafeMutableRawPointer!) -> lol_html_rewriter_directive_t {
    let state = userData.assumingMemoryBound(to: UserData.self)
    guard state.pointee.isInParagraph else {
        return LOL_HTML_CONTINUE
    }

    let s = lol_html_text_chunk_content_get(chunk)
    let content = String(bytesNoCopy: UnsafeMutableRawPointer(mutating: s.data), length: s.len, encoding: .utf8, freeWhenDone: false)!
    state.pointee.paragraphText += content

    if state.pointee.paragraphText.underestimatedCount >= 1024 {
        // lol-html seems to get confused by img tags with hundreds of kilobytes of data in their src attributes
        // and returns that data as text even though it's a tag
        // if the text is over 1024 characters so far, we assume that's what's happened
        // and abandon this attempt and try again at the next paragraph
        state.pointee.paragraphText = ""
        state.pointee.isInParagraph = false
    }
    return LOL_HTML_CONTINUE
}

/// Output sink required by `lol_html_rewriter_build`; the rewritten output is not used.
private func outputSink(chunk: UnsafePointer<CChar>!, chunkLen: Int, userData: UnsafeMutableRawPointer!) {
    // no-op
}