// frenzy-ios/Reader/ExcerptGenerator.swift
//
// 145 lines
// 6.1 KiB
// Swift

//
// ExcerptGenerator.swift
// Reader
//
// Created by Shadowfacts on 1/13/22.
//
import Foundation
import OSLog
import CoreData
import Persistence
// public so that it can be imported in ReaderTests even when Reader is compiled in release mode (w/ testing disabled)
/// Generates plain-text excerpts for feed items from the paragraph text of their HTML content.
///
/// The type is used purely as a namespace: its initializer is private and every member is static.
public struct ExcerptGenerator {
    private init() {}

    private static let logger = Logger(subsystem: Bundle.main.bundleIdentifier!, category: "ExcerptGenerator")

    /// Generates excerpts for every `Item` whose `generatedExcerpt` flag is still unset.
    ///
    /// Items are fetched newest-first (by `published`) on a new background context. Each item gets its
    /// `excerpt` set when one can be produced, and is always marked `generatedExcerpt = true` so it is
    /// not picked up by this fetch again. If anything changed, the context is saved and the updated
    /// objects are merged into the container's view context.
    ///
    /// - Parameters:
    ///   - fervorController: Provides the persistent container whose background and view contexts are used.
    ///   - setProgress: Invoked from inside the background context's `perform` block each time the number
    ///     of generated excerpts reaches a multiple of 50. `current` is the number of excerpts generated
    ///     so far (items that yield no excerpt are not counted); `total` is the number of fetched items.
    static func generateAll(_ fervorController: FervorController, setProgress: @escaping (_ current: Int, _ total: Int) -> Void) async {
        let req = Item.fetchRequest()
        req.predicate = NSPredicate(format: "generatedExcerpt = NO")
        req.sortDescriptors = [NSSortDescriptor(key: "published", ascending: false)]
        req.fetchBatchSize = 50
        let ctx = fervorController.persistentContainer.newBackgroundContext()
        await ctx.perform {
            let items: [Item]
            do {
                items = try ctx.fetch(req)
            } catch {
                // Report the failure instead of silently giving up, mirroring how save errors are handled below.
                logger.error("Unable to fetch items: \(String(describing: error), privacy: .public)")
                return
            }
            var count = 0
            for item in items {
                if let generated = excerpt(for: item) {
                    item.excerpt = generated
                    count += 1
                    if count % 50 == 0 {
                        logger.debug("Generated \(count, privacy: .public) excerpts")
                        setProgress(count, items.count)
                    }
                }
                // Mark the item as processed even when no excerpt could be produced.
                item.generatedExcerpt = true
            }
            logger.log("Generated excerpts for \(count, privacy: .public) items")
            guard ctx.hasChanges else {
                return
            }
            do {
                // Collect the updated objects' IDs now, because updatedObjects is empty after .save is called.
                // mergeChanges(fromRemoteContextSave:into:) is documented to take object IDs (or URIs),
                // not the managed objects themselves.
                let updatedIDs = ctx.updatedObjects.map(\.objectID)
                try ctx.save()
                // make sure the view context has the newly added excerpts
                NSManagedObjectContext.mergeChanges(fromRemoteContextSave: [
                    NSUpdatedObjectsKey: updatedIDs
                ], into: [fervorController.persistentContainer.viewContext])
            } catch {
                logger.error("Unable to save context: \(String(describing: error), privacy: .public)")
            }
        }
    }

    /// Produces an excerpt from the item's HTML `content`.
    ///
    /// - Parameter item: The item whose `content` is read.
    /// - Returns: `nil` when the item has no content or no excerpt can be extracted from it.
    public static func excerpt(for item: Item) -> String? {
        guard let content = item.content else {
            return nil
        }
        return excerpt(from: content)
    }

    /// Extracts the text of a `<p>` element from `html` using lol-html.
    ///
    /// The `p` selector is registered with `elementHandler` and `textHandler`, which share a `UserData`
    /// value through the handlers' user-data pointer. The rewriter's own output is discarded.
    ///
    /// - Parameter html: The HTML markup to scan.
    /// - Returns: The captured paragraph text, HTML-unescaped and trimmed of surrounding whitespace and
    ///   newlines, or `nil` if no paragraph was being captured when processing finished.
    public static func excerpt(from html: String) -> String? {
        // Nothing to parse means no paragraph; returning early also avoids handing lol-html an empty buffer.
        guard !html.isEmpty else {
            return nil
        }
        var html = html
        let builder = lol_html_rewriter_builder_new()!
        let pSelector = lol_html_selector_parse("p", 1)!
        var userData = UserData()
        withUnsafeMutablePointer(to: &userData) { userDataPtr in
            let rawPtr = UnsafeMutableRawPointer(userDataPtr)
            let res = lol_html_rewriter_builder_add_element_content_handlers(builder, pSelector, elementHandler, rawPtr, nil, nil, textHandler, rawPtr)
            guard res == 0 else {
                lolHtmlError()
            }
            let memSettings = lol_html_memory_settings_t(preallocated_parsing_buffer_size: 1024, max_allowed_memory_usage: .max)
            let rewriter = lol_html_rewriter_build(builder, "utf-8", 5, memSettings, outputSink, nil, true)
            lol_html_rewriter_builder_free(builder)
            lol_html_selector_free(pSelector)
            guard let rewriter = rewriter else {
                lolHtmlError()
            }
            // The rewriter owns native memory; release it on every exit path so each call does not leak one.
            defer { lol_html_rewriter_free(rewriter) }
            // The write result is deliberately ignored.
            // NOTE(review): presumably a handler returning LOL_HTML_STOP makes the write report failure — confirm.
            _ = html.withUTF8 { buffer in
                buffer.withMemoryRebound(to: CChar.self) { buffer in
                    lol_html_rewriter_write(rewriter, buffer.baseAddress!, buffer.count)
                }
            }
        }
        if userData.isInParagraph {
            return userData.paragraphText.htmlUnescape().trimmingCharacters(in: .whitespacesAndNewlines)
            // todo: steal css whitespace collapsing from tusker
        } else {
            return nil
        }
    }

    /// Takes lol-html's last error and traps with its message.
    ///
    /// Falls back to a generic message when there is no error data or it cannot be decoded as UTF-8.
    private static func lolHtmlError() -> Never {
        let lastError = lol_html_take_last_error()
        guard let bytes = lastError.data else {
            fatalError("Unknown lol-html error")
        }
        let message = String(bytesNoCopy: UnsafeMutableRawPointer(mutating: bytes), length: lastError.len, encoding: .utf8, freeWhenDone: false)
        fatalError(message ?? "Unknown lol-html error")
    }
}
/// State shared between the lol-html handlers through their user-data pointer.
private struct UserData {
    /// `true` while a paragraph is being captured. Set by `elementHandler`; reset by `textHandler`
    /// when the accumulated text grows too long.
    var isInParagraph: Bool = false
    /// Text accumulated so far for the paragraph being captured.
    var paragraphText: String = ""
}
/// lol-html element handler that marks the start of the paragraph to capture.
///
/// - Parameters:
///   - element: The element handed over by lol-html.
///   - userData: Pointer to the `UserData` shared with `textHandler`.
/// - Returns: `LOL_HTML_STOP` if a paragraph is already being captured when another element is
///   reached; otherwise `LOL_HTML_CONTINUE`, after recording whether this element's tag name is `p`/`P`.
private func elementHandler(element: OpaquePointer!, userData: UnsafeMutableRawPointer!) -> lol_html_rewriter_directive_t {
    let state = userData.assumingMemoryBound(to: UserData.self)
    guard !state.pointee.isInParagraph else {
        // A paragraph was already entered, so reaching another matching element ends the capture.
        return LOL_HTML_STOP
    }
    let nameStr = lol_html_element_tag_name_get(element)
    let tagName = String(bytesNoCopy: UnsafeMutableRawPointer(mutating: nameStr.data), length: nameStr.len, encoding: .utf8, freeWhenDone: false)!
    // Evaluate the comparison before freeing: the string does not own the bytes it was created from.
    let isParagraph = ["p", "P"].contains(tagName)
    state.pointee.isInParagraph = isParagraph
    lol_html_str_free(nameStr)
    return LOL_HTML_CONTINUE
}
/// lol-html text handler that accumulates the text of the paragraph currently being captured.
///
/// Chunks are ignored unless `UserData.isInParagraph` is set. If the accumulated text reaches 1024
/// characters, the capture is abandoned so that a later paragraph can be tried instead.
///
/// - Parameters:
///   - chunk: The text chunk handed over by lol-html.
///   - userData: Pointer to the `UserData` shared with `elementHandler`.
/// - Returns: Always `LOL_HTML_CONTINUE`.
private func textHandler(chunk: OpaquePointer!, userData: UnsafeMutableRawPointer!) -> lol_html_rewriter_directive_t {
    let userDataPtr = userData.assumingMemoryBound(to: UserData.self)
    guard userDataPtr.pointee.isInParagraph else {
        return LOL_HTML_CONTINUE
    }
    let s = lol_html_text_chunk_content_get(chunk)
    // Decode into an owning String: the text is appended to state that outlives this callback, so it
    // must not alias lol-html's buffer, and a non-failable decode cannot trap on unexpected bytes.
    let bytes = UnsafeRawBufferPointer(start: UnsafeRawPointer(s.data), count: s.len)
    let content = String(decoding: bytes, as: UTF8.self)
    userDataPtr.pointee.paragraphText += content
    if userDataPtr.pointee.paragraphText.underestimatedCount >= 1024 {
        // lol-html seems to get confused by img tags with hundreds of kilobytes of data in their src attributes
        // and returns that data as text even though it's a tag
        // if the text is over 1024 characters so far, we assume that's what's happened
        // and abandon this attempt and try again at the next paragraph
        userDataPtr.pointee.paragraphText = ""
        userDataPtr.pointee.isInParagraph = false
    }
    return LOL_HTML_CONTINUE
}
/// Output sink passed to `lol_html_rewriter_build`; it ignores every chunk it is given.
///
/// - Parameters:
///   - chunk: Output bytes (unused).
///   - chunkLen: Number of bytes in `chunk` (unused).
///   - userData: User-data pointer supplied when the rewriter was built (unused).
private func outputSink(chunk: UnsafePointer<CChar>!, chunkLen: Int, userData: UnsafeMutableRawPointer!) {
// no-op
}