2022-01-14 16:13:19 +00:00
|
|
|
//
|
|
|
|
// ExcerptGenerator.swift
|
|
|
|
// Reader
|
|
|
|
//
|
|
|
|
// Created by Shadowfacts on 1/13/22.
|
|
|
|
//
|
|
|
|
|
|
|
|
import Foundation
|
|
|
|
import OSLog
|
|
|
|
import CoreData
|
|
|
|
|
|
|
|
// public so that it can be imported in ReaderTests even when Reader is compiled in release mode (w/ testing disabled)
|
|
|
|
/// Produces short plain-text excerpts for items from the first usable `<p>` element of their HTML content.
///
/// The type only groups static functions; its initializer is private so it cannot be instantiated.
public struct ExcerptGenerator {
    private init() {}

    private static let logger = Logger(subsystem: Bundle.main.bundleIdentifier!, category: "ExcerptGenerator")

    /// Generates excerpts for all items whose `generatedExcerpt` flag is still false.
    ///
    /// The work is performed in a background task of the controller's persistent container.
    /// Every fetched item is marked as `generatedExcerpt = true`, whether or not an excerpt
    /// could be produced, so it is not fetched again by this predicate. After a successful save,
    /// the updated objects are merged into the view context.
    ///
    /// - Parameter fervorController: The controller whose persistent container is used.
    static func generateAll(_ fervorController: FervorController) {
        let req = Item.fetchRequest()
        req.predicate = NSPredicate(format: "generatedExcerpt = NO")
        req.sortDescriptors = [NSSortDescriptor(key: "published", ascending: false)]
        req.fetchBatchSize = 50
        fervorController.persistentContainer.performBackgroundTask { ctx in
            let items: [Item]
            do {
                items = try ctx.fetch(req)
            } catch {
                // Previously swallowed with `try?`; log it so a failing fetch is visible.
                logger.error("Unable to fetch items: \(error.localizedDescription, privacy: .public)")
                return
            }

            var count = 0
            for item in items {
                if let excerpt = excerpt(for: item) {
                    item.excerpt = excerpt
                    count += 1
                    if count % 50 == 0 {
                        logger.debug("Generated \(count, privacy: .public) excerpts")
                    }
                }
                item.generatedExcerpt = true
            }
            logger.log("Generated excerpts for \(count, privacy: .public) items")

            guard ctx.hasChanges else { return }
            do {
                // get the updated object IDs now, because updatedObjects is empty after .save is called
                // mergeChanges(fromRemoteContextSave:into:) expects object IDs (or URI representations),
                // not the managed objects themselves
                let updatedIDs = ctx.updatedObjects.map(\.objectID)
                try ctx.save()

                // make sure the view context has the newly added excerpts
                NSManagedObjectContext.mergeChanges(fromRemoteContextSave: [
                    NSUpdatedObjectsKey: updatedIDs
                ], into: [fervorController.persistentContainer.viewContext])
            } catch {
                logger.error("Unable to save context: \(error.localizedDescription, privacy: .public)")
            }
        }
    }

    /// Generates an excerpt from the item's `content`.
    ///
    /// - Parameter item: The item to generate an excerpt for.
    /// - Returns: The excerpt, or `nil` if the item has no content or no excerpt could be extracted.
    public static func excerpt(for item: Item) -> String? {
        guard let content = item.content else {
            return nil
        }
        return excerpt(from: content)
    }

    /// Extracts the text of a `<p>` element from the given HTML.
    ///
    /// The HTML is streamed through a lol-html rewriter whose handlers collect the text of a `p`
    /// element into a `UserData` value. The collected text is HTML-unescaped and trimmed of
    /// leading/trailing whitespace and newlines.
    ///
    /// - Parameter html: The HTML source to extract from.
    /// - Returns: The extracted text, or `nil` if no paragraph was being collected when processing ended.
    public static func excerpt(from html: String) -> String? {
        // Nothing to parse; also guarantees the UTF-8 buffer below has a non-nil base address.
        guard !html.isEmpty else {
            return nil
        }
        // withUTF8 is mutating, so a local copy is required
        var html = html

        let builder = lol_html_rewriter_builder_new()!
        let pSelector = lol_html_selector_parse("p", 1)!
        var userData = UserData()
        withUnsafeMutablePointer(to: &userData) { userDataPtr in
            let rawPtr = UnsafeMutableRawPointer(userDataPtr)
            let res = lol_html_rewriter_builder_add_element_content_handlers(builder, pSelector, elementHandler, rawPtr, nil, nil, textHandler, rawPtr)
            guard res == 0 else {
                lolHtmlError()
            }
            let memSettings = lol_html_memory_settings_t(preallocated_parsing_buffer_size: 1024, max_allowed_memory_usage: .max)
            let rewriter = lol_html_rewriter_build(builder, "utf-8", 5, memSettings, outputSink, nil, true)
            lol_html_rewriter_builder_free(builder)
            lol_html_selector_free(pSelector)

            guard let rewriter = rewriter else {
                lolHtmlError()
            }
            // The rewriter was previously never released, leaking it on every call.
            // Free it while the user data pointer handed to the handlers is still valid.
            defer { lol_html_rewriter_free(rewriter) }

            // The result is deliberately ignored: the element handler returns LOL_HTML_STOP
            // to end processing early, which presumably surfaces here as a failed write.
            _ = html.withUTF8 { buffer in
                buffer.withMemoryRebound(to: CChar.self) { buffer in
                    lol_html_rewriter_write(rewriter, buffer.baseAddress!, buffer.count)
                }
            }
        }

        guard userData.isInParagraph else {
            return nil
        }
        // todo: steal css whitespace collapsing from tusker
        return userData.paragraphText.htmlUnescape().trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Aborts with the most recent lol-html error message.
    ///
    /// Falls back to a generic message when lol-html reports no error data or the
    /// message cannot be decoded as UTF-8.
    private static func lolHtmlError() -> Never {
        let lastError = lol_html_take_last_error()
        guard let data = lastError.data else {
            // No pending error: avoid crashing on a nil unwrap with a misleading message.
            fatalError("Unknown lol-html error")
        }
        let message = String(bytesNoCopy: UnsafeMutableRawPointer(mutating: data), length: lastError.len, encoding: .utf8, freeWhenDone: false)
        fatalError(message ?? "Unknown lol-html error")
    }
}
|
|
|
|
|
|
|
|
/// Mutable state shared with the lol-html handlers through their raw user-data pointer.
private struct UserData {
    /// Whether text chunks should currently be appended to `paragraphText`.
    var isInParagraph: Bool = false
    /// The text accumulated so far.
    var paragraphText: String = ""
}
|
|
|
|
|
|
|
|
/// Element callback for the lol-html rewriter.
///
/// If a paragraph is already being collected, processing is stopped. Otherwise the
/// element's tag name is read, and collection starts if the tag is `p` (or `P`).
///
/// - Parameters:
///   - element: The lol-html element handle.
///   - userData: Raw pointer to the `UserData` shared with the text handler.
/// - Returns: `LOL_HTML_STOP` when a paragraph was already in progress, `LOL_HTML_CONTINUE` otherwise.
private func elementHandler(element: OpaquePointer!, userData: UnsafeMutableRawPointer!) -> lol_html_rewriter_directive_t {
    let state = userData.assumingMemoryBound(to: UserData.self)

    // A paragraph has already been entered, so reaching another element means we are done.
    guard !state.pointee.isInParagraph else {
        return LOL_HTML_STOP
    }

    let rawName = lol_html_element_tag_name_get(element)
    let name = String(bytesNoCopy: UnsafeMutableRawPointer(mutating: rawName.data), length: rawName.len, encoding: .utf8, freeWhenDone: false)!
    let isParagraph = name == "p" || name == "P"
    state.pointee.isInParagraph = isParagraph
    // The tag name string is released explicitly once it has been inspected.
    lol_html_str_free(rawName)
    return LOL_HTML_CONTINUE
}
|
|
|
|
|
|
|
|
/// Text-chunk callback for the lol-html rewriter.
///
/// While a paragraph is being collected, each chunk's content is appended to the shared
/// `UserData`. If the accumulated text reaches 1024 characters, the attempt is abandoned
/// and the state is reset so that a later paragraph can be tried instead.
///
/// - Parameters:
///   - chunk: The lol-html text chunk handle.
///   - userData: Raw pointer to the `UserData` shared with the element handler.
/// - Returns: Always `LOL_HTML_CONTINUE`.
private func textHandler(chunk: OpaquePointer!, userData: UnsafeMutableRawPointer!) -> lol_html_rewriter_directive_t {
    let state = userData.assumingMemoryBound(to: UserData.self)

    // Text outside of the paragraph being collected is ignored.
    guard state.pointee.isInParagraph else {
        return LOL_HTML_CONTINUE
    }

    let rawContent = lol_html_text_chunk_content_get(chunk)
    let text = String(bytesNoCopy: UnsafeMutableRawPointer(mutating: rawContent.data), length: rawContent.len, encoding: .utf8, freeWhenDone: false)!
    state.pointee.paragraphText += text

    let collectedLength = state.pointee.paragraphText.underestimatedCount
    if collectedLength >= 1024 {
        // lol-html seems to get confused by img tags with hundreds of kilobytes of data in their src attributes
        // and returns that data as text even though it's a tag
        // if the text is over 1024 characters so far, we assume that's what's happened
        // and abandon this attempt and try again at the next paragraph
        state.pointee.isInParagraph = false
        state.pointee.paragraphText = ""
    }

    return LOL_HTML_CONTINUE
}
|
|
|
|
|
|
|
|
/// Output callback handed to the lol-html rewriter.
///
/// The rewritten output is not needed, so every chunk is discarded.
///
/// - Parameters:
///   - chunk: Pointer to the output bytes (unused).
///   - chunkLen: Number of bytes in `chunk` (unused).
///   - userData: User data pointer (unused).
private func outputSink(chunk: UnsafePointer<CChar>!, chunkLen: Int, userData: UnsafeMutableRawPointer!) {
    // intentionally empty: the output is dropped
    return
}
|