MastoSearch/MastoSearch/ImportController.swift

113 lines
4.2 KiB
Swift

//
// ImportController.swift
// MastoSearch
//
// Created by Shadowfacts on 12/10/21.
//
import Foundation
import TabularData
import Accelerate
import OSLog
/*
Imports statuses from Pleroma CSV dumps generated with the following psql command:
\copy (select a.id, a.data as activity_data, o.data as object_data from activities as a left join objects as o on o.data->>'id' = a.data->>'object' where a.data->>'actor'='https://social.shadowfacts.net/users/shadowfacts' and a.data->>'type'='Create' and (o.data->>'type'='Note' or a.data->'object'->>'type'='Note')) to '/home/pleroma/shadowfacts.csv' csv header;
*/
/// Imports statuses from a CSV dump of Pleroma activities into the database.
///
/// Each CSV row must provide the `id`, `activity_data` and `object_data` columns.
/// Rows that cannot be decoded are logged and skipped instead of aborting the import.
class ImportController {

    /// Shared instance. The initializer is private, so no other instance is created outside this file.
    static let shared = ImportController()

    private let logger = Logger(subsystem: Bundle.main.bundleIdentifier!, category: "ImportController")

    /// Parses the `published` field of an activity. Pinned to UTC and the POSIX locale so that
    /// parsing does not depend on the user's regional settings.
    private let dateFormatter: DateFormatter = {
        let f = DateFormatter()
        f.dateFormat = "yyyy-MM-dd'T'HH:mm:ss.SZ"
        f.timeZone = TimeZone(abbreviation: "UTC")
        f.locale = Locale(identifier: "en_US_POSIX")
        return f
    }()

    private init() {}

    /// Reads the CSV file at `url` and hands the decoded statuses to `DatabaseController.shared`.
    ///
    /// If the file cannot be read as a CSV with the expected columns, the error is logged and
    /// nothing is imported. Individual rows that fail to decode are logged and skipped.
    ///
    /// - Parameter url: Location of the CSV dump.
    func importCSV(url: URL) {
        var opts = CSVReadingOptions()
        opts.usesQuoting = true
        opts.addDateParseStrategy(Date.ISO8601FormatStyle(includingFractionalSeconds: true))

        let dataFrame: DataFrame
        do {
            dataFrame = try DataFrame(contentsOfCSVFile: url, columns: ["id", "activity_data", "object_data"], types: [
                "id": .string,
                "activity_data": .data,
                "object_data": .data,
            ], options: opts)
        } catch {
            logger.error("Failed to read CSV: \(String(describing: error), privacy: .public)")
            return
        }

        // Explicit `self.` is used inside the closure so it is valid whether or not the
        // selected `compactMap` overload takes an escaping closure.
        let statuses = dataFrame.rows.lazy.enumerated().compactMap { (index, row) -> Status? in
            if index % 100 == 0 {
                self.logger.debug("Parsing row \(index, privacy: .public)")
            }
            return self.makeStatus(from: row, index: index)
        }
        DatabaseController.shared.addStatuses(statuses)
    }

    /// Builds a `Status` from one CSV row, or returns `nil` (after logging the reason) when the
    /// row is missing a required field or contains data that cannot be decoded.
    private func makeStatus(from row: DataFrame.Row, index: Int) -> Status? {
        guard let uuidString = row["id"] as? String,
              let uuid = UUID(uuidString: uuidString) else {
            logSkippedRow(index, reason: "missing or invalid id")
            return nil
        }
        guard let activityData = row["activity_data"] as? Data,
              let activity = jsonDictionary(from: activityData) else {
            logSkippedRow(index, reason: "missing or invalid activity_data")
            return nil
        }

        // Prefer the joined object row; fall back to the object embedded in the activity
        // when the `object_data` column holds no data.
        let object: [String: Any]
        if let objectData = row["object_data"] as? Data {
            guard let decoded = jsonDictionary(from: objectData) else {
                logSkippedRow(index, reason: "invalid object_data")
                return nil
            }
            object = decoded
        } else {
            guard let embedded = activity["object"] as? [String: Any] else {
                logSkippedRow(index, reason: "no object_data and no embedded object")
                return nil
            }
            object = embedded
        }

        guard let url = activity["id"] as? String else {
            logSkippedRow(index, reason: "activity has no id")
            return nil
        }
        guard let content = object["content"] as? String else {
            logSkippedRow(index, reason: "object has no content")
            return nil
        }
        guard let publishedString = activity["published"] as? String,
              let published = dateFormatter.date(from: publishedString) else {
            logSkippedRow(index, reason: "missing or unparseable published date")
            return nil
        }

        // A summary consisting only of whitespace is treated as absent.
        var summary = object["summary"] as? String
        if let s = summary, s.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
            summary = nil
        }

        return Status(id: flakeIdStr(for: uuid), url: url, summary: summary, content: content, published: published)
    }

    /// Decodes `data` as JSON and returns it if the top-level value is a dictionary.
    private func jsonDictionary(from data: Data) -> [String: Any]? {
        return (try? JSONSerialization.jsonObject(with: data, options: [])) as? [String: Any]
    }

    /// Records why the row at `index` was left out of the import.
    private func logSkippedRow(_ index: Int, reason: String) {
        logger.error("Skipping row \(index, privacy: .public): \(reason, privacy: .public)")
    }

    // https://git.pleroma.social/pleroma/elixir-libraries/flake_id/-/blob/master/lib/flake_id/ecto/compat_type.ex
    /// Converts a UUID string into the ID string used for the status.
    ///
    /// The 16 UUID bytes are read as one big-endian 128-bit number. If its upper 64 bits are all
    /// zero the number's `description` is returned; otherwise it is base-62 encoded.
    ///
    /// - Parameter uuidStr: A UUID in its standard textual form.
    /// - Returns: The ID string.
    /// - Precondition: `uuidStr` must be a valid UUID string; anything else is a programmer error
    ///   and traps.
    func uuidToFlakeIdStr(_ uuidStr: String) -> String {
        guard let uuid = UUID(uuidString: uuidStr) else {
            preconditionFailure("uuidToFlakeIdStr called with an invalid UUID string: \(uuidStr)")
        }
        return flakeIdStr(for: uuid)
    }

    /// Converts an already-parsed UUID into the ID string (see `uuidToFlakeIdStr(_:)`).
    private func flakeIdStr(for uuid: UUID) -> String {
        let bytes = withUnsafeBytes(of: uuid.uuid) { Array($0) }
        // Fold the bytes most-significant first instead of rebinding the buffer to UInt64,
        // which avoids any reliance on alignment or host byte order.
        let upper = bytes.prefix(8).reduce(UInt64(0)) { ($0 << 8) | UInt64($1) }
        let lower = bytes.suffix(8).reduce(UInt64(0)) { ($0 << 8) | UInt64($1) }
        let num = UInt128(upperBits: upper, lowerBits: lower)
        if num.leadingZeroBitCount >= 64 {
            return num.description
        } else {
            return encodeBase62(num)
        }
    }

    private let base62Alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

    /// Encodes `value` in base 62 using the digits `0-9`, `A-Z`, `a-z` (in that order),
    /// most significant digit first.
    ///
    /// - Parameter value: The number to encode.
    /// - Returns: The base-62 representation; zero is encoded as `"0"`.
    func encodeBase62(_ value: UInt128) -> String {
        // Materialize the alphabet once so each digit lookup is a constant-time array access.
        let digits = Array(base62Alphabet)
        guard value != .zero else {
            return String(digits[0])
        }
        let radix = UInt128(digits.count)
        // Digits are produced least-significant first and reversed once at the end.
        var reversedDigits: [Character] = []
        var cur = value
        while cur != .zero {
            let (q, r) = cur.quotientAndRemainder(dividingBy: radix)
            reversedDigits.append(digits[Int(r)])
            cur = q
        }
        return String(reversedDigits.reversed())
    }
}