Skip to content

Improve Deserialization Performance for Bitstream Files #236

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions Sources/TSCUtility/Bitstream.swift
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,50 @@ public struct Bitcode {
public let blockInfo: [UInt64: BlockInfo]
}

/// A non-owning view of a bitcode element.
///
/// An element is either a nested block or a single record.
public enum BitcodeElement {
/// A block element: a block ID together with the elements nested inside it.
public struct Block {
/// The ID of the block.
public var id: UInt64
/// The blocks and records contained in this block.
public var elements: [BitcodeElement]
}

/// A record element.
///
/// - Warning: A `Record` element's fields and payload are only valid for the
///   duration of the `visit` call that provides them. To persist a record,
///   always make a copy of it.
public struct Record {
/// The optional trailing data attached to a record.
public enum Payload {
/// The record carries no payload.
case none
/// A trailing array of integer values.
case array([UInt64])
/// A trailing array of char6-encoded elements, decoded into a string.
case char6String(String)
// NOTE(review): two `blob` cases with different associated types are visible
// here; presumably only one of them is meant to remain -- confirm.
/// A trailing blob of raw bytes.
case blob(Data)
case blob(ArraySlice<UInt8>)
}

/// The record's code.
public var id: UInt64
// NOTE(review): `fields` is declared twice below with different types;
// presumably only one declaration is meant to remain -- confirm.
/// The record's operand values, excluding any trailing payload.
public var fields: [UInt64]
public var fields: UnsafeBufferPointer<UInt64>
/// The trailing payload, or `.none` when the record has none.
public var payload: Payload
}

/// A nested block.
case block(Block)
/// A single record.
case record(Record)
}

extension BitcodeElement.Record.Payload: CustomStringConvertible {
  /// A short, human-readable rendering of the payload.
  ///
  /// Arrays and char6 strings are printed in full; blobs are summarized by
  /// their byte count only.
  public var description: String {
    let rendered: String
    switch self {
    case .none:
      rendered = "none"
    case let .array(values):
      rendered = "array(\(values))"
    case let .char6String(text):
      rendered = "char6String(\(text))"
    case let .blob(bytes):
      rendered = "blob(\(bytes.count) bytes)"
    }
    return rendered
  }
}

public struct BlockInfo {
public var name: String = ""
public var recordNames: [UInt64: String] = [:]
Expand Down
123 changes: 50 additions & 73 deletions Sources/TSCUtility/BitstreamReader.swift
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,8 @@ import Foundation
import TSCBasic

extension Bitcode {
/// Parse a bitstream from data.
@available(*, deprecated, message: "Use Bitcode.init(bytes:) instead")
public init(data: Data) throws {
precondition(data.count > 4)
try self.init(bytes: ByteString(data))
}

public init(bytes: ByteString) throws {
precondition(bytes.count > 4)
var reader = BitstreamReader(buffer: bytes)
let signature = try reader.readSignature()
var visitor = CollectingVisitor()
try reader.readBlock(id: BitstreamReader.fakeTopLevelBlockID,
abbrevWidth: 2,
abbrevInfo: [],
visitor: &visitor)
self.init(signature: signature,
elements: visitor.finalizeTopLevelElements(),
blockInfo: reader.blockInfo)
}

/// Traverse a bitstream using the specified `visitor`, which will receive
/// callbacks when blocks and records are encountered.
@available(*, deprecated, message: "Use Bitcode.read(bytes:using:) instead")
public static func read<Visitor: BitstreamVisitor>(stream data: Data, using visitor: inout Visitor) throws {
precondition(data.count > 4)
try Self.read(bytes: ByteString(data), using: &visitor)
}

public static func read<Visitor: BitstreamVisitor>(bytes: ByteString, using visitor: inout Visitor) throws {
precondition(bytes.count > 4)
var reader = BitstreamReader(buffer: bytes)
Expand All @@ -52,36 +25,6 @@ extension Bitcode {
}
}

/// A visitor that gathers every block and record of a stream into a tree of
/// `BitcodeElement` values.
private struct CollectingVisitor: BitstreamVisitor {
  /// The blocks currently being read, innermost last. Each entry pairs a block
  /// ID with the elements collected for that block so far. The first entry
  /// stands for the synthetic top-level block.
  var stack: [(UInt64, [BitcodeElement])] = [(BitstreamReader.fakeTopLevelBlockID, [])]

  /// Accepts any signature.
  func validate(signature: Bitcode.Signature) throws {}

  /// Always enters the block, opening a fresh element list for it.
  mutating func shouldEnterBlock(id: UInt64) throws -> Bool {
    let opened: (UInt64, [BitcodeElement]) = (id, [])
    stack.append(opened)
    return true
  }

  /// Closes the innermost block and appends it to its parent's elements.
  mutating func didExitBlock() throws {
    guard let finished = stack.popLast() else {
      fatalError("Unbalanced calls to shouldEnterBlock/didExitBlock")
    }

    let closedBlock = BitcodeElement.Block(id: finished.0, elements: finished.1)
    appendToInnermostBlock(.block(closedBlock))
  }

  /// Appends the record to the innermost open block.
  mutating func visit(record: BitcodeElement.Record) throws {
    appendToInnermostBlock(.record(record))
  }

  /// Returns the elements collected for the top-level block. Expects every
  /// entered block to have been exited already.
  func finalizeTopLevelElements() -> [BitcodeElement] {
    assert(stack.count == 1)
    let topLevel = stack[0]
    return topLevel.1
  }

  /// Adds `element` to the element list of the block on top of the stack.
  private mutating func appendToInnermostBlock(_ element: BitcodeElement) {
    let innermost = stack.endIndex - 1
    stack[innermost].1.append(element)
  }
}

private extension Bits.Cursor {
enum BitcodeError: Swift.Error {
case vbrOverflow
Expand Down Expand Up @@ -161,6 +104,7 @@ private struct BitstreamReader {
guard numOps > 0 else { throw Error.invalidAbbrev }

var operands: [Bitstream.Abbreviation.Operand] = []
operands.reserveCapacity(numOps)
for i in 0..<numOps {
operands.append(try readAbbrevOp())

Expand Down Expand Up @@ -204,15 +148,29 @@ private struct BitstreamReader {
}
}

mutating func readAbbreviatedRecord(_ abbrev: Bitstream.Abbreviation) throws -> BitcodeElement.Record {
/// Computes a non-owning view of a `BitcodeElement.Record` that is valid for
/// the lifetime of the call to `body`.
///
/// - Warning: If reading the record fails, this function throws before `body`
///   is called. An error thrown by `body` itself is propagated to the caller.
mutating func withAbbreviatedRecord(
_ abbrev: Bitstream.Abbreviation,
body: (BitcodeElement.Record) throws -> Void
) throws {
let code = try readSingleAbbreviatedRecordOperand(abbrev.operands.first!)

let lastOperand = abbrev.operands.last!
let lastRegularOperandIndex: Int = abbrev.operands.endIndex - (lastOperand.isPayload ? 1 : 0)

var fields = [UInt64]()
for op in abbrev.operands[1..<lastRegularOperandIndex] {
fields.append(try readSingleAbbreviatedRecordOperand(op))
// Safety: `lastRegularOperandIndex` is always at least 1. An abbreviation
// is required by the format to contain at least one operand. If that last
// operand is a payload (and thus we subtracted one from the total number of
// operands above), then that must mean it is either a trailing array
// or trailing blob. Both of these are preceded by their length field.
let fields = UnsafeMutableBufferPointer<UInt64>.allocate(capacity: lastRegularOperandIndex - 1)
defer { fields.deallocate() }

for (idx, op) in abbrev.operands[1..<lastRegularOperandIndex].enumerated() {
fields[idx] = try readSingleAbbreviatedRecordOperand(op)
}

let payload: BitcodeElement.Record.Payload
Expand All @@ -222,26 +180,42 @@ private struct BitstreamReader {
switch lastOperand {
case .array(let element):
let length = try cursor.readVBR(6)
var elements = [UInt64]()
for _ in 0..<length {
elements.append(try readSingleAbbreviatedRecordOperand(element))
}
if case .char6 = element {
payload = .char6String(String(String.UnicodeScalarView(elements.map { UnicodeScalar(UInt8($0)) })))
// FIXME: Once the minimum deployment target bumps to macOS 11, use
// the more ergonomic stdlib API everywhere.
if #available(macOS 11.0, *) {
payload = try .char6String(String(unsafeUninitializedCapacity: Int(length)) { buffer in
for i in 0..<Int(length) {
buffer[i] = try UInt8(readSingleAbbreviatedRecordOperand(element))
}
return Int(length)
})
} else {
let buffer = UnsafeMutableBufferPointer<UInt8>.allocate(capacity: Int(length))
defer { buffer.deallocate() }
for i in 0..<Int(length) {
buffer[i] = try UInt8(readSingleAbbreviatedRecordOperand(element))
}
payload = .char6String(String(decoding: buffer, as: UTF8.self))
}
} else {
var elements = [UInt64]()
for _ in 0..<length {
elements.append(try readSingleAbbreviatedRecordOperand(element))
}
payload = .array(elements)
}
case .blob:
let length = Int(try cursor.readVBR(6))
try cursor.advance(toBitAlignment: 32)
payload = .blob(try Data(cursor.read(bytes: length)))
payload = .blob(try cursor.read(bytes: length))
try cursor.advance(toBitAlignment: 32)
default:
fatalError()
}
}

return .init(id: code, fields: fields, payload: payload)
return try body(.init(id: code, fields: UnsafeBufferPointer(fields), payload: payload))
}

mutating func readBlockInfoBlock(abbrevWidth: Int) throws {
Expand Down Expand Up @@ -341,17 +315,20 @@ private struct BitstreamReader {
case Bitstream.AbbreviationID.unabbreviatedRecord.rawValue:
let code = try cursor.readVBR(6)
let numOps = try cursor.readVBR(6)
var operands = [UInt64]()
for _ in 0..<numOps {
operands.append(try cursor.readVBR(6))
let operands = UnsafeMutableBufferPointer<UInt64>.allocate(capacity: Int(numOps))
defer { operands.deallocate() }
for i in 0..<Int(numOps) {
operands[i] = try cursor.readVBR(6)
}
try visitor.visit(record: .init(id: code, fields: operands, payload: .none))
try visitor.visit(record: .init(id: code, fields: UnsafeBufferPointer(operands), payload: .none))

case let abbrevID:
guard Int(abbrevID) - 4 < abbrevInfo.count else {
throw Error.noSuchAbbrev(blockID: id, abbrevID: Int(abbrevID))
}
try visitor.visit(record: try readAbbreviatedRecord(abbrevInfo[Int(abbrevID) - 4]))
try withAbbreviatedRecord(abbrevInfo[Int(abbrevID) - 4]) { record in
try visitor.visit(record: record)
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions Sources/TSCUtility/BitstreamWriter.swift
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ extension BitstreamWriter {

fileprivate init() {
self.values = []
self.values.reserveCapacity(8)
}

fileprivate init<CodeType>(recordID: CodeType)
Expand Down Expand Up @@ -367,6 +368,7 @@ extension BitstreamWriter {
}

public mutating func append(_ string: String) {
self.values.reserveCapacity(self.values.capacity + string.utf8.count)
for byte in string.utf8 {
values.append(UInt32(byte))
}
Expand Down
67 changes: 46 additions & 21 deletions Sources/TSCUtility/SerializedDiagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,6 @@ public struct SerializedDiagnostics {
/// Serialized diagnostics.
public var diagnostics: [Diagnostic]

@available(*, deprecated, message: "Use SerializedDiagnostics.init(bytes:) instead")
public init(data: Data) throws {
var reader = Reader()
try Bitcode.read(stream: data, using: &reader)
guard let version = reader.versionNumber else { throw Error.noMetadataBlock }
self.versionNumber = version
self.diagnostics = reader.diagnostics
}

public init(bytes: ByteString) throws {
var reader = Reader()
try Bitcode.read(bytes: bytes, using: &reader)
Expand Down Expand Up @@ -88,7 +79,7 @@ extension SerializedDiagnostics {
/// Fix-its associated with the diagnostic.
public var fixIts: [FixIt]

fileprivate init(records: [BitcodeElement.Record],
fileprivate init(records: [SerializedDiagnostics.OwnedRecord],
filenameMap: inout [UInt64: String],
flagMap: inout [UInt64: String],
categoryMap: inout [UInt64: String]) throws {
Expand All @@ -107,7 +98,7 @@ extension SerializedDiagnostics {
case .blob(let diagnosticBlob) = record.payload
else { throw Error.malformedRecord }

text = String(data: diagnosticBlob, encoding: .utf8)
text = String(decoding: diagnosticBlob, as: UTF8.self)
level = Level(rawValue: record.fields[0])
location = SourceLocation(fields: record.fields[1...4],
filenameMap: filenameMap)
Expand All @@ -125,38 +116,38 @@ extension SerializedDiagnostics {
}
case .flag:
guard record.fields.count == 2,
case .blob(let flagBlob) = record.payload,
let flagText = String(data: flagBlob, encoding: .utf8)
case .blob(let flagBlob) = record.payload
else { throw Error.malformedRecord }

let flagText = String(decoding: flagBlob, as: UTF8.self)
let diagnosticID = record.fields[0]
flagMap[diagnosticID] = flagText

case .category:
guard record.fields.count == 2,
case .blob(let categoryBlob) = record.payload,
let categoryText = String(data: categoryBlob, encoding: .utf8)
case .blob(let categoryBlob) = record.payload
else { throw Error.malformedRecord }

let categoryText = String(decoding: categoryBlob, as: UTF8.self)
let categoryID = record.fields[0]
categoryMap[categoryID] = categoryText

case .filename:
guard record.fields.count == 4,
case .blob(let filenameBlob) = record.payload,
let filenameText = String(data: filenameBlob, encoding: .utf8)
case .blob(let filenameBlob) = record.payload
else { throw Error.malformedRecord }

let filenameText = String(decoding: filenameBlob, as: UTF8.self)
let filenameID = record.fields[0]
// record.fields[1] and record.fields[2] are no longer used.
filenameMap[filenameID] = filenameText

case .fixit:
guard record.fields.count == 9,
case .blob(let fixItBlob) = record.payload,
let fixItText = String(data: fixItBlob, encoding: .utf8)
case .blob(let fixItBlob) = record.payload
else { throw Error.malformedRecord }

let fixItText = String(decoding: fixItBlob, as: UTF8.self)
if let start = SourceLocation(fields: record.fields[0...3],
filenameMap: filenameMap),
let end = SourceLocation(fields: record.fields[4...7],
Expand Down Expand Up @@ -223,7 +214,7 @@ extension SerializedDiagnostics {
var flagMap = [UInt64: String]()
var categoryMap = [UInt64: String]()

var currentDiagnosticRecords: [BitcodeElement.Record] = []
var currentDiagnosticRecords: [OwnedRecord] = []

func validate(signature: Bitcode.Signature) throws {
guard signature == .init(string: "DIAG") else { throw Error.badMagic }
Expand Down Expand Up @@ -256,10 +247,44 @@ extension SerializedDiagnostics {
}
versionNumber = Int(record.fields[0])
case .diagnostic:
currentDiagnosticRecords.append(record)
currentDiagnosticRecords.append(SerializedDiagnostics.OwnedRecord(record))
case nil:
throw Error.unexpectedTopLevelRecord
}
}
}
}

extension SerializedDiagnostics {
  /// An owning copy of a `BitcodeElement.Record`.
  ///
  /// The fields and any blob payload of the source record are copied into
  /// arrays, so the copy can be stored independently of the record it was
  /// made from.
  struct OwnedRecord {
    /// The owning counterpart of `BitcodeElement.Record.Payload`.
    public enum Payload {
      case none
      case array([UInt64])
      case char6String(String)
      case blob([UInt8])

      /// Converts a record payload case-for-case, copying blob bytes into a
      /// fresh array.
      init(_ payload: BitcodeElement.Record.Payload) {
        switch payload {
        case .none:
          self = .none
        case let .array(values):
          // `values` is already an owning array; no further copy is needed.
          self = .array(values)
        case let .char6String(text):
          self = .char6String(text)
        case let .blob(bytes):
          self = .blob([UInt8](bytes))
        }
      }
    }

    /// The record's ID, taken from the source record.
    public var id: UInt64
    /// A copy of the source record's fields.
    public var fields: [UInt64]
    /// An owning copy of the source record's payload.
    public var payload: Payload

    /// Creates an owning copy of `record`.
    init(_ record: BitcodeElement.Record) {
      id = record.id
      fields = [UInt64](record.fields)
      payload = Payload(record.payload)
    }
  }
}
Loading