Skip to content

Commit f944756

Browse files
committed
Package collections: improve search performance
This is an alternate to swiftlang#3090 but is a complete solution. Motivation: Currently to support search for package collections API we read and deserialize collection blobs from SQLite then perform string matchings on individual properties in memory (e.g., `package.summary.contains("foobar")`). This can be optimized. Modifications: Use SQLite FTS--define FTS virtual tables for packages and targets, and update implementation for `findPackage`, `searchPackages`, and `searchTargets` methods of `SQLitePackageCollectionsStorage`. Without optimization, `PackageCollectionsTests.testPackageSearchPerformance` and `testTargetsSearchPerformance` take about ~400ms to run on my local machine. With FTS, `testPackageSearchPerformance` takes ~40ms and `testTargetsSearchPerformance` ~50ms. The `testSearchTargetsPerformance` in `InMemoryPackageCollectionsSearchTests` (swiftlang#3090) yields result in ~10ms, though it queries the trie directly without going through the PackageCollections API layer. Since target search is either exact or prefix match and doesn't tokenize the query, and given the good results I saw in `InMemoryPackageCollectionsSearchTests`, this implementation includes a trie on top of SQLite FTS for target search. The trie is in-memory and loads from SQLite FTS during initialization. `put`/`remove` updates both the SQLite FTS and trie. The improvement with using trie varies--`testTargetsSearchPerformance` takes between ~15-50ms to complete. `put` now takes longer to complete because of the search index updates. Result: Better search performance.
1 parent 78adc74 commit f944756

File tree

7 files changed

+797
-86
lines changed

7 files changed

+797
-86
lines changed

Diff for: Sources/PackageCollections/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ add_library(PackageCollections
2323
Storage/PackageCollectionsSourcesStorage.swift
2424
Storage/PackageCollectionsStorage.swift
2525
Storage/SQLitePackageCollectionsStorage.swift
26+
Storage/Trie.swift
2627
API.swift
2728
PackageCollections.swift
2829
PackageCollections+Configuration.swift

Diff for: Sources/PackageCollections/Model/TargetListResult.swift

+5-1
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,15 @@ extension PackageCollectionsModel.TargetListResult {
4848

4949
extension PackageCollectionsModel.TargetListResult {
5050
/// Represents a package version
51-
public struct PackageVersion: Hashable, Encodable {
51+
public struct PackageVersion: Hashable, Encodable, Comparable {
5252
/// The version
5353
public let version: TSCUtility.Version
5454

5555
/// Package name
5656
public let packageName: String
57+
58+
public static func < (lhs: PackageVersion, rhs: PackageVersion) -> Bool {
59+
lhs.version < rhs.version
60+
}
5761
}
5862
}

Diff for: Sources/PackageCollections/Storage/PackageCollectionsStorage.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public protocol PackageCollectionsStorage {
5353
query: String,
5454
callback: @escaping (Result<PackageCollectionsModel.PackageSearchResult, Error>) -> Void)
5555

56-
/// Returns optional `PackageSearchResult.Item` for the given package identity.
56+
/// Returns `PackageSearchResult.Item` for the given package identity.
5757
///
5858
/// - Parameters:
5959
/// - identifier: The package identifier

Diff for: Sources/PackageCollections/Storage/SQLitePackageCollectionsStorage.swift

+310-84
Large diffs are not rendered by default.

Diff for: Sources/PackageCollections/Storage/Trie.swift

+253
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
/*
2+
This source file is part of the Swift.org open source project
3+
Copyright (c) 2020 Apple Inc. and the Swift project authors
4+
Licensed under Apache License v2.0 with Runtime Library Exception
5+
See http://swift.org/LICENSE.txt for license information
6+
See http://swift.org/CONTRIBUTORS.txt for Swift project authors
7+
*/
8+
9+
import TSCBasic
10+
11+
import PackageModel
12+
13+
struct Trie<Document: Hashable> {
14+
private typealias Node = TrieNode<Character, Document>
15+
16+
private let root: Node
17+
18+
init() {
19+
self.root = Node()
20+
}
21+
22+
/// Inserts a word and its document to the trie.
23+
func insert(word: String, foundIn document: Document) {
24+
guard !word.isEmpty else { return }
25+
26+
var currentNode = self.root
27+
// Check if word already exists otherwise creates the node path
28+
for character in word.lowercased() {
29+
if let child = currentNode.children[character] {
30+
currentNode = child
31+
} else {
32+
currentNode = currentNode.add(value: character)
33+
}
34+
}
35+
36+
currentNode.add(document: document)
37+
}
38+
39+
/// Removes word occurrences found in the given document.
40+
func remove(document: Document) {
41+
func removeInSubTrie(root: Node, document: Document) {
42+
if root.isTerminating {
43+
root.remove(document: document)
44+
}
45+
46+
// Clean up sub-tries
47+
root.children.values.forEach {
48+
removeInSubTrie(root: $0, document: document)
49+
}
50+
51+
root.children.forEach { value, node in
52+
// If a child node doesn't have children (i.e., there are no words under it),
53+
// and itself is not a word, delete it since its path has become a deadend.
54+
if node.isLeaf, !node.isTerminating {
55+
root.remove(value: value)
56+
}
57+
}
58+
}
59+
60+
removeInSubTrie(root: self.root, document: document)
61+
}
62+
63+
/// Removes word occurrences found in the given document.
64+
func remove(where predicate: @escaping (Document) -> Bool) {
65+
func removeInSubTrie(root: Node, where predicate: @escaping (Document) -> Bool) {
66+
if root.isTerminating {
67+
root.remove(where: predicate)
68+
}
69+
70+
// Clean up sub-tries
71+
root.children.values.forEach {
72+
removeInSubTrie(root: $0, where: predicate)
73+
}
74+
75+
root.children.forEach { value, node in
76+
// If a child node doesn't have children (i.e., there are no words under it),
77+
// and itself is not a word, delete it since its path has become a deadend.
78+
if node.isLeaf, !node.isTerminating {
79+
root.remove(value: value)
80+
}
81+
}
82+
}
83+
84+
removeInSubTrie(root: self.root, where: predicate)
85+
}
86+
87+
/// Checks if the trie contains the exact word or words with matching prefix.
88+
func contains(word: String, prefixMatch: Bool = false) -> Bool {
89+
guard let node = self.findLastNodeOf(word: word) else {
90+
return false
91+
}
92+
return prefixMatch || node.isTerminating
93+
}
94+
95+
/// Finds the word in this trie and returns its documents.
96+
func find(word: String) throws -> Set<Document> {
97+
guard let node = self.findLastNodeOf(word: word), node.isTerminating else {
98+
throw NotFoundError(word)
99+
}
100+
return node.documents
101+
}
102+
103+
/// Finds words with matching prefix in this trie and returns their documents.
104+
func findWithPrefix(_ prefix: String) throws -> [String: Set<Document>] {
105+
guard let node = self.findLastNodeOf(word: prefix) else {
106+
throw NotFoundError(prefix)
107+
}
108+
109+
func wordsInSubTrie(root: Node, prefix: String) -> [String: Set<Document>] {
110+
precondition(root.value != nil, "Sub-trie root's value should not be nil")
111+
112+
var subTrieWords = [String: Set<Document>]()
113+
114+
// Construct the new prefix by adding the sub-trie root's character
115+
var previousCharacters = prefix
116+
previousCharacters.append(root.value!.lowercased()) // !-safe; see precondition
117+
118+
// The root actually forms a word
119+
if root.isTerminating {
120+
subTrieWords[previousCharacters] = root.documents
121+
}
122+
123+
// Collect all words under this sub-trie
124+
root.children.values.forEach {
125+
let childWords = wordsInSubTrie(root: $0, prefix: previousCharacters)
126+
subTrieWords.merge(childWords, uniquingKeysWith: { _, child in child })
127+
}
128+
129+
return subTrieWords
130+
}
131+
132+
var words = [String: Set<Document>]()
133+
134+
let prefix = prefix.lowercased()
135+
// The prefix is actually a word
136+
if node.isTerminating {
137+
words[prefix] = node.documents
138+
}
139+
140+
node.children.values.forEach {
141+
let childWords = wordsInSubTrie(root: $0, prefix: prefix)
142+
words.merge(childWords, uniquingKeysWith: { _, child in child })
143+
}
144+
145+
return words
146+
}
147+
148+
/// Finds the last node in the path of the given word if it exists in this trie.
149+
private func findLastNodeOf(word: String) -> Node? {
150+
guard !word.isEmpty else { return nil }
151+
152+
var currentNode = self.root
153+
// Traverse down the trie as far as we can
154+
for character in word.lowercased() {
155+
guard let child = currentNode.children[character] else {
156+
return nil
157+
}
158+
currentNode = child
159+
}
160+
return currentNode
161+
}
162+
}
163+
164+
private final class TrieNode<T: Hashable, Document: Hashable> {
165+
/// The value (i.e., character) that this node stores. `nil` if root.
166+
let value: T?
167+
168+
/// The parent of this node. `nil` if root.
169+
private weak var parent: TrieNode<T, Document>?
170+
171+
/// The children of this node identified by their corresponding value.
172+
private var _children = [T: TrieNode<T, Document>]()
173+
private let childrenLock = Lock()
174+
175+
/// If the path to this node forms a valid word, these are the documents where the word can be found.
176+
private var _documents = Set<Document>()
177+
private let documentsLock = Lock()
178+
179+
var isLeaf: Bool {
180+
self.childrenLock.withLock {
181+
self._children.isEmpty
182+
}
183+
}
184+
185+
/// `true` indicates the path to this node forms a valid word.
186+
var isTerminating: Bool {
187+
self.documentsLock.withLock {
188+
!self._documents.isEmpty
189+
}
190+
}
191+
192+
var children: [T: TrieNode<T, Document>] {
193+
self.childrenLock.withLock {
194+
self._children
195+
}
196+
}
197+
198+
var documents: Set<Document> {
199+
self.documentsLock.withLock {
200+
self._documents
201+
}
202+
}
203+
204+
init(value: T? = nil, parent: TrieNode<T, Document>? = nil) {
205+
self.value = value
206+
self.parent = parent
207+
}
208+
209+
/// Adds a subpath under this node.
210+
func add(value: T) -> TrieNode<T, Document> {
211+
self.childrenLock.withLock {
212+
if let existing = self._children[value] {
213+
return existing
214+
}
215+
216+
let child = TrieNode<T, Document>(value: value, parent: self)
217+
self._children[value] = child
218+
return child
219+
}
220+
}
221+
222+
/// Removes a subpath from this node.
223+
func remove(value: T) {
224+
_ = self.childrenLock.withLock {
225+
self._children.removeValue(forKey: value)
226+
}
227+
}
228+
229+
/// Adds a document in which the word formed by path leading to this node can be found.
230+
func add(document: Document) {
231+
_ = self.documentsLock.withLock {
232+
self._documents.insert(document)
233+
}
234+
}
235+
236+
/// Removes a referenced document.
237+
func remove(document: Document) {
238+
_ = self.documentsLock.withLock {
239+
self._documents.remove(document)
240+
}
241+
}
242+
243+
/// Removes documents that satisfy the given predicate.
244+
func remove(where predicate: @escaping (Document) -> Bool) {
245+
self.documentsLock.withLock {
246+
for document in self._documents {
247+
if predicate(document) {
248+
self._documents.remove(document)
249+
}
250+
}
251+
}
252+
}
253+
}

Diff for: Tests/PackageCollectionsTests/PackageCollectionsStorageTests.swift

+33
Original file line numberDiff line numberDiff line change
@@ -196,4 +196,37 @@ class PackageCollectionsStorageTests: XCTestCase {
196196
_ = try tsc_await { callback in storage.put(collection: mockCollections.last!, callback: callback) }
197197
XCTAssertEqual(list.count, mockCollections.count)
198198
}
199+
200+
func testPopulateTargetTrie() throws {
201+
try testWithTemporaryDirectory { tmpPath in
202+
let path = tmpPath.appending(component: "test.db")
203+
let storage = SQLitePackageCollectionsStorage(path: path)
204+
defer { XCTAssertNoThrow(try storage.close()) }
205+
206+
let mockCollections = makeMockCollections(count: 3)
207+
try mockCollections.forEach { collection in
208+
_ = try tsc_await { callback in storage.put(collection: collection, callback: callback) }
209+
}
210+
211+
let targetName = mockCollections.last!.packages.last!.versions.last!.targets.last!.name
212+
213+
do {
214+
let searchResult = try tsc_await { callback in storage.searchTargets(query: targetName, type: .exactMatch, callback: callback) }
215+
XCTAssert(searchResult.items.count > 0, "should get results")
216+
}
217+
218+
// Create another instance, which should read existing data and populate target trie with it.
219+
// Since we are not calling `storage2.put`, there is no other way for target trie to get populated.
220+
let storage2 = SQLitePackageCollectionsStorage(path: path)
221+
defer { XCTAssertNoThrow(try storage2.close()) }
222+
223+
// populateTargetTrie is called in `.init`; call it again explicitly so we know when it's finished
224+
try tsc_await { callback in storage2.populateTargetTrie(callback: callback) }
225+
226+
do {
227+
let searchResult = try tsc_await { callback in storage2.searchTargets(query: targetName, type: .exactMatch, callback: callback) }
228+
XCTAssert(searchResult.items.count > 0, "should get results")
229+
}
230+
}
231+
}
199232
}

0 commit comments

Comments
 (0)