Skip to content

Commit c23a8f9

Browse files
committed
Package collections: trie for search
Motivation: Currently, to support search for the package collections API, we read and deserialize collection blobs from SQLite and then perform string matching on individual properties in memory (e.g., `package.summary.contains("foobar")`). This can be optimized. Modifications: Introduce `InMemoryPackageCollectionsSearch`, which uses `Trie`s as its underlying search indexes. `InMemoryPackageCollectionsSearch` has the same `findPackage`, `searchPackages`, and `searchTargets` method signatures as `SQLitePackageCollectionsStorage`. This PR only intends to show `InMemoryPackageCollectionsSearch` working standalone; the next step is to weave `InMemoryPackageCollectionsSearch` into the existing business logic. This might mean using `InMemoryPackageCollectionsSearch` instead of `SQLitePackageCollectionsStorage` for search, or combining the two. We will also need to be able to de/serialize the indexes (tries) to avoid rebuilding them each time and to reduce "warm up" time. Result: A search index for the package collections API that, based on performance tests, yields better timings than the current search strategy.
1 parent 2941fb2 commit c23a8f9

9 files changed

+1223
-3
lines changed

Diff for: Sources/PackageCollections/CMakeLists.txt

+3
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ add_library(PackageCollections
1919
Providers/JSONPackageCollectionProvider.swift
2020
Providers/PackageCollectionProvider.swift
2121
Providers/PackageMetadataProvider.swift
22+
Search/InMemoryPackageCollectionsSearch.swift
23+
Search/PackageCollectionsSearch.swift
24+
Search/Trie.swift
2225
Storage/FilePackageCollectionsSourcesStorage.swift
2326
Storage/PackageCollectionsSourcesStorage.swift
2427
Storage/PackageCollectionsStorage.swift

Diff for: Sources/PackageCollections/Model/TargetListResult.swift

+5-1
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,15 @@ extension PackageCollectionsModel.TargetListResult {
4848

4949
extension PackageCollectionsModel.TargetListResult {
5050
/// Represents a package version
51-
public struct PackageVersion: Hashable {
51+
public struct PackageVersion: Hashable, Comparable {
    /// The version
    public let version: TSCUtility.Version

    /// Package name
    public let packageName: String

    /// Orders package versions by `version`, falling back to `packageName` when
    /// the versions are equal.
    ///
    /// The synthesized `==` compares both `version` and `packageName`. Ordering on
    /// `version` alone would leave two values with the same version but different
    /// package names neither equal nor ordered, which breaks the strict total order
    /// that `Comparable` requires. The tie-break keeps `<` consistent with `==`.
    public static func < (lhs: PackageVersion, rhs: PackageVersion) -> Bool {
        if lhs.version != rhs.version {
            return lhs.version < rhs.version
        }
        return lhs.packageName < rhs.packageName
    }
}
5862
}

Diff for: Sources/PackageCollections/PackageCollections.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ public struct PackageCollections: PackageCollectionsProtocol {
105105
sources.forEach { source in
106106
self.refreshCollectionFromSource(source: source) { refreshResult in
107107
lock.withLock { refreshResults.append(refreshResult) }
108-
if refreshResults.count == (lock.withLock { sources.count }) {
108+
if sources.count == (lock.withLock { refreshResults.count }) {
109109
let errors = refreshResults.compactMap { $0.failure }
110110
callback(errors.isEmpty ? .success(sources) : .failure(MultipleErrors(errors)))
111111
}

Diff for: Sources/PackageCollections/Search/InMemoryPackageCollectionsSearch.swift

+361
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
This source file is part of the Swift.org open source project
3+
4+
Copyright (c) 2020 Apple Inc. and the Swift project authors
5+
Licensed under Apache License v2.0 with Runtime Library Exception
6+
7+
See http://swift.org/LICENSE.txt for license information
8+
See http://swift.org/CONTRIBUTORS.txt for Swift project authors
9+
*/
10+
11+
import PackageModel
12+
13+
protocol PackageCollectionsSearch {
    /// Indexes a `PackageCollectionsModel.Collection` so that it can be searched.
    ///
    /// - Parameters:
    ///   - collection: The `PackageCollectionsModel.Collection` to add to the search index
    ///   - callback: The closure invoked with the outcome of the operation
    func index(
        collection: Model.Collection,
        callback: @escaping (Result<Void, Error>) -> Void
    )

    /// Drops a `PackageCollectionsModel.Collection` from the search index.
    ///
    /// - Parameters:
    ///   - identifier: Identifies the `PackageCollectionsModel.Collection` to remove
    ///   - callback: The closure invoked with the outcome of the operation
    func remove(
        identifier: Model.CollectionIdentifier,
        callback: @escaping (Result<Void, Error>) -> Void
    )

    /// Looks up the `PackageSearchResult.Item` for a package identity.
    ///
    /// - Parameters:
    ///   - identifier: The identity of the package to look up
    ///   - collectionIdentifiers: Optional. Restricts the lookup to these `PackageCollectionsModel.Collection`s.
    ///   - callback: The closure invoked with the item or an error
    func findPackage(
        identifier: PackageIdentity,
        collectionIdentifiers: [Model.CollectionIdentifier]?,
        callback: @escaping (Result<Model.PackageSearchResult.Item, Error>) -> Void
    )

    /// Produces a `PackageSearchResult` for a search query.
    ///
    /// - Parameters:
    ///   - identifiers: Optional. Restricts the search to these `PackageCollectionsModel.Collection`s.
    ///   - query: The search query expression
    ///   - callback: The closure invoked with the search result or an error
    func searchPackages(
        identifiers: [Model.CollectionIdentifier]?,
        query: String,
        callback: @escaping (Result<Model.PackageSearchResult, Error>) -> Void
    )

    /// Produces a `TargetSearchResult` for a search query.
    ///
    /// - Parameters:
    ///   - identifiers: Optional. Restricts the search to these `PackageCollectionsModel.Collection`s.
    ///   - query: The search query expression
    ///   - type: The search type
    ///   - callback: The closure invoked with the search result or an error
    func searchTargets(
        identifiers: [Model.CollectionIdentifier]?,
        query: String,
        type: Model.TargetSearchType,
        callback: @escaping (Result<Model.TargetSearchResult, Error>) -> Void
    )
}

Diff for: Sources/PackageCollections/Search/Trie.swift

+220
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
/*
2+
This source file is part of the Swift.org open source project
3+
4+
Copyright (c) 2020 Apple Inc. and the Swift project authors
5+
Licensed under Apache License v2.0 with Runtime Library Exception
6+
7+
See http://swift.org/LICENSE.txt for license information
8+
See http://swift.org/CONTRIBUTORS.txt for Swift project authors
9+
*/
10+
11+
import TSCBasic
12+
13+
import PackageModel
14+
15+
/// A prefix tree mapping words to the documents in which they were found.
///
/// Words are lowercased both when inserted and when looked up, so matching is
/// case-insensitive.
final class Trie<Document: Hashable> {
    private typealias Node = TrieNode<Character, Document>

    private let root: Node

    init() {
        self.root = Node()
    }

    /// Inserts a word and its document to the trie.
    ///
    /// Empty words are ignored.
    ///
    /// - Parameters:
    ///   - word: The word to index
    ///   - document: The document in which `word` was found
    func insert(word: String, foundIn document: Document) {
        guard !word.isEmpty else { return }

        var currentNode = self.root
        for character in word.lowercased() {
            // `add(value:)` returns the existing child when there is one, so a separate
            // `children[...]` lookup beforehand is redundant and would split the
            // lookup and the insertion across two lock acquisitions.
            currentNode = currentNode.add(value: character)
        }

        currentNode.add(document: document)
    }

    /// Removes word occurrences found in the given document.
    ///
    /// Paths that no longer lead to any word are pruned.
    func remove(document: Document) {
        func removeInSubTrie(root: Node, document: Document) {
            // Removing a document the node does not reference is a no-op,
            // so no `isTerminating` check is needed first.
            root.remove(document: document)

            // Work on a single snapshot of the children; pruning mutates the node.
            for (value, child) in root.children {
                // Clean up the sub-trie first so `isLeaf` reflects the pruned state.
                removeInSubTrie(root: child, document: document)

                // If a child node doesn't have children (i.e., there are no words under it),
                // and itself is not a word, delete it since its path has become a dead end.
                if child.isLeaf, !child.isTerminating {
                    root.remove(value: value)
                }
            }
        }

        removeInSubTrie(root: self.root, document: document)
    }

    /// Checks if the trie contains the exact word or words with matching prefix.
    ///
    /// - Parameters:
    ///   - word: The word (or prefix) to look for; an empty string never matches
    ///   - prefixMatch: When `true`, a path for `word` is enough; otherwise `word`
    ///     itself must have at least one document
    func contains(word: String, prefixMatch: Bool = false) -> Bool {
        guard let node = self.findLastNodeOf(word: word) else {
            return false
        }
        return prefixMatch || node.isTerminating
    }

    /// Finds the word in this trie and returns its documents.
    ///
    /// - Throws: `NotFoundError` if `word` is not in the trie.
    func find(word: String) throws -> Set<Document> {
        guard let node = self.findLastNodeOf(word: word) else {
            throw NotFoundError(word)
        }
        // Take one snapshot and use it for both the emptiness check and the result.
        // Reading `isTerminating` and `documents` separately acquires the lock twice,
        // so the set could change between the check and the read.
        let documents = node.documents
        guard !documents.isEmpty else {
            throw NotFoundError(word)
        }
        return documents
    }

    /// Finds words with matching prefix in this trie and returns their documents.
    ///
    /// - Returns: A dictionary keyed by the (lowercased) matching words.
    /// - Throws: `NotFoundError` if there is no path for `prefix` in the trie.
    func findWithPrefix(_ prefix: String) throws -> [String: Set<Document>] {
        guard let node = self.findLastNodeOf(word: prefix) else {
            throw NotFoundError(prefix)
        }

        var words = [String: Set<Document>]()

        // Walks the sub-trie rooted at `subTrieRoot`, recording every word into `words`.
        // Each path spells a distinct word, so a single accumulator suffices and
        // per-level dictionary merges are unnecessary.
        func collectWords(under subTrieRoot: Node, prefix: String) {
            guard let value = subTrieRoot.value else {
                preconditionFailure("Sub-trie root's value should not be nil")
            }

            // Construct the new prefix by adding the sub-trie root's character
            let word = prefix + value.lowercased()

            // The sub-trie root actually forms a word (single snapshot, see `find`)
            let documents = subTrieRoot.documents
            if !documents.isEmpty {
                words[word] = documents
            }

            for child in subTrieRoot.children.values {
                collectWords(under: child, prefix: word)
            }
        }

        let prefix = prefix.lowercased()
        // The prefix is actually a word
        let prefixDocuments = node.documents
        if !prefixDocuments.isEmpty {
            words[prefix] = prefixDocuments
        }

        for child in node.children.values {
            collectWords(under: child, prefix: prefix)
        }

        return words
    }

    /// Finds the last node in the path of the given word if it exists in this trie.
    ///
    /// - Returns: `nil` if `word` is empty or the trie has no complete path for it.
    private func findLastNodeOf(word: String) -> Node? {
        guard !word.isEmpty else { return nil }

        var currentNode = self.root
        // Traverse down the trie as far as we can
        for character in word.lowercased() {
            guard let child = currentNode.children[character] else {
                return nil
            }
            currentNode = child
        }
        return currentNode
    }
}
141+
142+
/// A node in a trie: stores one path element, its child nodes, and the documents
/// associated with the word that ends at this node.
private final class TrieNode<T: Hashable, Document: Hashable> {
    /// The path element (i.e., character) held by this node. `nil` for the root.
    let value: T?

    /// The node this one hangs off. `nil` for the root.
    private weak var parent: TrieNode<T, Document>?

    /// Child nodes keyed by the value they hold; accessed under `childrenLock`.
    private let childrenLock = Lock()
    private var _children = [T: TrieNode<T, Document>]()

    /// Documents containing the word spelled by the path to this node, if that path
    /// is a valid word; accessed under `documentsLock`.
    private let documentsLock = Lock()
    private var _documents = Set<Document>()

    init(value: T? = nil, parent: TrieNode<T, Document>? = nil) {
        self.value = value
        self.parent = parent
    }

    /// A snapshot of this node's children.
    var children: [T: TrieNode<T, Document>] {
        self.childrenLock.withLock { self._children }
    }

    /// A snapshot of this node's documents.
    var documents: Set<Document> {
        self.documentsLock.withLock { self._documents }
    }

    /// `true` when this node has no children.
    var isLeaf: Bool {
        self.childrenLock.withLock { self._children.isEmpty }
    }

    /// `true` when the path to this node forms a valid word, i.e. it has documents.
    var isTerminating: Bool {
        self.documentsLock.withLock { !self._documents.isEmpty }
    }

    /// Adds a subpath under this node.
    ///
    /// - Returns: The child for `value`; the existing one is reused if present.
    func add(value: T) -> TrieNode<T, Document> {
        self.childrenLock.withLock {
            guard let existing = self._children[value] else {
                let created = TrieNode<T, Document>(value: value, parent: self)
                self._children[value] = created
                return created
            }
            return existing
        }
    }

    /// Removes a subpath from this node.
    func remove(value: T) {
        self.childrenLock.withLock {
            self._children[value] = nil
        }
    }

    /// Records a document in which the word formed by the path to this node can be found.
    func add(document: Document) {
        self.documentsLock.withLock {
            _ = self._documents.insert(document)
        }
    }

    /// Drops a referenced document.
    func remove(document: Document) {
        self.documentsLock.withLock {
            _ = self._documents.remove(document)
        }
    }
}

Diff for: Sources/PackageCollections/Storage/PackageCollectionsStorage.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public protocol PackageCollectionsStorage {
5353
query: String,
5454
callback: @escaping (Result<PackageCollectionsModel.PackageSearchResult, Error>) -> Void)
5555

56-
/// Returns optional `PackageSearchResult.Item` for the given package identity.
56+
/// Returns `PackageSearchResult.Item` for the given package identity.
5757
///
5858
/// - Parameters:
5959
/// - identifier: The package identifier

0 commit comments

Comments
 (0)