15
15
import Foundation
16
16
import ModelSupport
17
17
18
+ /// A dataset targeted at the problem of word segmentation.
19
+ ///
20
+ /// The reference archive was published in the paper "Learning to Discover,
21
+ /// Ground, and Use Words with Segmental Neural Language Models" by Kazuya
22
+ /// Kawakami, Chris Dyer, and Phil Blunsom:
23
+ /// https://www.aclweb.org/anthology/P19-1645.pdf.
18
24
public struct WordSegDataset {
19
- public let training : [ WordSegRecord ]
20
- public private( set) var testing : [ WordSegRecord ] ?
21
- public private( set) var validation : [ WordSegRecord ] ?
25
+
26
+ /// The training data.
27
+ public let trainingPhrases : [ Phrase ]
28
+
29
+ /// The test data.
30
+ public private( set) var testingPhrases : [ Phrase ]
31
+
32
+ /// The validation data.
33
+ public private( set) var validationPhrases : [ Phrase ]
34
+
35
+ /// A mapping between characters used in the dataset and densely-packed integers.
22
36
public let alphabet : Alphabet
23
37
24
- private struct DownloadDetails {
25
- var archiveLocation = URL ( string: " https://s3.eu-west-2.amazonaws.com/k-kawakami " ) !
26
- var archiveFileName = " seg "
27
- var archiveExtension = " zip "
28
- var testingFilePath = " br/br-text/te.txt "
29
- var trainingFilePath = " br/br-text/tr.txt "
30
- var validationFilePath = " br/br-text/va.txt "
31
- }
38
/// Describes the remote archive holding the reference data and the layout
/// of the data files inside it once it has been unpacked.
private struct DownloadableArchive {

  /// The path to the training data, relative to the root of the unpacked archive.
  let trainingFilePath = "br/br-text/tr.txt"

  /// The path to the validation data, relative to the root of the unpacked archive.
  let validationFilePath = "br/br-text/va.txt"

  /// The path to the test data, relative to the root of the unpacked archive.
  let testingFilePath = "br/br-text/te.txt"

  /// The [web resource](https://en.wikipedia.org/wiki/Web_resource) to download;
  /// unpacking it yields the data files named by the other properties of `self`.
  ///
  /// The force-unwrap is safe: the literal is a well-formed URL, so a `nil`
  /// here can only be a programming error.
  let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")!
}
54
+
55
/// Returns the non-empty lines of `data`, interpreted as UTF-8 text.
///
/// Bytes that are not valid UTF-8 are repaired by `String(decoding:as:)`
/// rather than reported as an error, so this function never fails.
///
/// - Parameter data: the raw contents of a dataset file, one phrase per line.
/// - Returns: one `Substring` per non-empty line, in file order.
private static func load(data: Data) -> [Substring] {
  let contents = String(decoding: data, as: Unicode.UTF8.self)

  // Split on any newline character rather than on "\n" alone: Swift treats
  // "\r\n" as a single `Character` that is not equal to "\n", so a file with
  // CRLF line endings would otherwise come back as one giant phrase.
  return contents.split(omittingEmptySubsequences: true) { $0.isNewline }
}
50
61
62
/// Returns an alphabet covering every distinct non-whitespace character
/// that occurs in `phrases`.
///
/// - Parameter phrases: the text whose characters make up the alphabet.
/// - Parameter eos: the end of sequence marker.
/// - Parameter eow: the end of word marker.
/// - Parameter pad: the padding marker.
private static func makeAlphabet(
  phrases: [Substring],
  eos: String = "</s>",
  eow: String = "</w>",
  pad: String = "</pad>"
) -> Alphabet {
  // Whitespace only delimits words in the source text, so it is left out.
  let nonWhitespace = phrases.joined().lazy.filter { !$0.isWhitespace }

  // Sorting makes the character-to-integer assignment easier to interpret.
  let orderedLetters = Set(nonWhitespace).sorted()

  return Alphabet(orderedLetters, eos: eos, eow: eow, pad: pad)
}
75
80
76
- private static func convertDataset( _ dataset: [ String ] , alphabet: Alphabet ) throws
77
- -> [ WordSegRecord ]
78
- {
79
- return try dataset. map {
80
- let trimmed = $0. components ( separatedBy: . whitespaces) . joined ( )
81
- return try WordSegRecord (
82
- plainText: $0,
83
- numericalizedText: CharacterSequence (
84
- alphabet: alphabet, appendingEoSTo: trimmed) )
85
- }
86
- }
87
- private static func convertDataset( _ dataset: [ String ] ? , alphabet: Alphabet ) throws
88
- -> [ WordSegRecord ] ?
81
/// Numericalizes `dataset` with the mapping in `alphabet`, to be used with the
/// WordSeg model.
///
/// Each phrase keeps its original text as `plainText`; the numericalized form
/// is built from the phrase with all whitespace removed.
///
/// - Parameter dataset: the phrases to convert, one per element.
/// - Parameter alphabet: the character mapping used for the conversion.
/// - Returns: the converted phrases, in the order they appear in `dataset`.
/// - Note: Omits any phrase that cannot be converted to `CharacterSequence`.
private static func numericalizeDataset(_ dataset: [Substring], alphabet: Alphabet)
  -> [Phrase]
{
  return dataset.compactMap { line -> Phrase? in
    // Strip with the same predicate `makeAlphabet` uses to exclude characters.
    // Removing only U+0020 would leave tabs and other whitespace in the text;
    // those are not in the alphabet, so the phrase would be silently dropped.
    let unsegmented = String(line.filter { !$0.isWhitespace })

    // Deliberate best effort: a phrase that cannot be numericalized is skipped.
    guard
      let numericalizedText = try? CharacterSequence(
        alphabet: alphabet, appendingEoSTo: unsegmented)
    else { return nil }

    return Phrase(plainText: String(line), numericalizedText: numericalizedText)
  }
}
96
104
105
/// Creates an instance containing phrases from the reference archive,
/// downloading and unpacking the archive first if it is not already stored
/// locally.
///
/// - Throws: the error raised by `init(training:validation:testing:)` when a
///   data file from the unpacked archive cannot be read.
public init() throws {
  let archive = DownloadableArchive()
  let storageRoot: URL = DatasetUtilities.defaultDirectory
    .appendingPathComponent("WordSeg", isDirectory: true)

  Self.downloadIfNotPresent(to: storageRoot, source: archive)

  // The archive unpacks into a directory named after the file, minus its extension.
  let unpackedName = archive.location.deletingPathExtension().lastPathComponent
  let unpackedRoot = storageRoot.appendingPathComponent(unpackedName)

  /// Returns the local file-system path of `relativePath` inside the unpacked archive.
  func localPath(_ relativePath: String) -> String {
    return unpackedRoot.appendingPathComponent(relativePath).path
  }

  try self.init(
    training: localPath(archive.trainingFilePath),
    validation: localPath(archive.validationFilePath),
    testing: localPath(archive.testingFilePath))
}
121
135
136
+ /// Creates an instance containing phrases from `trainingFile`, and
137
+ /// optionally `validationFile` and `testingFile`.
138
+ ///
139
+ /// - Throws: an error in the Cocoa domain, if `trainingFile` cannot be
140
+ /// read.
122
141
public init (
123
142
training trainingFile: String ,
124
143
validation validationFile: String ? = nil ,
@@ -127,53 +146,38 @@ public struct WordSegDataset {
127
146
let trainingData = try Data (
128
147
contentsOf: URL ( fileURLWithPath: trainingFile) ,
129
148
options: . alwaysMapped)
130
- let training = try Self . load ( data: trainingData)
131
149
132
- var validation : [ String ] ? = nil
133
- var testing : [ String ] ? = nil
150
+ let validationData = try Data (
151
+ contentsOf: URL ( fileURLWithPath: validationFile ?? " /dev/null " ) ,
152
+ options: . alwaysMapped)
134
153
135
- if let validationFile = validationFile {
136
- let data = try Data (
137
- contentsOf: URL ( fileURLWithPath: validationFile) ,
138
- options: . alwaysMapped)
139
- validation = try Self . load ( data: data)
140
- }
154
+ let testingData = try Data (
155
+ contentsOf: URL ( fileURLWithPath: testingFile ?? " /dev/null " ) ,
156
+ options: . alwaysMapped)
141
157
142
- if let testingFile = testingFile {
143
- let data : Data = try Data (
144
- contentsOf: URL ( fileURLWithPath: testingFile) ,
145
- options: . alwaysMapped)
146
- testing = try Self . load ( data: data)
147
- }
148
- self . alphabet = Self . makeAlphabet ( datasets: training, validation, testing)
149
- self . training = try Self . convertDataset ( training, alphabet: self . alphabet)
150
- self . validation = try Self . convertDataset ( validation, alphabet: self . alphabet)
151
- self . testing = try Self . convertDataset ( testing, alphabet: self . alphabet)
158
+ self . init (
159
+ training: trainingData, validation: validationData, testing: testingData)
152
160
}
153
161
162
/// Creates an instance containing phrases from `trainingData`, and
/// optionally `validationData` and `testingData`.
///
/// A single alphabet is built from all three splits, so every split is
/// numericalized with the same mapping. A split passed as `nil` yields no
/// phrases.
///
/// - Parameter trainingData: the UTF-8 contents of the training split.
/// - Parameter validationData: the UTF-8 contents of the validation split, if any.
/// - Parameter testingData: the UTF-8 contents of the test split, if any.
public init(
  training trainingData: Data, validation validationData: Data?, testing testingData: Data?
) {
  let trainingLines = Self.load(data: trainingData)
  let validationLines = validationData.map { Self.load(data: $0) } ?? []
  let testingLines = testingData.map { Self.load(data: $0) } ?? []

  let sharedAlphabet = Self.makeAlphabet(
    phrases: trainingLines + validationLines + testingLines)

  alphabet = sharedAlphabet
  trainingPhrases = Self.numericalizeDataset(trainingLines, alphabet: sharedAlphabet)
  validationPhrases = Self.numericalizeDataset(validationLines, alphabet: sharedAlphabet)
  testingPhrases = Self.numericalizeDataset(testingLines, alphabet: sharedAlphabet)
}
174
176
177
+ /// Downloads and unpacks `source` to `directory` if it does not
178
+ /// exist locally.
175
179
private static func downloadIfNotPresent(
176
- to directory: URL , downloadDetails : DownloadDetails
180
+ to directory: URL , source : DownloadableArchive
177
181
) {
178
182
let downloadPath = directory. path
179
183
let directoryExists = FileManager . default. fileExists ( atPath: downloadPath)
@@ -182,11 +186,15 @@ public struct WordSegDataset {
182
186
183
187
guard !directoryExists || directoryEmpty else { return }
184
188
189
+ let remoteRoot = source. location. deletingLastPathComponent ( )
190
+ let filename = source. location. deletingPathExtension ( ) . lastPathComponent
191
+ let fileExtension = source. location. pathExtension
192
+
185
193
// Downloads and extracts dataset files.
186
194
let _ = DatasetUtilities . downloadResource (
187
- filename: downloadDetails . archiveFileName ,
188
- fileExtension: downloadDetails . archiveExtension ,
189
- remoteRoot: downloadDetails . archiveLocation ,
195
+ filename: filename ,
196
+ fileExtension: fileExtension ,
197
+ remoteRoot: remoteRoot ,
190
198
localStorageDirectory: directory, extract: true )
191
199
}
192
200
}
0 commit comments