Skip to content
This repository was archived by the owner on Apr 23, 2025. It is now read-only.

Commit 20fa285

Browse files
texasmichelle and Dave Abrahams authored
[WordSeg] Add inline documentation (#566)
* Add documentation
* Lint
* Rename WordSegRecord to Phrase Rename DownloadDetails to ReferenceArchive Combine URL with filename and extension Update main summary in WordSegDataset Add blank line before doc comments in WordSegDataset
* Update CMakeLists
* Clarify more summaries. Remove explicit parameter descriptions and add them to summaries. Handle errors instead of throwing. Remove CharacterErrors.nonUtf8Data. Update attribute names in dataset tests.
* Add blank lines Update summary in Phrase to include parameter names. Remove explicit parameter descriptions and add them to summaries. Conform parameter names to Swift conventions.
* Clarify lattice summary.
* Summary refinement
* Clarify end marker behavior and assumptions
* Rename ReferenceArchive to DownloadableArchive Change members to lets
* Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams <[email protected]>
* Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams <[email protected]>
* Remove implied text from comments with phrase.
* Remove Foundation string processing Remove unnecessary additional `load()` Remove unnecessary optional from `testingPhrases` and `validationPhrases` Simplify optional filename handling in init() Remove extra `)` from training loss output Add test for loading only training file
* Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams <[email protected]>
* Remove variadic arguments in makeAlphabet Simplify and remove redundant init code
* Rename convertDataset to numericalizeDataset
* Remove raw loop in makeAlphabet Rename downloadableArchive to source Preserve intermediate array type
* Update Datasets/WordSeg/WordSegDataset.swift Co-authored-by: Dave Abrahams <[email protected]>
* s/densly/densely/
* Remove hard-coded path
* Replace `WordSegDataset` with `Self`
1 parent d1c0921 commit 20fa285

File tree

14 files changed

+497
-260
lines changed

14 files changed

+497
-260
lines changed

Benchmarks/Models/WordSeg.swift

+5-5
Original file line numberDiff line numberDiff line change
@@ -106,14 +106,14 @@ struct WordSegBenchmark: Benchmark {
106106
from: [sentence],
107107
alphabet: dataset.alphabet,
108108
maxLength: maximumSequenceLength,
109-
minFreq: 10
109+
minFrequency: 10
110110
)
111111

112112
let modelParameters = SNLM.Parameters(
113-
ndim: 512,
114-
dropoutProb: 0.5,
115-
chrVocab: dataset.alphabet,
116-
strVocab: lexicon,
113+
hiddenSize: 512,
114+
dropoutProbability: 0.5,
115+
alphabet: dataset.alphabet,
116+
lexicon: lexicon,
117117
order: 5
118118
)
119119

Datasets/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ add_library(Datasets
2121
TensorPair.swift
2222
TextUnsupervised/TextUnsupervised.swift
2323
WordSeg/WordSegDataset.swift
24-
WordSeg/WordSegRecord.swift
24+
WordSeg/Phrase.swift
2525
ImageSegmentationDataset.swift
2626
OxfordIIITPets/OxfordIIITPets.swift)
2727
target_link_libraries(Datasets PUBLIC

Datasets/WordSeg/WordSegRecord.swift renamed to Datasets/WordSeg/Phrase.swift

+8-1
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,17 @@
1414

1515
import ModelSupport
1616

17-
public struct WordSegRecord {
17+
/// A sequence of text for use in word segmentation.
18+
public struct Phrase {
19+
20+
/// A raw, unprocessed sequence of text.
1821
public let plainText: String
22+
23+
/// A sequence of text in numeric form, derived from `plainText`.
1924
public let numericalizedText: CharacterSequence
2025

26+
/// Creates an instance containing both raw (`plainText`) and processed
27+
/// (`numericalizedText`) forms of a sequence of text.
2128
public init(plainText: String, numericalizedText: CharacterSequence) {
2229
self.plainText = plainText
2330
self.numericalizedText = numericalizedText

Datasets/WordSeg/WordSegDataset.swift

+110-102
Original file line numberDiff line numberDiff line change
@@ -15,110 +15,129 @@
1515
import Foundation
1616
import ModelSupport
1717

18+
/// A dataset targeted at the problem of word segmentation.
19+
///
20+
/// The reference archive was published in the paper "Learning to Discover,
21+
/// Ground, and Use Words with Segmental Neural Language Models" by Kazuya
22+
/// Kawakami, Chris Dyer, and Phil Blunsom:
23+
/// https://www.aclweb.org/anthology/P19-1645.pdf.
1824
public struct WordSegDataset {
19-
public let training: [WordSegRecord]
20-
public private(set) var testing: [WordSegRecord]?
21-
public private(set) var validation: [WordSegRecord]?
25+
26+
/// The training data.
27+
public let trainingPhrases: [Phrase]
28+
29+
/// The test data.
30+
public private(set) var testingPhrases: [Phrase]
31+
32+
/// The validation data.
33+
public private(set) var validationPhrases: [Phrase]
34+
35+
/// A mapping between characters used in the dataset and densely packed integers.
2236
public let alphabet: Alphabet
2337

24-
private struct DownloadDetails {
25-
var archiveLocation = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami")!
26-
var archiveFileName = "seg"
27-
var archiveExtension = "zip"
28-
var testingFilePath = "br/br-text/te.txt"
29-
var trainingFilePath = "br/br-text/tr.txt"
30-
var validationFilePath = "br/br-text/va.txt"
31-
}
38+
/// A description of the downloadable source archive and the data files it contains.
39+
private struct DownloadableArchive {
3240

33-
private static func load(data: Data) throws -> [String] {
34-
guard let contents: String = String(data: data, encoding: .utf8) else {
35-
throw CharacterErrors.nonUtf8Data
36-
}
37-
return load(contents: contents)
38-
}
41+
/// A [web resource](https://en.wikipedia.org/wiki/Web_resource) that can be unpacked
42+
/// into data files described by other properties of `self`.
43+
let location = URL(string: "https://s3.eu-west-2.amazonaws.com/k-kawakami/seg.zip")!
3944

40-
private static func load(contents: String) -> [String] {
41-
var strings = [String]()
45+
/// The path to the test data within the unpacked archive.
46+
let testingFilePath = "br/br-text/te.txt"
4247

43-
for line in contents.components(separatedBy: .newlines) {
44-
let trimmed = line.trimmingCharacters(in: .whitespaces)
45-
if trimmed.isEmpty { continue }
46-
strings.append(trimmed)
47-
}
48-
return strings
48+
/// The path to the training data within the unpacked archive.
49+
let trainingFilePath = "br/br-text/tr.txt"
50+
51+
/// The path to the validation data within the unpacked archive.
52+
let validationFilePath = "br/br-text/va.txt"
53+
}
54+
55+
/// Returns the phrases in `data`, decoded as UTF-8 and split on newlines; empty lines are omitted.
56+
private static func load(data: Data) -> [Substring] {
57+
let contents = String(decoding: data, as: Unicode.UTF8.self)
58+
let splitContents = contents.split(separator: "\n", omittingEmptySubsequences: true)
59+
return splitContents
4960
}
5061

62+
/// Returns an alphabet built from the union of all non-whitespace characters in `phrases`.
63+
///
64+
/// - Parameter eos: the end of sequence marker.
65+
/// - Parameter eow: the end of word marker.
66+
/// - Parameter pad: the padding marker.
5167
private static func makeAlphabet(
52-
datasets training: [String],
53-
_ otherSequences: [String]?...,
68+
phrases: [Substring],
5469
eos: String = "</s>",
5570
eow: String = "</w>",
5671
pad: String = "</pad>"
5772
) -> Alphabet {
58-
var letters: Set<Character> = []
59-
60-
for dataset in otherSequences + [training] {
61-
guard let dataset = dataset else { continue }
62-
for sentence in dataset {
63-
for character in sentence {
64-
if !character.isWhitespace { letters.insert(character) }
65-
}
66-
}
67-
}
73+
let letters = Set(phrases.joined().lazy.filter { !$0.isWhitespace })
6874

6975
// Sort the letters to make it easier to interpret ints vs letters.
70-
var sorted = Array(letters)
71-
sorted.sort()
76+
let sorted = Array(letters).sorted()
7277

7378
return Alphabet(sorted, eos: eos, eow: eow, pad: pad)
7479
}
7580

76-
private static func convertDataset(_ dataset: [String], alphabet: Alphabet) throws
77-
-> [WordSegRecord]
78-
{
79-
return try dataset.map {
80-
let trimmed = $0.components(separatedBy: .whitespaces).joined()
81-
return try WordSegRecord(
82-
plainText: $0,
83-
numericalizedText: CharacterSequence(
84-
alphabet: alphabet, appendingEoSTo: trimmed))
85-
}
86-
}
87-
private static func convertDataset(_ dataset: [String]?, alphabet: Alphabet) throws
88-
-> [WordSegRecord]?
81+
/// Numericalizes `dataset` with the mapping in `alphabet`, to be used with the
82+
/// WordSeg model.
83+
///
84+
/// - Note: Omits any phrase that cannot be converted to `CharacterSequence`.
85+
private static func numericalizeDataset(_ dataset: [Substring], alphabet: Alphabet)
86+
-> [Phrase]
8987
{
90-
if let ds = dataset {
91-
let tmp: [WordSegRecord] = try convertDataset(ds, alphabet: alphabet) // Use tmp to disambiguate function
92-
return tmp
88+
var phrases = [Phrase]()
89+
90+
for data in dataset {
91+
let trimmed = data.split(separator: " ", omittingEmptySubsequences: true).joined()
92+
guard
93+
let numericalizedText = try? CharacterSequence(
94+
alphabet: alphabet, appendingEoSTo: trimmed)
95+
else { continue }
96+
let phrase = Phrase(
97+
plainText: String(data),
98+
numericalizedText: numericalizedText)
99+
phrases.append(phrase)
93100
}
94-
return nil
101+
102+
return phrases
95103
}
96104

105+
/// Creates an instance containing phrases from the reference archive.
106+
///
107+
/// - Throws: an error in the Cocoa domain, if the default training file
108+
/// cannot be read.
97109
public init() throws {
98-
let downloadDetails = DownloadDetails()
110+
let source = DownloadableArchive()
99111
let localStorageDirectory: URL = DatasetUtilities.defaultDirectory
100112
.appendingPathComponent("WordSeg", isDirectory: true)
101113

102-
WordSegDataset.downloadIfNotPresent(to: localStorageDirectory, downloadDetails: downloadDetails)
114+
Self.downloadIfNotPresent(
115+
to: localStorageDirectory, source: source)
103116

117+
let archiveFileName = source.location.deletingPathExtension().lastPathComponent
104118
let archiveDirectory =
105119
localStorageDirectory
106-
.appendingPathComponent(downloadDetails.archiveFileName)
120+
.appendingPathComponent(archiveFileName)
107121
let trainingFilePath =
108122
archiveDirectory
109-
.appendingPathComponent(downloadDetails.trainingFilePath).path
123+
.appendingPathComponent(source.trainingFilePath).path
110124
let validationFilePath =
111125
archiveDirectory
112-
.appendingPathComponent(downloadDetails.validationFilePath).path
126+
.appendingPathComponent(source.validationFilePath).path
113127
let testingFilePath =
114128
archiveDirectory
115-
.appendingPathComponent(downloadDetails.testingFilePath).path
129+
.appendingPathComponent(source.testingFilePath).path
116130

117131
try self.init(
118132
training: trainingFilePath, validation: validationFilePath,
119133
testing: testingFilePath)
120134
}
121135

136+
/// Creates an instance containing phrases from `trainingFile`, and
137+
/// optionally `validationFile` and `testingFile`.
138+
///
139+
/// - Throws: an error in the Cocoa domain, if `trainingFile` cannot be
140+
/// read.
122141
public init(
123142
training trainingFile: String,
124143
validation validationFile: String? = nil,
@@ -127,53 +146,38 @@ public struct WordSegDataset {
127146
let trainingData = try Data(
128147
contentsOf: URL(fileURLWithPath: trainingFile),
129148
options: .alwaysMapped)
130-
let training = try Self.load(data: trainingData)
131149

132-
var validation: [String]? = nil
133-
var testing: [String]? = nil
150+
let validationData = try Data(
151+
contentsOf: URL(fileURLWithPath: validationFile ?? "/dev/null"),
152+
options: .alwaysMapped)
134153

135-
if let validationFile = validationFile {
136-
let data = try Data(
137-
contentsOf: URL(fileURLWithPath: validationFile),
138-
options: .alwaysMapped)
139-
validation = try Self.load(data: data)
140-
}
154+
let testingData = try Data(
155+
contentsOf: URL(fileURLWithPath: testingFile ?? "/dev/null"),
156+
options: .alwaysMapped)
141157

142-
if let testingFile = testingFile {
143-
let data: Data = try Data(
144-
contentsOf: URL(fileURLWithPath: testingFile),
145-
options: .alwaysMapped)
146-
testing = try Self.load(data: data)
147-
}
148-
self.alphabet = Self.makeAlphabet(datasets: training, validation, testing)
149-
self.training = try Self.convertDataset(training, alphabet: self.alphabet)
150-
self.validation = try Self.convertDataset(validation, alphabet: self.alphabet)
151-
self.testing = try Self.convertDataset(testing, alphabet: self.alphabet)
158+
self.init(
159+
training: trainingData, validation: validationData, testing: testingData)
152160
}
153161

162+
/// Creates an instance containing phrases from `trainingData`, and
163+
/// optionally `validationData` and `testingData`.
154164
public init(
155165
training trainingData: Data, validation validationData: Data?, testing testingData: Data?
156-
)
157-
throws
158-
{
159-
let training = try Self.load(data: trainingData)
160-
var validation: [String]? = nil
161-
var testing: [String]? = nil
162-
if let validationData = validationData {
163-
validation = try Self.load(data: validationData)
164-
}
165-
if let testingData = testingData {
166-
testing = try Self.load(data: testingData)
167-
}
168-
169-
self.alphabet = Self.makeAlphabet(datasets: training, validation, testing)
170-
self.training = try Self.convertDataset(training, alphabet: self.alphabet)
171-
self.validation = try Self.convertDataset(validation, alphabet: self.alphabet)
172-
self.testing = try Self.convertDataset(testing, alphabet: self.alphabet)
166+
) {
167+
let training = Self.load(data: trainingData)
168+
let validation = Self.load(data: validationData ?? Data())
169+
let testing = Self.load(data: testingData ?? Data())
170+
171+
self.alphabet = Self.makeAlphabet(phrases: training + validation + testing)
172+
self.trainingPhrases = Self.numericalizeDataset(training, alphabet: self.alphabet)
173+
self.validationPhrases = Self.numericalizeDataset(validation, alphabet: self.alphabet)
174+
self.testingPhrases = Self.numericalizeDataset(testing, alphabet: self.alphabet)
173175
}
174176

177+
/// Downloads and unpacks `source` to `directory` if it does not
178+
/// exist locally.
175179
private static func downloadIfNotPresent(
176-
to directory: URL, downloadDetails: DownloadDetails
180+
to directory: URL, source: DownloadableArchive
177181
) {
178182
let downloadPath = directory.path
179183
let directoryExists = FileManager.default.fileExists(atPath: downloadPath)
@@ -182,11 +186,15 @@ public struct WordSegDataset {
182186

183187
guard !directoryExists || directoryEmpty else { return }
184188

189+
let remoteRoot = source.location.deletingLastPathComponent()
190+
let filename = source.location.deletingPathExtension().lastPathComponent
191+
let fileExtension = source.location.pathExtension
192+
185193
// Downloads and extracts dataset files.
186194
let _ = DatasetUtilities.downloadResource(
187-
filename: downloadDetails.archiveFileName,
188-
fileExtension: downloadDetails.archiveExtension,
189-
remoteRoot: downloadDetails.archiveLocation,
195+
filename: filename,
196+
fileExtension: fileExtension,
197+
remoteRoot: remoteRoot,
190198
localStorageDirectory: directory, extract: true)
191199
}
192200
}

0 commit comments

Comments
 (0)