Skip to content

Commit df8919e

Browse files
committed
Merge branch 'just-one-more-benchmark-suite' into temp
2 parents 3a2b324 + fdf2c23 commit df8919e

16 files changed

+18906
-34
lines changed

Sources/RegexBenchmark/Benchmark.swift

+17-14
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,12 @@ struct CrossBenchmark {
6969
/// TODO: Probably better to have a whole-line vs search anywhere, maybe
7070
/// accommodate multi-line matching, etc.
7171
var isWhole: Bool = false
72+
73+
/// Whether or not to do firstMatch as well or just allMatches
74+
var includeFirst: Bool = false
7275

7376
func register(_ runner: inout BenchmarkRunner) {
7477
let swiftRegex = try! Regex(regex)
75-
76-
let nsPattern = isWhole ? "^" + regex + "$" : regex
7778
let nsRegex: NSRegularExpression
7879
if isWhole {
7980
nsRegex = try! NSRegularExpression(pattern: "^" + regex + "$")
@@ -95,30 +96,32 @@ struct CrossBenchmark {
9596
type: .first,
9697
target: input))
9798
} else {
98-
runner.register(
99-
Benchmark(
100-
name: baseName + "First",
101-
regex: swiftRegex,
102-
type: .first,
103-
target: input))
10499
runner.register(
105100
Benchmark(
106101
name: baseName + "All",
107102
regex: swiftRegex,
108103
type: .allMatches,
109104
target: input))
110-
runner.register(
111-
NSBenchmark(
112-
name: baseName + "First_NS",
113-
regex: nsRegex,
114-
type: .first,
115-
target: input))
116105
runner.register(
117106
NSBenchmark(
118107
name: baseName + "All_NS",
119108
regex: nsRegex,
120109
type: .allMatches,
121110
target: input))
111+
if includeFirst {
112+
runner.register(
113+
Benchmark(
114+
name: baseName + "First",
115+
regex: swiftRegex,
116+
type: .first,
117+
target: input))
118+
runner.register(
119+
NSBenchmark(
120+
name: baseName + "First_NS",
121+
regex: nsRegex,
122+
type: .first,
123+
target: input))
124+
}
122125
}
123126
}
124127
}

Sources/RegexBenchmark/BenchmarkRegistration.swift

+2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ extension BenchmarkRunner {
1616
benchmark.addHTML()
1717
benchmark.addEmail()
1818
benchmark.addCustomCharacterClasses()
19+
benchmark.addDna()
20+
benchmark.addUnicode()
1921
// -- end of registrations --
2022
return benchmark
2123
}

Sources/RegexBenchmark/CLI.swift

+16-5
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import ArgumentParser
22

33
@main
44
struct Runner: ParsableCommand {
5-
@Argument(help: "Names of benchmarks to run")
5+
@Argument(help: "Patterns for benchmarks to run")
66
var specificBenchmarks: [String] = []
77

88
@Flag(help: "Run only once for profiling purposes")
@@ -20,19 +20,30 @@ struct Runner: ParsableCommand {
2020
@Flag(help: "Should the results be saved")
2121
var save = false
2222

23-
@Flag(help: "Compare this result with the latest saved result")
23+
@Flag(help: "Compare this result with a saved result")
2424
var compare = false
2525

2626
@Option(help: "The result file to compare against, if this flag is not set it will compare against the most recent result file")
2727
var compareFile: String?
2828

29+
@Flag(help: "Exclude the comparisons to NSRegex")
30+
var excludeNs = false
31+
2932
mutating func run() throws {
3033
var runner = BenchmarkRunner.makeRunner(samples, outputPath)
31-
32-
// todo: regex based filter
34+
3335
if !self.specificBenchmarks.isEmpty {
34-
runner.suite = runner.suite.filter { b in specificBenchmarks.contains(b.name) }
36+
runner.suite = runner.suite.filter { b in
37+
specificBenchmarks.contains { pattern in
38+
try! Regex(pattern).wholeMatch(in: b.name) != nil
39+
}
40+
}
3541
}
42+
43+
if excludeNs {
44+
runner.suite = runner.suite.filter { b in !b.name.contains("NS") }
45+
}
46+
3647
switch (profile, debug) {
3748
case (true, true): print("Cannot run both profile and debug")
3849
case (true, false): runner.profile()

Sources/RegexBenchmark/Inputs/DnaFASTA.swift

+16,676
Large diffs are not rendered by default.

Sources/RegexBenchmark/Inputs/TaggedUnicode.swift

+2,008
Large diffs are not rendered by default.

Sources/RegexBenchmark/Suite/CssRegex.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ extension BenchmarkRunner {
66
let r = #"--([a-zA-Z0-9_-]+)\s*:\s*(.*?);"#
77

88
let css = CrossBenchmark(
9-
baseName: "css", regex: r, input: Inputs.swiftOrgCSS)
9+
baseName: "Css", regex: r, input: Inputs.swiftOrgCSS)
1010
css.register(&self)
1111
}
1212
}

Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift

+6-6
Original file line numberDiff line numberDiff line change
@@ -13,37 +13,37 @@ extension BenchmarkRunner {
1313
let input = Inputs.graphemeBreakData
1414

1515
register(Benchmark(
16-
name: "basicCCC",
16+
name: "BasicCCC",
1717
regex: try! Regex(basic),
1818
type: .allMatches,
1919
target: input))
2020

2121
register(Benchmark(
22-
name: "basicRangeCCC",
22+
name: "BasicRangeCCC",
2323
regex: try! Regex(basicRange),
2424
type: .allMatches,
2525
target: input))
2626

2727
register(Benchmark(
28-
name: "caseInsensitiveCCC",
28+
name: "CaseInsensitiveCCC",
2929
regex: try! Regex(caseInsensitive),
3030
type: .allMatches,
3131
target: input))
3232

3333
register(Benchmark(
34-
name: "invertedCCC",
34+
name: "InvertedCCC",
3535
regex: try! Regex(inverted),
3636
type: .allMatches,
3737
target: input))
3838

3939
register(Benchmark(
40-
name: "subtractionCCC",
40+
name: "SubtractionCCC",
4141
regex: try! Regex(subtraction),
4242
type: .allMatches,
4343
target: input))
4444

4545
register(Benchmark(
46-
name: "intersectionCCC",
46+
name: "IntersectionCCC",
4747
regex: try! Regex(intersection),
4848
type: .allMatches,
4949
target: input))
+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import _StringProcessing
2+
3+
extension BenchmarkRunner {
4+
mutating func addDna() {
5+
// regex-redux from the benchmarks game
6+
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/description/regexredux.html#regexredux
7+
let dna = "agg[act]taaa|ttta[agt]cct"
8+
let ends = "aND|caN|Ha[DS]|WaS"
9+
10+
let dnaMatching = CrossBenchmark(
11+
baseName: "DnaMatch",
12+
regex: dna,
13+
input: Inputs.dnaFASTA,
14+
includeFirst: true)
15+
16+
let sequenceEnds = CrossBenchmark(
17+
baseName: "DnaEndsMatch",
18+
regex: ends,
19+
input: Inputs.dnaFASTA,
20+
includeFirst: true)
21+
22+
dnaMatching.register(&self)
23+
sequenceEnds.register(&self)
24+
}
25+
}

Sources/RegexBenchmark/Suite/EmailRegex.swift

+4-4
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,22 @@ extension BenchmarkRunner {
1313
let emailWithLookaheads = #"(?=[A-z0-9][A-z0-9@._%+-]{5,253})[A-z0-9._%+-]{1,64}@(?:(?=[A-z0-9-]{1,63}\.)[A-z0-9]+(?:-[A-z0-9]+)*\.){1,8}[A-z]{2,63}"#
1414

1515
let emailRFCValid = CrossBenchmark(
16-
baseName: "emailRFC", regex: emailRFC, input: Inputs.validEmails)
16+
baseName: "EmailRFC", regex: emailRFC, input: Inputs.validEmails)
1717

1818
let emailRFCInvalid = CrossBenchmark(
19-
baseName: "emailRFCNoMatches",
19+
baseName: "EmailRFCNoMatches",
2020
regex: emailRFC,
2121
input: Inputs.graphemeBreakData
2222
)
2323

2424
let emailValid = CrossBenchmark(
25-
baseName: "emailLookahead",
25+
baseName: "EmailLookahead",
2626
regex: emailWithLookaheads,
2727
input: Inputs.validEmails
2828
)
2929

3030
let emailInvalid = CrossBenchmark(
31-
baseName: "emailLookaheadNoMatches",
31+
baseName: "EmailLookaheadNoMatches",
3232
regex: emailWithLookaheads,
3333
input: Inputs.graphemeBreakData
3434
)

Sources/RegexBenchmark/Suite/GraphemeBreak.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ extension BenchmarkRunner {
1818
let regex = #"HANGUL SYLLABLE [A-Z]+(?:\.\.HANGUL SYLLABLE [A-Z]+)?"#
1919

2020
let benchmark = CrossBenchmark(
21-
baseName: "HangulSyllable", regex: regex, input: input)
21+
baseName: "HangulSyllable", regex: regex, input: input, includeFirst: true)
2222
benchmark.register(&self)
2323
}
2424
}

Sources/RegexBenchmark/Suite/HtmlRegex.swift

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ extension BenchmarkRunner {
66
let r = #"<(\w*)\b[^>]*>(.*?)<\/\1>"#
77

88
let html = CrossBenchmark(
9-
baseName: "html", regex: r, input: Inputs.swiftOrgHTML)
9+
baseName: "Html", regex: r, input: Inputs.swiftOrgHTML)
1010
html.register(&self)
1111
}
1212
}

Sources/RegexBenchmark/Suite/NotFound.swift

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ extension BenchmarkRunner {
66
let input = String(repeating: " ", count: 100_000)
77

88
let notFound = CrossBenchmark(
9-
baseName: "notFound", regex: "a", input: input)
9+
baseName: "NotFound", regex: "a", input: input)
1010
notFound.register(&self)
1111

1212
let anchoredNotFound = CrossBenchmark(
13-
baseName: "notFound", regex: "^ +a", input: input)
13+
baseName: "AnchoredNotFound", regex: "^ +a", input: input)
1414
anchoredNotFound.register(&self)
1515
}
1616
}
+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import _StringProcessing
2+
3+
extension BenchmarkRunner {
4+
mutating func addUnicode() {
5+
// tagged unicode: unicode characters surrounded by html tags
6+
// use the same html regex, uses backreference + reluctant quantification
7+
let tags = #"<(\w*)\b[^>]*>(.*?)<\/\1>"#
8+
let taggedEmojis = CrossBenchmark(
9+
baseName: "TaggedEmojis",
10+
regex: tags,
11+
input: Inputs.taggedEmojis)
12+
13+
// Now actually matching emojis
14+
let emoji = #"(😃|😀|😳|😲|😦|😊|🙊|😘|😏|😳|😒){2,5}"#
15+
16+
let emojiRegex = CrossBenchmark(
17+
baseName: "EmojiRegex",
18+
regex: emoji,
19+
input: Inputs.taggedEmojis)
20+
21+
taggedEmojis.register(&self)
22+
emojiRegex.register(&self)
23+
}
24+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# DnaFASTA.swift was generated with python3 Utils/generateFasta.py 100000
2+
3+
# The Computer Language Benchmarks Game
4+
# https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
5+
#
6+
# modified by Ian Osgood
7+
# modified again by Heinrich Acker
8+
# modified by Justin Peel
9+
# 2to3
10+
11+
"""Copyright © 2004-2008 Brent Fulgham, 2005-2022 Isaac Gouy
12+
All rights reserved.
13+
14+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
15+
16+
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
17+
18+
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
19+
20+
3. Neither the name "The Computer Language Benchmarks Game" nor the name "The Benchmarks Game" nor the name "The Computer Language Shootout Benchmarks" nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
21+
22+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."""
23+
24+
import sys, bisect
25+
26+
alu = (
27+
'GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG'
28+
'GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA'
29+
'CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT'
30+
'ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA'
31+
'GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG'
32+
'AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC'
33+
'AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA')
34+
35+
iub = list(zip('acgtBDHKMNRSVWY', [0.27, 0.12, 0.12, 0.27] + [0.02]*11))
36+
37+
homosapiens = [
38+
('a', 0.3029549426680),
39+
('c', 0.1979883004921),
40+
('g', 0.1975473066391),
41+
('t', 0.3015094502008),
42+
]
43+
44+
45+
def genRandom(ia = 3877, ic = 29573, im = 139968):
46+
seed = 42
47+
imf = float(im)
48+
while 1:
49+
seed = (seed * ia + ic) % im
50+
yield seed / imf
51+
52+
Random = genRandom()
53+
54+
def makeCumulative(table):
55+
P = []
56+
C = []
57+
prob = 0.
58+
for char, p in table:
59+
prob += p
60+
P += [prob]
61+
C += [char]
62+
return (P, C)
63+
64+
def repeatFasta(src, n):
65+
width = 60
66+
r = len(src)
67+
s = src + src + src[:n % r]
68+
for j in range(n // width):
69+
i = j*width % r
70+
print(s[i:i+width])
71+
if n % width:
72+
print(s[-(n % width):])
73+
74+
def randomFasta(table, n):
75+
width = 60
76+
r = range(width)
77+
gR = Random.__next__
78+
bb = bisect.bisect
79+
jn = ''.join
80+
probs, chars = makeCumulative(table)
81+
for j in range(n // width):
82+
x = jn([chars[bb(probs, gR())] for i in r])
83+
print(x)
84+
if n % width:
85+
print(jn([chars[bb(probs, gR())] for i in range(n % width)]))
86+
87+
def main():
88+
n = int(sys.argv[1])
89+
90+
print('>ONE Homo sapiens alu')
91+
repeatFasta(alu, n*2)
92+
93+
print('>TWO IUB ambiguity codes')
94+
randomFasta(iub, n*3)
95+
96+
print('>THREE Homo sapiens frequency')
97+
randomFasta(homosapiens, n*5)
98+
99+
main()

0 commit comments

Comments
 (0)