-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathrequire-unicode-regexp.ts
369 lines (325 loc) · 10.6 KB
/
require-unicode-regexp.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
import { visitRegExpAST, RegExpParser } from "@eslint-community/regexpp"
import type {
Character,
CharacterClass,
CharacterSet,
Node,
Pattern,
Quantifier,
} from "@eslint-community/regexpp/ast"
import type { RegExpVisitor } from "@eslint-community/regexpp/visitor"
import type { CharRange } from "refa"
import type { ReadonlyFlags } from "regexp-ast-analysis"
import {
hasSomeDescendant,
toCache,
getFirstCharAfter,
toUnicodeSet,
} from "regexp-ast-analysis"
import type { RegExpContext } from "../utils"
import { createRule, defineRegexpVisitor } from "../utils"
/**
 * The largest value a single UTF-16 code unit can hold (U+FFFF). Characters
 * with a value above this are represented by a surrogate pair in UTF-16.
 */
const UTF16_MAX = 0xffff
/**
 * Checks whether the given pattern is compatible with Unicode mode on a
 * syntactical level. This means that:
 *
 * 1. The raw regex source is still syntactically valid with the `u` flag.
 * 2. The regex is parsed into the same constructs (*).
 *
 * (*) Unicode mode parses a surrogate pair as one character while non-Unicode
 * mode parses the pair as two separate characters. This difference is
 * deliberately ignored here, and so are the semantic differences between
 * escape sequences and similar details.
 *
 * @param pattern the pattern as parsed without the `u` flag
 * @returns `false` if the pattern is not compatible, otherwise the pattern
 * parsed again in Unicode mode
 */
function isSyntacticallyCompatible(pattern: Pattern): false | Pattern {
    // Step 1: the raw source has to be parsable in Unicode mode at all.
    let unicodePattern: Pattern
    try {
        unicodePattern = new RegExpParser().parsePattern(
            pattern.raw,
            undefined,
            undefined,
            { unicode: true },
        )
    } catch {
        return false
    }

    // Step 2: the source has to be parsed the same way.
    // The visitors cannot be stopped early, so a unique sentinel object is
    // thrown to abort the traversal as soon as an incompatibility is found.
    const mismatch = {}

    // A backslash followed by a single ASCII letter other than b, f, n, r, t
    // or v that the non-Unicode parser treated as a plain character.
    const letterEscape = /^\\(?![bfnrtv])[A-Za-z]$/u

    try {
        // Look for constructs in the non-Unicode regex that either are a
        // syntax error in Unicode mode or become a different construct.
        // Syntax errors were ruled out in step 1, so any hit here means the
        // construct is interpreted differently (Unicode-only features such
        // as `\p` hide behind escapes like these).
        visitRegExpAST(pattern, {
            onCharacterEnter(char) {
                if (letterEscape.test(char.raw)) {
                    throw mismatch
                }
            },
        })

        // Surrogate pairs are the other construct that is interpreted
        // differently. They are only tolerated as long as no pair is a
        // character class element or a quantified element.
        // See no-misleading-character-class for more details.
        visitRegExpAST(unicodePattern, {
            onCharacterEnter(char) {
                if (char.value > UTF16_MAX) {
                    const parentType = char.parent.type
                    if (
                        parentType === "CharacterClass" ||
                        parentType === "CharacterClassRange"
                    ) {
                        // /[😃]/ != /[😃]/u
                        throw mismatch
                    }
                }
            },
            onQuantifierEnter(quantifier) {
                const { element } = quantifier
                if (element.type === "Character" && element.value > UTF16_MAX) {
                    // /😃+/ != /😃+/u
                    throw mismatch
                }
            },
        })
    } catch (thrown) {
        if (thrown !== mismatch) {
            // Not our sentinel, so this is a genuine error: pass it on.
            throw thrown
        }
        return false
    }

    return unicodePattern
}
/** The high surrogates (U+D800-U+DBFF). */
const HIGH_SURROGATES: CharRange = { min: 0xd800, max: 0xdbff }
/** The low surrogates (U+DC00-U+DFFF). */
const LOW_SURROGATES: CharRange = { min: 0xdc00, max: 0xdfff }
/** All surrogates, high and low (U+D800-U+DFFF). */
const SURROGATES: CharRange = { min: 0xd800, max: 0xdfff }
/** The astral characters, i.e. everything above UTF-16's single-unit range (U+10000-U+10FFFF). */
const ASTRAL: CharRange = { min: 0x10000, max: 0x10ffff }
/**
 * Compares two lists of character ranges position by position.
 *
 * @returns `true` if both lists have the same length and the ranges at each
 * index have identical `min` and `max` bounds, `false` otherwise.
 */
function rangeEqual(a: readonly CharRange[], b: readonly CharRange[]): boolean {
    if (a.length !== b.length) {
        return false
    }

    for (const [index, left] of a.entries()) {
        const right = b[index]
        const sameBounds = left.min === right.min && left.max === right.max
        if (!sameBounds) {
            return false
        }
    }

    return true
}
/** The element types this rule treats as a single character-like unit. */
type CharLike = Character | CharacterClass | CharacterSet
/**
 * Type guard for character-like elements, i.e. nodes of type `Character`,
 * `CharacterClass` or `CharacterSet`.
 */
function isChar(node: Node): node is CharLike {
    switch (node.type) {
        case "Character":
        case "CharacterClass":
        case "CharacterSet":
            return true
        default:
            return false
    }
}
/**
 * Determines whether a char-like element accepts exactly the same characters
 * with and without the `u` flag.
 *
 * @param char the element to check
 * @param flags the flags of the regex as written
 * @param uFlags the same flags with Unicode mode enabled
 * @returns `true` only if the element accepts no surrogates and its character
 * ranges are identical under both sets of flags
 */
function isCompatibleCharLike(
    char: CharLike,
    flags: ReadonlyFlags,
    uFlags: ReadonlyFlags,
): boolean {
    const plainSet = toUnicodeSet(char, flags)

    if (plainSet.isDisjointWith(SURROGATES)) {
        // No surrogates involved, so comparing the accepted ranges decides.
        const unicodeSet = toUnicodeSet(char, uFlags)
        return rangeEqual(plainSet.chars.ranges, unicodeSet.chars.ranges)
    }

    // The character (class/set) contains high or low surrogates. In that
    // case there is no guarantee that the Unicode pattern behaves the same.
    return false
}
/**
 * Determines whether a quantifier accepts the same characters with and
 * without the `u` flag.
 *
 * @param q the quantifier to check
 * @param flags the flags of the regex as written
 * @param uFlags the same flags with Unicode mode enabled
 * @returns `true` or `false` if a decision could be made, and `undefined` if
 * this function cannot decide
 */
function isCompatibleQuantifier(
    q: Quantifier,
    flags: ReadonlyFlags,
    uFlags: ReadonlyFlags,
): boolean | undefined {
    const { element } = q

    if (!isChar(element)) {
        // Only quantified char-like elements can be decided here.
        return undefined
    }
    if (isCompatibleCharLike(element, flags, uFlags)) {
        // The trivial case: the element itself does not change.
        return true
    }

    // A quantifier `n*` or `n+` is the same with and without the u flag if
    // all of the following conditions hold:
    //
    // 1. The UTF16 characters of the element contain all surrogate
    //    characters (U+D800-U+DFFF).
    // 2. The Unicode characters of the element contain all surrogate
    //    characters (U+D800-U+DFFF) and all astral characters
    //    (U+10000-U+10FFFF).
    // 3. All non-surrogate and non-astral characters of the UTF16 and
    //    Unicode characters of the element are the same.
    // 4. The first character before the quantifier is not a high surrogate
    //    (U+D800-U+DBFF).
    // 5. The first character after the quantifier is not a low surrogate
    //    (U+DC00-U+DFFF).
    const isStarOrPlus = q.min <= 1 && q.max === Infinity
    if (!isStarOrPlus) {
        return undefined
    }

    // Condition 1
    const plainSet = toUnicodeSet(element, flags)
    if (!plainSet.isSupersetOf(SURROGATES)) {
        return false
    }

    // Condition 2
    const unicodeSet = toUnicodeSet(element, uFlags)
    const coversSurrogatesAndAstral =
        unicodeSet.isSupersetOf(SURROGATES) && unicodeSet.isSupersetOf(ASTRAL)
    if (!coversSurrogatesAndAstral) {
        return false
    }

    // Condition 3
    const unicodeRangesWithoutAstral = unicodeSet.without(ASTRAL).chars.ranges
    if (!rangeEqual(plainSet.chars.ranges, unicodeRangesWithoutAstral)) {
        return false
    }

    // Condition 4: going right-to-left yields the character before `q`.
    const preceding = getFirstCharAfter(q, "rtl", flags).char
    if (!preceding.isDisjointWith(HIGH_SURROGATES)) {
        return false
    }

    // Condition 5: going left-to-right yields the character after `q`.
    const following = getFirstCharAfter(q, "ltr", flags).char
    if (!following.isDisjointWith(LOW_SURROGATES)) {
        return false
    }

    return true
}
/**
 * Checks whether the regex keeps its behaviour if the `u` flag is added,
 * given that it is already known to be syntactically compatible.
 *
 * @param regexpContext the context of the regex as written
 * @param uPattern the same pattern parsed in Unicode mode
 * @returns `true` if no element was found that changes with the `u` flag
 */
function isSemanticallyCompatible(
    regexpContext: RegExpContext,
    uPattern: Pattern,
): boolean {
    // Record every source offset covered by a character above U+FFFF in the
    // Unicode pattern. The non-Unicode pattern has surrogate characters at
    // those offsets.
    const astralOffsets = new Set<number>()
    visitRegExpAST(uPattern, {
        onCharacterEnter(char) {
            if (char.value > UTF16_MAX) {
                for (let offset = char.start; offset < char.end; offset++) {
                    astralOffsets.add(offset)
                }
            }
        },
    })

    const { patternAst, flags } = regexpContext
    const uFlags = toCache({ ...flags, unicode: true })

    // Quantifiers that were decided as a whole and must not be descended into.
    const decidedQuantifiers = new Set<Node>()

    const foundChange = hasSomeDescendant(
        patternAst,
        (n) => {
            // The goal is to find something that will change when the
            // Unicode flag is added.

            if (n.type === "Character" && astralOffsets.has(n.start)) {
                // Surrogates don't change.
                return false
            }

            if (
                n.type === "Assertion" &&
                n.kind === "word" &&
                flags.ignoreCase
            ) {
                // The case canonicalization in Unicode mode is different,
                // which causes `\b` and `\B` to accept/reject a few more
                // characters.
                return true
            }

            if (isChar(n)) {
                return !isCompatibleCharLike(n, flags, uFlags)
            }

            if (n.type === "Quantifier") {
                const verdict = isCompatibleQuantifier(n, flags, uFlags)
                if (verdict !== undefined) {
                    decidedQuantifiers.add(n)
                    return !verdict
                }
            }

            return false
        },
        (n) => {
            // Character classes were already checked as a whole, so there is
            // no need to go into them. The same goes for the quantifiers
            // that were explicitly marked as decided.
            return n.type !== "CharacterClass" && !decidedQuantifiers.has(n)
        },
    )

    return !foundChange
}
/**
 * Checks whether the regex keeps its behaviour if the `u` flag is added.
 *
 * The syntactic check runs first; the semantic check only runs if the
 * pattern could be parsed compatibly in Unicode mode.
 */
function isCompatible(regexpContext: RegExpContext): boolean {
    const uPattern = isSyntacticallyCompatible(regexpContext.patternAst)
    return uPattern !== false && isSemanticallyCompatible(regexpContext, uPattern)
}
/**
 * Rule `require-unicode-regexp`: reports every regex that has neither the `u`
 * nor the `v` flag. A fix that appends `u` is only offered if the regex is
 * known to behave the same with the flag added.
 */
export default createRule("require-unicode-regexp", {
    meta: {
        docs: {
            description: "enforce the use of the `u` flag",
            category: "Best Practices",
            recommended: false,
        },
        schema: [],
        fixable: "code",
        messages: {
            require: "Use the 'u' flag.",
        },
        type: "suggestion", // "problem",
    },
    create(context) {
        return defineRegexpVisitor(context, {
            /**
             * Reports the regex right away if needed. No AST handlers are
             * required, so an empty handler object is always returned.
             */
            createVisitor(
                regexpContext: RegExpContext,
            ): RegExpVisitor.Handlers {
                const {
                    node,
                    flags,
                    flagsString,
                    getFlagsLocation,
                    fixReplaceFlags,
                } = regexpContext

                if (flagsString === null) {
                    // This means that there are flags (probably) but they
                    // could not be evaluated.
                    return {}
                }

                if (flags.unicode || flags.unicodeSets) {
                    // Already in Unicode mode, nothing to report.
                    return {}
                }

                context.report({
                    node,
                    loc: getFlagsLocation(),
                    messageId: "require",
                    // Returning `null` means no replacement flags are
                    // supplied for a regex that is not known to be compatible.
                    fix: fixReplaceFlags(() =>
                        isCompatible(regexpContext) ? `${flagsString}u` : null,
                    ),
                })

                return {}
            },
        })
    },
})