@@ -19,7 +19,7 @@ use crate::unicode;
19
19
//
20
20
// Some of the implementation complexity here is a result of me wanting to
21
21
// preserve the sequential representation without using additional memory.
22
- // In many cases, we do use linear extra memory, but it is at most 2x and it
22
+ // In some cases, we do use linear extra memory, but it is at most 2x and it
23
23
// is amortized. If we relaxed the memory requirements, this implementation
24
24
// could become much simpler. The extra memory is honestly probably OK, but
25
25
// character classes (especially of the Unicode variety) can become quite
@@ -81,14 +81,45 @@ impl<I: Interval> IntervalSet<I> {
81
81
82
82
/// Add a new interval to this set.
83
83
pub fn push ( & mut self , interval : I ) {
84
- // TODO: This could be faster. e.g., Push the interval such that
85
- // it preserves canonicalization.
86
- self . ranges . push ( interval) ;
87
- self . canonicalize ( ) ;
88
84
// We don't know whether the new interval added here is considered
89
85
// case folded, so we conservatively assume that the entire set is
90
86
// no longer case folded if it was previously.
91
87
self . folded = false ;
88
+
89
+ if self . ranges . is_empty ( ) {
90
+ self . ranges . push ( interval) ;
91
+ return ;
92
+ }
93
+
94
+ // Find the first range that is not greater than the new interval.
95
+ // This is the first range that could possibly be unioned with the
96
+ // new interval.
97
+ let mut drain_end = self . ranges . len ( ) ;
98
+ while drain_end > 0
99
+ && self . ranges [ drain_end - 1 ] . lower ( ) > interval. upper ( )
100
+ && !self . ranges [ drain_end - 1 ] . is_contiguous ( & interval)
101
+ {
102
+ drain_end -= 1 ;
103
+ }
104
+
105
+ // Try to union the new interval with old intervals backwards.
106
+ if drain_end > 0 && self . ranges [ drain_end - 1 ] . is_contiguous ( & interval)
107
+ {
108
+ self . ranges [ drain_end - 1 ] =
109
+ self . ranges [ drain_end - 1 ] . union ( & interval) . unwrap ( ) ;
110
+ for i in ( 0 ..drain_end - 1 ) . rev ( ) {
111
+ if let Some ( union) =
112
+ self . ranges [ drain_end - 1 ] . union ( & self . ranges [ i] )
113
+ {
114
+ self . ranges [ drain_end - 1 ] = union;
115
+ } else {
116
+ self . ranges . drain ( i + 1 ..drain_end - 1 ) ;
117
+ break ;
118
+ }
119
+ }
120
+ } else {
121
+ self . ranges . insert ( drain_end, interval) ;
122
+ }
92
123
}
93
124
94
125
/// Return an iterator over all intervals in this set.
@@ -192,34 +223,13 @@ impl<I: Interval> IntervalSet<I> {
192
223
// Folks seem to suggest interval or segment trees, but I'd like to
193
224
// avoid the overhead (both runtime and conceptual) of that.
194
225
//
195
- // The following is basically my Shitty First Draft. Therefore, in
196
- // order to grok it, you probably need to read each line carefully.
197
- // Simplifications are most welcome!
198
- //
199
226
// Remember, we can assume the canonical format invariant here, which
200
227
// says that all ranges are sorted, not overlapping and not adjacent in
201
228
// each class.
202
229
let drain_end = self . ranges . len ( ) ;
203
- let ( mut a, mut b) = ( 0 , 0 ) ;
204
- ' LOOP : while a < drain_end && b < other. ranges . len ( ) {
205
- // Basically, the easy cases are when neither range overlaps with
206
- // each other. If the `b` range is less than our current `a`
207
- // range, then we can skip it and move on.
208
- if other. ranges [ b] . upper ( ) < self . ranges [ a] . lower ( ) {
209
- b += 1 ;
210
- continue ;
211
- }
212
- // ... similarly for the `a` range. If it's less than the smallest
213
- // `b` range, then we can add it as-is.
214
- if self . ranges [ a] . upper ( ) < other. ranges [ b] . lower ( ) {
215
- let range = self . ranges [ a] ;
216
- self . ranges . push ( range) ;
217
- a += 1 ;
218
- continue ;
219
- }
220
- // Otherwise, we have overlapping ranges.
221
- assert ! ( !self . ranges[ a] . is_intersection_empty( & other. ranges[ b] ) ) ;
222
230
231
+ let mut b = 0 ;
232
+ for a in 0 ..drain_end {
223
233
// This part is tricky and was non-obvious to me without looking
224
234
// at explicit examples (see the tests). The trickiness stems from
225
235
// two things: 1) subtracting a range from another range could
@@ -231,47 +241,34 @@ impl<I: Interval> IntervalSet<I> {
231
241
// For example, if our `a` range is `a-t` and our next three `b`
232
242
// ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
233
243
// subtraction three times before moving on to the next `a` range.
234
- let mut range = self . ranges [ a] ;
244
+ self . ranges . push ( self . ranges [ a] ) ;
245
+ // Only when `b` is not above `a`, `b` might apply to current
246
+ // `a` range.
235
247
while b < other. ranges . len ( )
236
- && !range . is_intersection_empty ( & other. ranges [ b] )
248
+ && other. ranges [ b] . lower ( ) <= self . ranges [ a ] . upper ( )
237
249
{
238
- let old_range = range;
239
- range = match range. difference ( & other. ranges [ b] ) {
240
- ( None , None ) => {
241
- // We lost the entire range, so move on to the next
242
- // without adding this one.
243
- a += 1 ;
244
- continue ' LOOP ;
250
+ match self . ranges . pop ( ) . unwrap ( ) . difference ( & other. ranges [ b] ) {
251
+ ( Some ( range1) , None ) | ( None , Some ( range1) ) => {
252
+ self . ranges . push ( range1) ;
245
253
}
246
- ( Some ( range1) , None ) | ( None , Some ( range1) ) => range1,
247
254
( Some ( range1) , Some ( range2) ) => {
248
255
self . ranges . push ( range1) ;
249
- range2
256
+ self . ranges . push ( range2) ;
250
257
}
251
- } ;
252
- // It's possible that the `b` range has more to contribute
253
- // here. In particular, if it is greater than the original
254
- // range, then it might impact the next `a` range *and* it
255
- // has impacted the current `a` range as much as possible,
256
- // so we can quit. We don't bump `b` so that the next `a`
257
- // range can apply it.
258
- if other. ranges [ b] . upper ( ) > old_range. upper ( ) {
259
- break ;
258
+ ( None , None ) => { }
260
259
}
261
- // Otherwise, the next `b` range might apply to the current
260
+ // The next `b` range might apply to the current
262
261
// `a` range.
263
262
b += 1 ;
264
263
}
265
- self . ranges . push ( range) ;
266
- a += 1 ;
267
- }
268
- while a < drain_end {
269
- let range = self . ranges [ a] ;
270
- self . ranges . push ( range) ;
271
- a += 1 ;
264
+ // It's possible that the last `b` range has more to
265
+ // contribute to the next `a`. We don't bump the last
266
+ // `b` so that the next `a` range can apply it.
267
+ b = b. saturating_sub ( 1 ) ;
272
268
}
269
+
273
270
self . ranges . drain ( ..drain_end) ;
274
- self . folded = self . folded && other. folded ;
271
+ self . folded = self . ranges . is_empty ( ) || ( self . folded && other. folded ) ;
275
272
}
276
273
277
274
/// Compute the symmetric difference of the two sets, in place.
@@ -282,11 +279,83 @@ impl<I: Interval> IntervalSet<I> {
282
279
/// set. That is, the set will contain all elements in either set,
283
280
/// but will not contain any elements that are in both sets.
284
281
pub fn symmetric_difference ( & mut self , other : & IntervalSet < I > ) {
285
- // TODO(burntsushi): Fix this so that it amortizes allocation.
286
- let mut intersection = self . clone ( ) ;
287
- intersection. intersect ( other) ;
288
- self . union ( other) ;
289
- self . difference ( & intersection) ;
282
+ if self . ranges . is_empty ( ) {
283
+ self . ranges . extend ( & other. ranges ) ;
284
+ self . folded = other. folded ;
285
+ return ;
286
+ }
287
+ if other. ranges . is_empty ( ) {
288
+ return ;
289
+ }
290
+
291
+ // There should be a way to do this in-place with constant memory,
292
+ // but I couldn't figure out a simple way to do it. So just append
293
+ // the symmetric difference to the end of this range, and then drain
294
+ // it before we're done.
295
+ let drain_end = self . ranges . len ( ) ;
296
+ let mut b = 0 ;
297
+ let mut b_range = Some ( other. ranges [ b] ) ;
298
+ for a in 0 ..drain_end {
299
+ self . ranges . push ( self . ranges [ a] ) ;
300
+ while b_range
301
+ . map_or ( false , |r| r. lower ( ) <= self . ranges [ a] . upper ( ) )
302
+ {
303
+ let ( range1, range2) = match self
304
+ . ranges
305
+ . pop ( )
306
+ . unwrap ( )
307
+ . symmetric_difference ( & b_range. as_ref ( ) . unwrap ( ) )
308
+ {
309
+ ( Some ( range1) , None ) | ( None , Some ( range1) ) => {
310
+ ( Some ( range1) , None )
311
+ }
312
+ ( Some ( range1) , Some ( range2) ) => {
313
+ ( Some ( range1) , Some ( range2) )
314
+ }
315
+ ( None , None ) => ( None , None ) ,
316
+ } ;
317
+ if let Some ( range) = range1 {
318
+ if self . ranges . len ( ) > drain_end
319
+ && self . ranges . last ( ) . unwrap ( ) . is_contiguous ( & range)
320
+ {
321
+ self . ranges
322
+ . last_mut ( )
323
+ . map ( |last| * last = last. union ( & range) . unwrap ( ) ) ;
324
+ } else {
325
+ self . ranges . push ( range) ;
326
+ }
327
+ }
328
+ if let Some ( range) = range2 {
329
+ self . ranges . push ( range) ;
330
+ }
331
+
332
+ b_range = if self . ranges . len ( ) > drain_end
333
+ && self . ranges . last ( ) . unwrap ( ) . upper ( )
334
+ > self . ranges [ a] . upper ( )
335
+ {
336
+ Some ( * self . ranges . last ( ) . unwrap ( ) )
337
+ } else {
338
+ b += 1 ;
339
+ other. ranges . get ( b) . cloned ( )
340
+ } ;
341
+ }
342
+ }
343
+ while let Some ( range) = b_range {
344
+ if self . ranges . len ( ) > drain_end
345
+ && self . ranges . last ( ) . unwrap ( ) . is_contiguous ( & range)
346
+ {
347
+ self . ranges
348
+ . last_mut ( )
349
+ . map ( |last| * last = last. union ( & range) . unwrap ( ) ) ;
350
+ } else {
351
+ self . ranges . push ( range) ;
352
+ }
353
+ b += 1 ;
354
+ b_range = other. ranges . get ( b) . cloned ( ) ;
355
+ }
356
+
357
+ self . ranges . drain ( ..drain_end) ;
358
+ self . folded = self . ranges . is_empty ( ) || ( self . folded && other. folded ) ;
290
359
}
291
360
292
361
/// Negate this interval set.
@@ -302,28 +371,44 @@ impl<I: Interval> IntervalSet<I> {
302
371
return ;
303
372
}
304
373
305
- // There should be a way to do this in-place with constant memory,
306
- // but I couldn't figure out a simple way to do it. So just append
307
- // the negation to the end of this range, and then drain it before
308
- // we're done.
309
- let drain_end = self . ranges . len ( ) ;
310
-
311
374
// We do checked arithmetic below because of the canonical ordering
312
375
// invariant.
313
376
if self . ranges [ 0 ] . lower ( ) > I :: Bound :: min_value ( ) {
314
- let upper = self . ranges [ 0 ] . lower ( ) . decrement ( ) ;
315
- self . ranges . push ( I :: create ( I :: Bound :: min_value ( ) , upper) ) ;
316
- }
317
- for i in 1 ..drain_end {
318
- let lower = self . ranges [ i - 1 ] . upper ( ) . increment ( ) ;
319
- let upper = self . ranges [ i] . lower ( ) . decrement ( ) ;
320
- self . ranges . push ( I :: create ( lower, upper) ) ;
321
- }
322
- if self . ranges [ drain_end - 1 ] . upper ( ) < I :: Bound :: max_value ( ) {
323
- let lower = self . ranges [ drain_end - 1 ] . upper ( ) . increment ( ) ;
324
- self . ranges . push ( I :: create ( lower, I :: Bound :: max_value ( ) ) ) ;
377
+ let mut pre_upper = self . ranges [ 0 ] . upper ( ) ;
378
+ self . ranges [ 0 ] = I :: create (
379
+ I :: Bound :: min_value ( ) ,
380
+ self . ranges [ 0 ] . lower ( ) . decrement ( ) ,
381
+ ) ;
382
+ for i in 1 ..self . ranges . len ( ) {
383
+ let lower = pre_upper. increment ( ) ;
384
+ pre_upper = self . ranges [ i] . upper ( ) ;
385
+ self . ranges [ i] =
386
+ I :: create ( lower, self . ranges [ i] . lower ( ) . decrement ( ) ) ;
387
+ }
388
+ if pre_upper < I :: Bound :: max_value ( ) {
389
+ self . ranges . push ( I :: create (
390
+ pre_upper. increment ( ) ,
391
+ I :: Bound :: max_value ( ) ,
392
+ ) ) ;
393
+ }
394
+ } else {
395
+ for i in 1 ..self . ranges . len ( ) {
396
+ self . ranges [ i - 1 ] = I :: create (
397
+ self . ranges [ i - 1 ] . upper ( ) . increment ( ) ,
398
+ self . ranges [ i] . lower ( ) . decrement ( ) ,
399
+ ) ;
400
+ }
401
+ if self . ranges . last ( ) . unwrap ( ) . upper ( ) < I :: Bound :: max_value ( ) {
402
+ self . ranges . last_mut ( ) . map ( |range| {
403
+ * range = I :: create (
404
+ range. upper ( ) . increment ( ) ,
405
+ I :: Bound :: max_value ( ) ,
406
+ )
407
+ } ) ;
408
+ } else {
409
+ self . ranges . pop ( ) ;
410
+ }
325
411
}
326
- self . ranges . drain ( ..drain_end) ;
327
412
// We don't need to update whether this set is folded or not, because
328
413
// it is conservatively preserved through negation. Namely, if a set
329
414
// is not folded, then it is possible that its negation is folded, for
@@ -337,6 +422,7 @@ impl<I: Interval> IntervalSet<I> {
337
422
// of case folded characters. Negating it in turn means that all
338
423
// equivalence classes in the set are negated, and any equivalence
339
424
// class that was previously not in the set is now entirely in the set.
425
+ self . folded = self . ranges . is_empty ( ) || self . folded ;
340
426
}
341
427
342
428
/// Converts this set into a canonical ordering.
@@ -347,24 +433,20 @@ impl<I: Interval> IntervalSet<I> {
347
433
self . ranges . sort ( ) ;
348
434
assert ! ( !self . ranges. is_empty( ) ) ;
349
435
350
- // Is there a way to do this in-place with constant memory? I couldn't
351
- // figure out a way to do it. So just append the canonicalization to
352
- // the end of this range, and then drain it before we're done.
353
- let drain_end = self . ranges . len ( ) ;
354
- for oldi in 0 ..drain_end {
355
- // If we've added at least one new range, then check if we can
356
- // merge this range in the previously added range.
357
- if self . ranges . len ( ) > drain_end {
358
- let ( last, rest) = self . ranges . split_last_mut ( ) . unwrap ( ) ;
359
- if let Some ( union) = last. union ( & rest[ oldi] ) {
360
- * last = union;
361
- continue ;
362
- }
436
+ // We maintain the canonicalization results in-place at `0..newi`.
437
+ // `newi` will keep track of the end of the canonicalized ranges.
438
+ let mut newi = 0 ;
439
+ for oldi in 1 ..self . ranges . len ( ) {
440
+ // The last new range gets merged with currnet old range when
441
+ // unionable. If not, we update `newi` and store it as a new range.
442
+ if let Some ( union) = self . ranges [ newi] . union ( & self . ranges [ oldi] ) {
443
+ self . ranges [ newi] = union;
444
+ } else {
445
+ newi += 1 ;
446
+ self . ranges [ newi] = self . ranges [ oldi] ;
363
447
}
364
- let range = self . ranges [ oldi] ;
365
- self . ranges . push ( range) ;
366
448
}
367
- self . ranges . drain ( ..drain_end ) ;
449
+ self . ranges . truncate ( newi + 1 ) ;
368
450
}
369
451
370
452
/// Returns true if and only if this class is in a canonical ordering.
@@ -486,7 +568,13 @@ pub trait Interval:
486
568
other : & Self ,
487
569
) -> ( Option < Self > , Option < Self > ) {
488
570
let union = match self . union ( other) {
489
- None => return ( Some ( self . clone ( ) ) , Some ( other. clone ( ) ) ) ,
571
+ None => {
572
+ return if self . upper ( ) < other. lower ( ) {
573
+ ( Some ( self . clone ( ) ) , Some ( other. clone ( ) ) )
574
+ } else {
575
+ ( Some ( other. clone ( ) ) , Some ( self . clone ( ) ) )
576
+ }
577
+ }
490
578
Some ( union) => union,
491
579
} ;
492
580
let intersection = match self . intersect ( other) {
0 commit comments