regex_syntax/hir/literal/
mod.rs

1/*!
2Provides routines for extracting literal prefixes and suffixes from an `Hir`.
3*/
4
5use std::cmp;
6use std::fmt;
7use std::iter;
8use std::mem;
9use std::ops;
10
11use crate::hir::{self, Hir, HirKind};
12
13/// A set of literal byte strings extracted from a regular expression.
14///
15/// Every member of the set is a `Literal`, which is represented by a
16/// `Vec<u8>`. (Notably, it may contain invalid UTF-8.) Every member is
17/// said to be either *complete* or *cut*. A complete literal means that
18/// it extends until the beginning (or end) of the regular expression. In
19/// some circumstances, this can be used to indicate a match in the regular
20/// expression.
21///
22/// A key aspect of literal extraction is knowing when to stop. It is not
23/// feasible to blindly extract all literals from a regular expression, even if
24/// there are finitely many. For example, the regular expression `[0-9]{10}`
25/// has `10^10` distinct literals. For this reason, literal extraction is
26/// bounded to some low number by default using heuristics, but the limits can
27/// be tweaked.
28///
29/// **WARNING**: Literal extraction uses stack space proportional to the size
30/// of the `Hir` expression. At some point, this drawback will be eliminated.
31/// To protect yourself, set a reasonable
32/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit).
33/// This is done for you by default.
34#[derive(Clone, Eq, PartialEq)]
35pub struct Literals {
36    lits: Vec<Literal>,
37    limit_size: usize,
38    limit_class: usize,
39}
40
41/// A single member of a set of literals extracted from a regular expression.
42///
43/// This type has `Deref` and `DerefMut` impls to `Vec<u8>` so that all slice
44/// and `Vec` operations are available.
45#[derive(Clone, Eq, Ord)]
46pub struct Literal {
47    v: Vec<u8>,
48    cut: bool,
49}
50
51impl Literals {
52    /// Returns a new empty set of literals using default limits.
53    pub fn empty() -> Literals {
54        Literals { lits: vec![], limit_size: 250, limit_class: 10 }
55    }
56
57    /// Returns a set of literal prefixes extracted from the given `Hir`.
58    pub fn prefixes(expr: &Hir) -> Literals {
59        let mut lits = Literals::empty();
60        lits.union_prefixes(expr);
61        lits
62    }
63
64    /// Returns a set of literal suffixes extracted from the given `Hir`.
65    pub fn suffixes(expr: &Hir) -> Literals {
66        let mut lits = Literals::empty();
67        lits.union_suffixes(expr);
68        lits
69    }
70
71    /// Get the approximate size limit (in bytes) of this set.
72    pub fn limit_size(&self) -> usize {
73        self.limit_size
74    }
75
76    /// Set the approximate size limit (in bytes) of this set.
77    ///
78    /// If extracting a literal would put the set over this limit, then
79    /// extraction stops.
80    ///
81    /// The new limits will only apply to additions to this set. Existing
82    /// members remain unchanged, even if the set exceeds the new limit.
83    pub fn set_limit_size(&mut self, size: usize) -> &mut Literals {
84        self.limit_size = size;
85        self
86    }
87
88    /// Get the character class size limit for this set.
89    pub fn limit_class(&self) -> usize {
90        self.limit_class
91    }
92
93    /// Limits the size of character(or byte) classes considered.
94    ///
95    /// A value of `0` prevents all character classes from being considered.
96    ///
97    /// This limit also applies to case insensitive literals, since each
98    /// character in the case insensitive literal is converted to a class, and
99    /// then case folded.
100    ///
101    /// The new limits will only apply to additions to this set. Existing
102    /// members remain unchanged, even if the set exceeds the new limit.
103    pub fn set_limit_class(&mut self, size: usize) -> &mut Literals {
104        self.limit_class = size;
105        self
106    }
107
108    /// Returns the set of literals as a slice. Its order is unspecified.
109    pub fn literals(&self) -> &[Literal] {
110        &self.lits
111    }
112
113    /// Returns the length of the smallest literal.
114    ///
115    /// Returns None is there are no literals in the set.
116    pub fn min_len(&self) -> Option<usize> {
117        let mut min = None;
118        for lit in &self.lits {
119            match min {
120                None => min = Some(lit.len()),
121                Some(m) if lit.len() < m => min = Some(lit.len()),
122                _ => {}
123            }
124        }
125        min
126    }
127
128    /// Returns true if all members in this set are complete.
129    pub fn all_complete(&self) -> bool {
130        !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut())
131    }
132
133    /// Returns true if any member in this set is complete.
134    pub fn any_complete(&self) -> bool {
135        self.lits.iter().any(|lit| !lit.is_cut())
136    }
137
138    /// Returns true if this set contains an empty literal.
139    pub fn contains_empty(&self) -> bool {
140        self.lits.iter().any(|lit| lit.is_empty())
141    }
142
143    /// Returns true if this set is empty or if all of its members is empty.
144    pub fn is_empty(&self) -> bool {
145        self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty())
146    }
147
148    /// Returns a new empty set of literals using this set's limits.
149    pub fn to_empty(&self) -> Literals {
150        let mut lits = Literals::empty();
151        lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class);
152        lits
153    }
154
155    /// Returns the longest common prefix of all members in this set.
156    pub fn longest_common_prefix(&self) -> &[u8] {
157        if self.is_empty() {
158            return &[];
159        }
160        let lit0 = &*self.lits[0];
161        let mut len = lit0.len();
162        for lit in &self.lits[1..] {
163            len = cmp::min(
164                len,
165                lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(),
166            );
167        }
168        &self.lits[0][..len]
169    }
170
171    /// Returns the longest common suffix of all members in this set.
172    pub fn longest_common_suffix(&self) -> &[u8] {
173        if self.is_empty() {
174            return &[];
175        }
176        let lit0 = &*self.lits[0];
177        let mut len = lit0.len();
178        for lit in &self.lits[1..] {
179            len = cmp::min(
180                len,
181                lit.iter()
182                    .rev()
183                    .zip(lit0.iter().rev())
184                    .take_while(|&(a, b)| a == b)
185                    .count(),
186            );
187        }
188        &self.lits[0][self.lits[0].len() - len..]
189    }
190
191    /// Returns a new set of literals with the given number of bytes trimmed
192    /// from the suffix of each literal.
193    ///
194    /// If any literal would be cut out completely by trimming, then None is
195    /// returned.
196    ///
197    /// Any duplicates that are created as a result of this transformation are
198    /// removed.
199    pub fn trim_suffix(&self, num_bytes: usize) -> Option<Literals> {
200        if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) {
201            return None;
202        }
203        let mut new = self.to_empty();
204        for mut lit in self.lits.iter().cloned() {
205            let new_len = lit.len() - num_bytes;
206            lit.truncate(new_len);
207            lit.cut();
208            new.lits.push(lit);
209        }
210        new.lits.sort();
211        new.lits.dedup();
212        Some(new)
213    }
214
215    /// Returns a new set of prefixes of this set of literals that are
216    /// guaranteed to be unambiguous.
217    ///
218    /// Any substring match with a member of the set is returned is guaranteed
219    /// to never overlap with a substring match of another member of the set
220    /// at the same starting position.
221    ///
222    /// Given any two members of the returned set, neither is a substring of
223    /// the other.
224    pub fn unambiguous_prefixes(&self) -> Literals {
225        if self.lits.is_empty() {
226            return self.to_empty();
227        }
228        let mut old = self.lits.to_vec();
229        let mut new = self.to_empty();
230        'OUTER: while let Some(mut candidate) = old.pop() {
231            if candidate.is_empty() {
232                continue;
233            }
234            if new.lits.is_empty() {
235                new.lits.push(candidate);
236                continue;
237            }
238            for lit2 in &mut new.lits {
239                if lit2.is_empty() {
240                    continue;
241                }
242                if &candidate == lit2 {
243                    // If the literal is already in the set, then we can
244                    // just drop it. But make sure that cut literals are
245                    // infectious!
246                    candidate.cut = candidate.cut || lit2.cut;
247                    lit2.cut = candidate.cut;
248                    continue 'OUTER;
249                }
250                if candidate.len() < lit2.len() {
251                    if let Some(i) = position(&candidate, &lit2) {
252                        candidate.cut();
253                        let mut lit3 = lit2.clone();
254                        lit3.truncate(i);
255                        lit3.cut();
256                        old.push(lit3);
257                        lit2.clear();
258                    }
259                } else if let Some(i) = position(&lit2, &candidate) {
260                    lit2.cut();
261                    let mut new_candidate = candidate.clone();
262                    new_candidate.truncate(i);
263                    new_candidate.cut();
264                    old.push(new_candidate);
265                    candidate.clear();
266                }
267                // Oops, the candidate is already represented in the set.
268                if candidate.is_empty() {
269                    continue 'OUTER;
270                }
271            }
272            new.lits.push(candidate);
273        }
274        new.lits.retain(|lit| !lit.is_empty());
275        new.lits.sort();
276        new.lits.dedup();
277        new
278    }
279
280    /// Returns a new set of suffixes of this set of literals that are
281    /// guaranteed to be unambiguous.
282    ///
283    /// Any substring match with a member of the set is returned is guaranteed
284    /// to never overlap with a substring match of another member of the set
285    /// at the same ending position.
286    ///
287    /// Given any two members of the returned set, neither is a substring of
288    /// the other.
289    pub fn unambiguous_suffixes(&self) -> Literals {
290        // This is a touch wasteful...
291        let mut lits = self.clone();
292        lits.reverse();
293        let mut unamb = lits.unambiguous_prefixes();
294        unamb.reverse();
295        unamb
296    }
297
298    /// Unions the prefixes from the given expression to this set.
299    ///
300    /// If prefixes could not be added (for example, this set would exceed its
301    /// size limits or the set of prefixes from `expr` includes the empty
302    /// string), then false is returned.
303    ///
304    /// Note that prefix literals extracted from `expr` are said to be complete
305    /// if and only if the literal extends from the beginning of `expr` to the
306    /// end of `expr`.
307    pub fn union_prefixes(&mut self, expr: &Hir) -> bool {
308        let mut lits = self.to_empty();
309        prefixes(expr, &mut lits);
310        !lits.is_empty() && !lits.contains_empty() && self.union(lits)
311    }
312
313    /// Unions the suffixes from the given expression to this set.
314    ///
315    /// If suffixes could not be added (for example, this set would exceed its
316    /// size limits or the set of suffixes from `expr` includes the empty
317    /// string), then false is returned.
318    ///
319    /// Note that prefix literals extracted from `expr` are said to be complete
320    /// if and only if the literal extends from the end of `expr` to the
321    /// beginning of `expr`.
322    pub fn union_suffixes(&mut self, expr: &Hir) -> bool {
323        let mut lits = self.to_empty();
324        suffixes(expr, &mut lits);
325        lits.reverse();
326        !lits.is_empty() && !lits.contains_empty() && self.union(lits)
327    }
328
329    /// Unions this set with another set.
330    ///
331    /// If the union would cause the set to exceed its limits, then the union
332    /// is skipped and it returns false. Otherwise, if the union succeeds, it
333    /// returns true.
334    pub fn union(&mut self, lits: Literals) -> bool {
335        if self.num_bytes() + lits.num_bytes() > self.limit_size {
336            return false;
337        }
338        if lits.is_empty() {
339            self.lits.push(Literal::empty());
340        } else {
341            self.lits.extend(lits.lits);
342        }
343        true
344    }
345
346    /// Extends this set with another set.
347    ///
348    /// The set of literals is extended via a cross product.
349    ///
350    /// If a cross product would cause this set to exceed its limits, then the
351    /// cross product is skipped and it returns false. Otherwise, if the cross
352    /// product succeeds, it returns true.
353    pub fn cross_product(&mut self, lits: &Literals) -> bool {
354        if lits.is_empty() {
355            return true;
356        }
357        // Check that we make sure we stay in our limits.
358        let mut size_after;
359        if self.is_empty() || !self.any_complete() {
360            size_after = self.num_bytes();
361            for lits_lit in lits.literals() {
362                size_after += lits_lit.len();
363            }
364        } else {
365            size_after = self.lits.iter().fold(0, |accum, lit| {
366                accum + if lit.is_cut() { lit.len() } else { 0 }
367            });
368            for lits_lit in lits.literals() {
369                for self_lit in self.literals() {
370                    if !self_lit.is_cut() {
371                        size_after += self_lit.len() + lits_lit.len();
372                    }
373                }
374            }
375        }
376        if size_after > self.limit_size {
377            return false;
378        }
379
380        let mut base = self.remove_complete();
381        if base.is_empty() {
382            base = vec![Literal::empty()];
383        }
384        for lits_lit in lits.literals() {
385            for mut self_lit in base.clone() {
386                self_lit.extend(&**lits_lit);
387                self_lit.cut = lits_lit.cut;
388                self.lits.push(self_lit);
389            }
390        }
391        true
392    }
393
394    /// Extends each literal in this set with the bytes given.
395    ///
396    /// If the set is empty, then the given literal is added to the set.
397    ///
398    /// If adding any number of bytes to all members of this set causes a limit
399    /// to be exceeded, then no bytes are added and false is returned. If a
400    /// prefix of `bytes` can be fit into this set, then it is used and all
401    /// resulting literals are cut.
402    pub fn cross_add(&mut self, bytes: &[u8]) -> bool {
403        // N.B. This could be implemented by simply calling cross_product with
404        // a literal set containing just `bytes`, but we can be smarter about
405        // taking shorter prefixes of `bytes` if they'll fit.
406        if bytes.is_empty() {
407            return true;
408        }
409        if self.lits.is_empty() {
410            let i = cmp::min(self.limit_size, bytes.len());
411            self.lits.push(Literal::new(bytes[..i].to_owned()));
412            self.lits[0].cut = i < bytes.len();
413            return !self.lits[0].is_cut();
414        }
415        let size = self.num_bytes();
416        if size + self.lits.len() >= self.limit_size {
417            return false;
418        }
419        let mut i = 1;
420        while size + (i * self.lits.len()) <= self.limit_size
421            && i < bytes.len()
422        {
423            i += 1;
424        }
425        for lit in &mut self.lits {
426            if !lit.is_cut() {
427                lit.extend(&bytes[..i]);
428                if i < bytes.len() {
429                    lit.cut();
430                }
431            }
432        }
433        true
434    }
435
436    /// Adds the given literal to this set.
437    ///
438    /// Returns false if adding this literal would cause the class to be too
439    /// big.
440    pub fn add(&mut self, lit: Literal) -> bool {
441        if self.num_bytes() + lit.len() > self.limit_size {
442            return false;
443        }
444        self.lits.push(lit);
445        true
446    }
447
448    /// Extends each literal in this set with the character class given.
449    ///
450    /// Returns false if the character class was too big to add.
451    pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool {
452        self._add_char_class(cls, false)
453    }
454
455    /// Extends each literal in this set with the character class given,
456    /// writing the bytes of each character in reverse.
457    ///
458    /// Returns false if the character class was too big to add.
459    fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool {
460        self._add_char_class(cls, true)
461    }
462
463    fn _add_char_class(
464        &mut self,
465        cls: &hir::ClassUnicode,
466        reverse: bool,
467    ) -> bool {
468        use std::char;
469
470        if self.class_exceeds_limits(cls_char_count(cls)) {
471            return false;
472        }
473        let mut base = self.remove_complete();
474        if base.is_empty() {
475            base = vec![Literal::empty()];
476        }
477        for r in cls.iter() {
478            let (s, e) = (r.start as u32, r.end as u32 + 1);
479            for c in (s..e).filter_map(char::from_u32) {
480                for mut lit in base.clone() {
481                    let mut bytes = c.to_string().into_bytes();
482                    if reverse {
483                        bytes.reverse();
484                    }
485                    lit.extend(&bytes);
486                    self.lits.push(lit);
487                }
488            }
489        }
490        true
491    }
492
493    /// Extends each literal in this set with the byte class given.
494    ///
495    /// Returns false if the byte class was too big to add.
496    pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool {
497        if self.class_exceeds_limits(cls_byte_count(cls)) {
498            return false;
499        }
500        let mut base = self.remove_complete();
501        if base.is_empty() {
502            base = vec![Literal::empty()];
503        }
504        for r in cls.iter() {
505            let (s, e) = (r.start as u32, r.end as u32 + 1);
506            for b in (s..e).map(|b| b as u8) {
507                for mut lit in base.clone() {
508                    lit.push(b);
509                    self.lits.push(lit);
510                }
511            }
512        }
513        true
514    }
515
516    /// Cuts every member of this set. When a member is cut, it can never
517    /// be extended.
518    pub fn cut(&mut self) {
519        for lit in &mut self.lits {
520            lit.cut();
521        }
522    }
523
524    /// Reverses all members in place.
525    pub fn reverse(&mut self) {
526        for lit in &mut self.lits {
527            lit.reverse();
528        }
529    }
530
531    /// Clears this set of all members.
532    pub fn clear(&mut self) {
533        self.lits.clear();
534    }
535
536    /// Pops all complete literals out of this set.
537    fn remove_complete(&mut self) -> Vec<Literal> {
538        let mut base = vec![];
539        for lit in mem::replace(&mut self.lits, vec![]) {
540            if lit.is_cut() {
541                self.lits.push(lit);
542            } else {
543                base.push(lit);
544            }
545        }
546        base
547    }
548
549    /// Returns the total number of bytes in this set.
550    fn num_bytes(&self) -> usize {
551        self.lits.iter().fold(0, |accum, lit| accum + lit.len())
552    }
553
554    /// Returns true if a character class with the given size would cause this
555    /// set to exceed its limits.
556    ///
557    /// The size given should correspond to the number of items in the class.
558    fn class_exceeds_limits(&self, size: usize) -> bool {
559        if size > self.limit_class {
560            return true;
561        }
562        // This is an approximation since codepoints in a char class can encode
563        // to 1-4 bytes.
564        let new_byte_count = if self.lits.is_empty() {
565            size
566        } else {
567            self.lits.iter().fold(0, |accum, lit| {
568                accum
569                    + if lit.is_cut() {
570                        // If the literal is cut, then we'll never add
571                        // anything to it, so don't count it.
572                        0
573                    } else {
574                        (lit.len() + 1) * size
575                    }
576            })
577        };
578        new_byte_count > self.limit_size
579    }
580}
581
582fn prefixes(expr: &Hir, lits: &mut Literals) {
583    match *expr.kind() {
584        HirKind::Literal(hir::Literal::Unicode(c)) => {
585            let mut buf = [0; 4];
586            lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
587        }
588        HirKind::Literal(hir::Literal::Byte(b)) => {
589            lits.cross_add(&[b]);
590        }
591        HirKind::Class(hir::Class::Unicode(ref cls)) => {
592            if !lits.add_char_class(cls) {
593                lits.cut();
594            }
595        }
596        HirKind::Class(hir::Class::Bytes(ref cls)) => {
597            if !lits.add_byte_class(cls) {
598                lits.cut();
599            }
600        }
601        HirKind::Group(hir::Group { ref hir, .. }) => {
602            prefixes(&**hir, lits);
603        }
604        HirKind::Repetition(ref x) => match x.kind {
605            hir::RepetitionKind::ZeroOrOne => {
606                repeat_zero_or_one_literals(&x.hir, lits, prefixes);
607            }
608            hir::RepetitionKind::ZeroOrMore => {
609                repeat_zero_or_more_literals(&x.hir, lits, prefixes);
610            }
611            hir::RepetitionKind::OneOrMore => {
612                repeat_one_or_more_literals(&x.hir, lits, prefixes);
613            }
614            hir::RepetitionKind::Range(ref rng) => {
615                let (min, max) = match *rng {
616                    hir::RepetitionRange::Exactly(m) => (m, Some(m)),
617                    hir::RepetitionRange::AtLeast(m) => (m, None),
618                    hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
619                };
620                repeat_range_literals(
621                    &x.hir, min, max, x.greedy, lits, prefixes,
622                )
623            }
624        },
625        HirKind::Concat(ref es) if es.is_empty() => {}
626        HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits),
627        HirKind::Concat(ref es) => {
628            for e in es {
629                if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() {
630                    if !lits.is_empty() {
631                        lits.cut();
632                        break;
633                    }
634                    lits.add(Literal::empty());
635                    continue;
636                }
637                let mut lits2 = lits.to_empty();
638                prefixes(e, &mut lits2);
639                if !lits.cross_product(&lits2) || !lits2.any_complete() {
640                    // If this expression couldn't yield any literal that
641                    // could be extended, then we need to quit. Since we're
642                    // short-circuiting, we also need to freeze every member.
643                    lits.cut();
644                    break;
645                }
646            }
647        }
648        HirKind::Alternation(ref es) => {
649            alternate_literals(es, lits, prefixes);
650        }
651        _ => lits.cut(),
652    }
653}
654
655fn suffixes(expr: &Hir, lits: &mut Literals) {
656    match *expr.kind() {
657        HirKind::Literal(hir::Literal::Unicode(c)) => {
658            let mut buf = [0u8; 4];
659            let i = c.encode_utf8(&mut buf).len();
660            let buf = &mut buf[..i];
661            buf.reverse();
662            lits.cross_add(buf);
663        }
664        HirKind::Literal(hir::Literal::Byte(b)) => {
665            lits.cross_add(&[b]);
666        }
667        HirKind::Class(hir::Class::Unicode(ref cls)) => {
668            if !lits.add_char_class_reverse(cls) {
669                lits.cut();
670            }
671        }
672        HirKind::Class(hir::Class::Bytes(ref cls)) => {
673            if !lits.add_byte_class(cls) {
674                lits.cut();
675            }
676        }
677        HirKind::Group(hir::Group { ref hir, .. }) => {
678            suffixes(&**hir, lits);
679        }
680        HirKind::Repetition(ref x) => match x.kind {
681            hir::RepetitionKind::ZeroOrOne => {
682                repeat_zero_or_one_literals(&x.hir, lits, suffixes);
683            }
684            hir::RepetitionKind::ZeroOrMore => {
685                repeat_zero_or_more_literals(&x.hir, lits, suffixes);
686            }
687            hir::RepetitionKind::OneOrMore => {
688                repeat_one_or_more_literals(&x.hir, lits, suffixes);
689            }
690            hir::RepetitionKind::Range(ref rng) => {
691                let (min, max) = match *rng {
692                    hir::RepetitionRange::Exactly(m) => (m, Some(m)),
693                    hir::RepetitionRange::AtLeast(m) => (m, None),
694                    hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
695                };
696                repeat_range_literals(
697                    &x.hir, min, max, x.greedy, lits, suffixes,
698                )
699            }
700        },
701        HirKind::Concat(ref es) if es.is_empty() => {}
702        HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits),
703        HirKind::Concat(ref es) => {
704            for e in es.iter().rev() {
705                if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() {
706                    if !lits.is_empty() {
707                        lits.cut();
708                        break;
709                    }
710                    lits.add(Literal::empty());
711                    continue;
712                }
713                let mut lits2 = lits.to_empty();
714                suffixes(e, &mut lits2);
715                if !lits.cross_product(&lits2) || !lits2.any_complete() {
716                    // If this expression couldn't yield any literal that
717                    // could be extended, then we need to quit. Since we're
718                    // short-circuiting, we also need to freeze every member.
719                    lits.cut();
720                    break;
721                }
722            }
723        }
724        HirKind::Alternation(ref es) => {
725            alternate_literals(es, lits, suffixes);
726        }
727        _ => lits.cut(),
728    }
729}
730
731fn repeat_zero_or_one_literals<F: FnMut(&Hir, &mut Literals)>(
732    e: &Hir,
733    lits: &mut Literals,
734    mut f: F,
735) {
736    f(
737        &Hir::repetition(hir::Repetition {
738            kind: hir::RepetitionKind::ZeroOrMore,
739            // FIXME: Our literal extraction doesn't care about greediness.
740            // Which is partially why we're treating 'e?' as 'e*'. Namely,
741            // 'ab??' yields [Complete(ab), Complete(a)], but it should yield
742            // [Complete(a), Complete(ab)] because of the non-greediness.
743            greedy: true,
744            hir: Box::new(e.clone()),
745        }),
746        lits,
747    );
748}
749
750fn repeat_zero_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
751    e: &Hir,
752    lits: &mut Literals,
753    mut f: F,
754) {
755    let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty());
756    lits3.set_limit_size(lits.limit_size() / 2);
757    f(e, &mut lits3);
758
759    if lits3.is_empty() || !lits2.cross_product(&lits3) {
760        lits.cut();
761        return;
762    }
763    lits2.cut();
764    lits2.add(Literal::empty());
765    if !lits.union(lits2) {
766        lits.cut();
767    }
768}
769
770fn repeat_one_or_more_literals<F: FnMut(&Hir, &mut Literals)>(
771    e: &Hir,
772    lits: &mut Literals,
773    mut f: F,
774) {
775    f(e, lits);
776    lits.cut();
777}
778
779fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
780    e: &Hir,
781    min: u32,
782    max: Option<u32>,
783    greedy: bool,
784    lits: &mut Literals,
785    mut f: F,
786) {
787    if min == 0 {
788        // This is a bit conservative. If `max` is set, then we could
789        // treat this as a finite set of alternations. For now, we
790        // just treat it as `e*`.
791        f(
792            &Hir::repetition(hir::Repetition {
793                kind: hir::RepetitionKind::ZeroOrMore,
794                greedy,
795                hir: Box::new(e.clone()),
796            }),
797            lits,
798        );
799    } else {
800        if min > 0 {
801            let n = cmp::min(lits.limit_size, min as usize);
802            let es = iter::repeat(e.clone()).take(n).collect();
803            f(&Hir::concat(es), lits);
804            if n < min as usize || lits.contains_empty() {
805                lits.cut();
806            }
807        }
808        if max.map_or(true, |max| min < max) {
809            lits.cut();
810        }
811    }
812}
813
814fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
815    es: &[Hir],
816    lits: &mut Literals,
817    mut f: F,
818) {
819    let mut lits2 = lits.to_empty();
820    for e in es {
821        let mut lits3 = lits.to_empty();
822        lits3.set_limit_size(lits.limit_size() / 5);
823        f(e, &mut lits3);
824        if lits3.is_empty() || !lits2.union(lits3) {
825            // If we couldn't find suffixes for *any* of the
826            // alternates, then the entire alternation has to be thrown
827            // away and any existing members must be frozen. Similarly,
828            // if the union couldn't complete, stop and freeze.
829            lits.cut();
830            return;
831        }
832    }
833    if !lits.cross_product(&lits2) {
834        lits.cut();
835    }
836}
837
838impl fmt::Debug for Literals {
839    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
840        f.debug_struct("Literals")
841            .field("lits", &self.lits)
842            .field("limit_size", &self.limit_size)
843            .field("limit_class", &self.limit_class)
844            .finish()
845    }
846}
847
848impl Literal {
849    /// Returns a new complete literal with the bytes given.
850    pub fn new(bytes: Vec<u8>) -> Literal {
851        Literal { v: bytes, cut: false }
852    }
853
854    /// Returns a new complete empty literal.
855    pub fn empty() -> Literal {
856        Literal { v: vec![], cut: false }
857    }
858
859    /// Returns true if this literal was "cut."
860    pub fn is_cut(&self) -> bool {
861        self.cut
862    }
863
864    /// Cuts this literal.
865    pub fn cut(&mut self) {
866        self.cut = true;
867    }
868}
869
870impl PartialEq for Literal {
871    fn eq(&self, other: &Literal) -> bool {
872        self.v == other.v
873    }
874}
875
876impl PartialOrd for Literal {
877    fn partial_cmp(&self, other: &Literal) -> Option<cmp::Ordering> {
878        self.v.partial_cmp(&other.v)
879    }
880}
881
882impl fmt::Debug for Literal {
883    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
884        if self.is_cut() {
885            write!(f, "Cut({})", escape_unicode(&self.v))
886        } else {
887            write!(f, "Complete({})", escape_unicode(&self.v))
888        }
889    }
890}
891
892impl AsRef<[u8]> for Literal {
893    fn as_ref(&self) -> &[u8] {
894        &self.v
895    }
896}
897
898impl ops::Deref for Literal {
899    type Target = Vec<u8>;
900    fn deref(&self) -> &Vec<u8> {
901        &self.v
902    }
903}
904
905impl ops::DerefMut for Literal {
906    fn deref_mut(&mut self) -> &mut Vec<u8> {
907        &mut self.v
908    }
909}
910
911fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
912    let mut i = 0;
913    while haystack.len() >= needle.len() {
914        if needle == &haystack[..needle.len()] {
915            return Some(i);
916        }
917        i += 1;
918        haystack = &haystack[1..];
919    }
920    None
921}
922
923fn escape_unicode(bytes: &[u8]) -> String {
924    let show = match ::std::str::from_utf8(bytes) {
925        Ok(v) => v.to_string(),
926        Err(_) => escape_bytes(bytes),
927    };
928    let mut space_escaped = String::new();
929    for c in show.chars() {
930        if c.is_whitespace() {
931            let escaped = if c as u32 <= 0x7F {
932                escape_byte(c as u8)
933            } else if c as u32 <= 0xFFFF {
934                format!(r"\u{{{:04x}}}", c as u32)
935            } else {
936                format!(r"\U{{{:08x}}}", c as u32)
937            };
938            space_escaped.push_str(&escaped);
939        } else {
940            space_escaped.push(c);
941        }
942    }
943    space_escaped
944}
945
946fn escape_bytes(bytes: &[u8]) -> String {
947    let mut s = String::new();
948    for &b in bytes {
949        s.push_str(&escape_byte(b));
950    }
951    s
952}
953
954fn escape_byte(byte: u8) -> String {
955    use std::ascii::escape_default;
956
957    let escaped: Vec<u8> = escape_default(byte).collect();
958    String::from_utf8_lossy(&escaped).into_owned()
959}
960
961fn cls_char_count(cls: &hir::ClassUnicode) -> usize {
962    cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
963        as usize
964}
965
966fn cls_byte_count(cls: &hir::ClassBytes) -> usize {
967    cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::<u32>()
968        as usize
969}
970
971#[cfg(test)]
972mod tests {
973    use std::fmt;
974
975    use super::{escape_bytes, Literal, Literals};
976    use crate::hir::Hir;
977    use crate::ParserBuilder;
978
979    // To make test failures easier to read.
980    #[derive(Debug, Eq, PartialEq)]
981    struct Bytes(Vec<ULiteral>);
982    #[derive(Debug, Eq, PartialEq)]
983    struct Unicode(Vec<ULiteral>);
984
985    fn escape_lits(blits: &[Literal]) -> Vec<ULiteral> {
986        let mut ulits = vec![];
987        for blit in blits {
988            ulits
989                .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() });
990        }
991        ulits
992    }
993
994    fn create_lits<I: IntoIterator<Item = Literal>>(it: I) -> Literals {
995        Literals {
996            lits: it.into_iter().collect(),
997            limit_size: 0,
998            limit_class: 0,
999        }
1000    }
1001
1002    // Needs to be pub for 1.3?
1003    #[derive(Clone, Eq, PartialEq)]
1004    pub struct ULiteral {
1005        v: String,
1006        cut: bool,
1007    }
1008
1009    impl ULiteral {
1010        fn is_cut(&self) -> bool {
1011            self.cut
1012        }
1013    }
1014
1015    impl fmt::Debug for ULiteral {
1016        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1017            if self.is_cut() {
1018                write!(f, "Cut({})", self.v)
1019            } else {
1020                write!(f, "Complete({})", self.v)
1021            }
1022        }
1023    }
1024
1025    impl PartialEq<Literal> for ULiteral {
1026        fn eq(&self, other: &Literal) -> bool {
1027            self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut()
1028        }
1029    }
1030
1031    impl PartialEq<ULiteral> for Literal {
1032        fn eq(&self, other: &ULiteral) -> bool {
1033            &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut()
1034        }
1035    }
1036
1037    #[allow(non_snake_case)]
1038    fn C(s: &'static str) -> ULiteral {
1039        ULiteral { v: s.to_owned(), cut: true }
1040    }
1041    #[allow(non_snake_case)]
1042    fn M(s: &'static str) -> ULiteral {
1043        ULiteral { v: s.to_owned(), cut: false }
1044    }
1045
1046    fn prefixes(lits: &mut Literals, expr: &Hir) {
1047        lits.union_prefixes(expr);
1048    }
1049
1050    fn suffixes(lits: &mut Literals, expr: &Hir) {
1051        lits.union_suffixes(expr);
1052    }
1053
1054    macro_rules! assert_lit_eq {
1055        ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{
1056            let expected: Vec<ULiteral> = vec![$($expected_lit),*];
1057            let lits = $got_lits;
1058            assert_eq!(
1059                $which(expected.clone()),
1060                $which(escape_lits(lits.literals())));
1061            assert_eq!(
1062                !expected.is_empty() && expected.iter().all(|l| !l.is_cut()),
1063                lits.all_complete());
1064            assert_eq!(
1065                expected.iter().any(|l| !l.is_cut()),
1066                lits.any_complete());
1067        }};
1068    }
1069
1070    macro_rules! test_lit {
1071        ($name:ident, $which:ident, $re:expr) => {
1072            test_lit!($name, $which, $re,);
1073        };
1074        ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1075            #[test]
1076            fn $name() {
1077                let expr = ParserBuilder::new()
1078                    .build()
1079                    .parse($re)
1080                    .unwrap();
1081                let lits = Literals::$which(&expr);
1082                assert_lit_eq!(Unicode, lits, $($lit),*);
1083
1084                let expr = ParserBuilder::new()
1085                    .allow_invalid_utf8(true)
1086                    .unicode(false)
1087                    .build()
1088                    .parse($re)
1089                    .unwrap();
1090                let lits = Literals::$which(&expr);
1091                assert_lit_eq!(Bytes, lits, $($lit),*);
1092            }
1093        };
1094    }
1095
1096    // ************************************************************************
1097    // Tests for prefix literal extraction.
1098    // ************************************************************************
1099
1100    // Elementary tests.
1101    test_lit!(pfx_one_lit1, prefixes, "a", M("a"));
1102    test_lit!(pfx_one_lit2, prefixes, "abc", M("abc"));
1103    test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1104    #[cfg(feature = "unicode-case")]
1105    test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1106    test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1107    test_lit!(
1108        pfx_class2,
1109        prefixes,
1110        "(?u)[☃Ⅰ]",
1111        M("\\xe2\\x85\\xa0"),
1112        M("\\xe2\\x98\\x83")
1113    );
1114    #[cfg(feature = "unicode-case")]
1115    test_lit!(
1116        pfx_class3,
1117        prefixes,
1118        "(?ui)[☃Ⅰ]",
1119        M("\\xe2\\x85\\xa0"),
1120        M("\\xe2\\x85\\xb0"),
1121        M("\\xe2\\x98\\x83")
1122    );
1123    test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a"));
1124    test_lit!(
1125        pfx_one_lit_casei2,
1126        prefixes,
1127        "(?i-u)abc",
1128        M("ABC"),
1129        M("aBC"),
1130        M("AbC"),
1131        M("abC"),
1132        M("ABc"),
1133        M("aBc"),
1134        M("Abc"),
1135        M("abc")
1136    );
1137    test_lit!(pfx_group1, prefixes, "(a)", M("a"));
1138    test_lit!(pfx_rep_zero_or_one1, prefixes, "a?");
1139    test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?");
1140    test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a"));
1141    // FIXME: This should return [M("a"), M("ab")] because of the non-greedy
1142    // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get
1143    // a cut literal.
1144    test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a"));
1145    test_lit!(pfx_rep_zero_or_more1, prefixes, "a*");
1146    test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*");
1147    test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a"));
1148    test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc"));
1149    test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a"));
1150    test_lit!(pfx_rep_range1, prefixes, "a{0}");
1151    test_lit!(pfx_rep_range2, prefixes, "a{0,}");
1152    test_lit!(pfx_rep_range3, prefixes, "a{0,1}");
1153    test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a"));
1154    test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa"));
1155    test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a"));
1156    test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa"));
1157
1158    // Test regexes with concatenations.
1159    test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab"));
1160    test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz"));
1161    test_lit!(
1162        pfx_cat3,
1163        prefixes,
1164        "(?i-u)[ab]z",
1165        M("AZ"),
1166        M("BZ"),
1167        M("aZ"),
1168        M("bZ"),
1169        M("Az"),
1170        M("Bz"),
1171        M("az"),
1172        M("bz")
1173    );
1174    test_lit!(
1175        pfx_cat4,
1176        prefixes,
1177        "[ab][yz]",
1178        M("ay"),
1179        M("by"),
1180        M("az"),
1181        M("bz")
1182    );
1183    test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b"));
1184    test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c"));
1185    test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c"));
1186    test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b"));
1187    test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b"));
1188    test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a"));
1189    test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac"));
1190    test_lit!(pfx_cat12, prefixes, "ab+", C("ab"));
1191    test_lit!(pfx_cat13, prefixes, "ab+c", C("ab"));
1192    test_lit!(pfx_cat14, prefixes, "a^", C("a"));
1193    test_lit!(pfx_cat15, prefixes, "$a");
1194    test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac"));
1195    test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab"));
1196    test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb"));
1197    test_lit!(pfx_cat19, prefixes, "a.z", C("a"));
1198
1199    // Test regexes with alternations.
1200    test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b"));
1201    test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1202    test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1203    test_lit!(pfx_alt4, prefixes, "a|b*");
1204    test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b"));
1205    test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)");
1206    test_lit!(
1207        pfx_alt7,
1208        prefixes,
1209        "(a|b)*c|(a|ab)*c",
1210        C("a"),
1211        C("b"),
1212        M("c"),
1213        C("a"),
1214        C("ab"),
1215        M("c")
1216    );
1217    test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c"));
1218
1219    // Test regexes with empty assertions.
1220    test_lit!(pfx_empty1, prefixes, "^a", M("a"));
1221    test_lit!(pfx_empty2, prefixes, "a${2}", C("a"));
1222    test_lit!(pfx_empty3, prefixes, "^abc", M("abc"));
1223    test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z"));
1224
1225    // Make sure some curious regexes have no prefixes.
1226    test_lit!(pfx_nothing1, prefixes, ".");
1227    test_lit!(pfx_nothing2, prefixes, "(?s).");
1228    test_lit!(pfx_nothing3, prefixes, "^");
1229    test_lit!(pfx_nothing4, prefixes, "$");
1230    test_lit!(pfx_nothing6, prefixes, "(?m)$");
1231    test_lit!(pfx_nothing7, prefixes, r"\b");
1232    test_lit!(pfx_nothing8, prefixes, r"\B");
1233
1234    // Test a few regexes that defeat any prefix literal detection.
1235    test_lit!(pfx_defeated1, prefixes, ".a");
1236    test_lit!(pfx_defeated2, prefixes, "(?s).a");
1237    test_lit!(pfx_defeated3, prefixes, "a*b*c*");
1238    test_lit!(pfx_defeated4, prefixes, "a|.");
1239    test_lit!(pfx_defeated5, prefixes, ".|a");
1240    test_lit!(pfx_defeated6, prefixes, "a|^");
1241    test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))");
1242    test_lit!(pfx_defeated8, prefixes, "$a");
1243    test_lit!(pfx_defeated9, prefixes, "(?m)$a");
1244    test_lit!(pfx_defeated10, prefixes, r"\ba");
1245    test_lit!(pfx_defeated11, prefixes, r"\Ba");
1246    test_lit!(pfx_defeated12, prefixes, "^*a");
1247    test_lit!(pfx_defeated13, prefixes, "^+a");
1248
1249    test_lit!(
1250        pfx_crazy1,
1251        prefixes,
1252        r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]",
1253        C("Mo\\'"),
1254        C("Mu\\'"),
1255        C("Moam"),
1256        C("Muam")
1257    );
1258
1259    // ************************************************************************
1260    // Tests for quiting prefix literal search.
1261    // ************************************************************************
1262
1263    macro_rules! test_exhausted {
1264        ($name:ident, $which:ident, $re:expr) => {
1265            test_exhausted!($name, $which, $re,);
1266        };
1267        ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => {
1268            #[test]
1269            fn $name() {
1270                let expr = ParserBuilder::new()
1271                    .build()
1272                    .parse($re)
1273                    .unwrap();
1274                let mut lits = Literals::empty();
1275                lits.set_limit_size(20).set_limit_class(10);
1276                $which(&mut lits, &expr);
1277                assert_lit_eq!(Unicode, lits, $($lit),*);
1278
1279                let expr = ParserBuilder::new()
1280                    .allow_invalid_utf8(true)
1281                    .unicode(false)
1282                    .build()
1283                    .parse($re)
1284                    .unwrap();
1285                let mut lits = Literals::empty();
1286                lits.set_limit_size(20).set_limit_class(10);
1287                $which(&mut lits, &expr);
1288                assert_lit_eq!(Bytes, lits, $($lit),*);
1289            }
1290        };
1291    }
1292
1293    // These test use a much lower limit than the default so that we can
1294    // write test cases of reasonable size.
1295    test_exhausted!(pfx_exhausted1, prefixes, "[a-z]");
1296    test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A");
1297    test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A"));
1298    test_exhausted!(
1299        pfx_exhausted4,
1300        prefixes,
1301        "(?i-u)foobar",
1302        C("FO"),
1303        C("fO"),
1304        C("Fo"),
1305        C("fo")
1306    );
1307    test_exhausted!(
1308        pfx_exhausted5,
1309        prefixes,
1310        "(?:ab){100}",
1311        C("abababababababababab")
1312    );
1313    test_exhausted!(
1314        pfx_exhausted6,
1315        prefixes,
1316        "(?:(?:ab){100})*cd",
1317        C("ababababab"),
1318        M("cd")
1319    );
1320    test_exhausted!(
1321        pfx_exhausted7,
1322        prefixes,
1323        "z(?:(?:ab){100})*cd",
1324        C("zababababab"),
1325        M("zcd")
1326    );
1327    test_exhausted!(
1328        pfx_exhausted8,
1329        prefixes,
1330        "aaaaaaaaaaaaaaaaaaaaz",
1331        C("aaaaaaaaaaaaaaaaaaaa")
1332    );
1333
1334    // ************************************************************************
1335    // Tests for suffix literal extraction.
1336    // ************************************************************************
1337
1338    // Elementary tests.
1339    test_lit!(sfx_one_lit1, suffixes, "a", M("a"));
1340    test_lit!(sfx_one_lit2, suffixes, "abc", M("abc"));
1341    test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83"));
1342    #[cfg(feature = "unicode-case")]
1343    test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83"));
1344    test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4"));
1345    test_lit!(
1346        sfx_class2,
1347        suffixes,
1348        "(?u)[☃Ⅰ]",
1349        M("\\xe2\\x85\\xa0"),
1350        M("\\xe2\\x98\\x83")
1351    );
1352    #[cfg(feature = "unicode-case")]
1353    test_lit!(
1354        sfx_class3,
1355        suffixes,
1356        "(?ui)[☃Ⅰ]",
1357        M("\\xe2\\x85\\xa0"),
1358        M("\\xe2\\x85\\xb0"),
1359        M("\\xe2\\x98\\x83")
1360    );
1361    test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a"));
1362    test_lit!(
1363        sfx_one_lit_casei2,
1364        suffixes,
1365        "(?i-u)abc",
1366        M("ABC"),
1367        M("ABc"),
1368        M("AbC"),
1369        M("Abc"),
1370        M("aBC"),
1371        M("aBc"),
1372        M("abC"),
1373        M("abc")
1374    );
1375    test_lit!(sfx_group1, suffixes, "(a)", M("a"));
1376    test_lit!(sfx_rep_zero_or_one1, suffixes, "a?");
1377    test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?");
1378    test_lit!(sfx_rep_zero_or_more1, suffixes, "a*");
1379    test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*");
1380    test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a"));
1381    test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc"));
1382    test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a"));
1383    test_lit!(sfx_rep_range1, suffixes, "a{0}");
1384    test_lit!(sfx_rep_range2, suffixes, "a{0,}");
1385    test_lit!(sfx_rep_range3, suffixes, "a{0,1}");
1386    test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a"));
1387    test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa"));
1388    test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a"));
1389    test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa"));
1390
1391    // Test regexes with concatenations.
1392    test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab"));
1393    test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz"));
1394    test_lit!(
1395        sfx_cat3,
1396        suffixes,
1397        "(?i-u)[ab]z",
1398        M("AZ"),
1399        M("Az"),
1400        M("BZ"),
1401        M("Bz"),
1402        M("aZ"),
1403        M("az"),
1404        M("bZ"),
1405        M("bz")
1406    );
1407    test_lit!(
1408        sfx_cat4,
1409        suffixes,
1410        "[ab][yz]",
1411        M("ay"),
1412        M("az"),
1413        M("by"),
1414        M("bz")
1415    );
1416    test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b"));
1417    test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c"));
1418    test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c"));
1419    test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc"));
1420    test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b"));
1421    test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a"));
1422    test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac"));
1423    test_lit!(sfx_cat12, suffixes, "ab+", C("b"));
1424    test_lit!(sfx_cat13, suffixes, "ab+c", C("bc"));
1425    test_lit!(sfx_cat14, suffixes, "a^");
1426    test_lit!(sfx_cat15, suffixes, "$a", C("a"));
1427    test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac"));
1428    test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc"));
1429    test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb"));
1430    test_lit!(sfx_cat19, suffixes, "a.z", C("z"));
1431
1432    // Test regexes with alternations.
1433    test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b"));
1434    test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b"));
1435    test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz"));
1436    test_lit!(sfx_alt4, suffixes, "a|b*");
1437    test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b"));
1438    test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)");
1439    test_lit!(
1440        sfx_alt7,
1441        suffixes,
1442        "(a|b)*c|(a|ab)*c",
1443        C("ac"),
1444        C("bc"),
1445        M("c"),
1446        C("ac"),
1447        C("abc"),
1448        M("c")
1449    );
1450    test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c"));
1451
1452    // Test regexes with empty assertions.
1453    test_lit!(sfx_empty1, suffixes, "a$", M("a"));
1454    test_lit!(sfx_empty2, suffixes, "${2}a", C("a"));
1455
1456    // Make sure some curious regexes have no suffixes.
1457    test_lit!(sfx_nothing1, suffixes, ".");
1458    test_lit!(sfx_nothing2, suffixes, "(?s).");
1459    test_lit!(sfx_nothing3, suffixes, "^");
1460    test_lit!(sfx_nothing4, suffixes, "$");
1461    test_lit!(sfx_nothing6, suffixes, "(?m)$");
1462    test_lit!(sfx_nothing7, suffixes, r"\b");
1463    test_lit!(sfx_nothing8, suffixes, r"\B");
1464
1465    // Test a few regexes that defeat any suffix literal detection.
1466    test_lit!(sfx_defeated1, suffixes, "a.");
1467    test_lit!(sfx_defeated2, suffixes, "(?s)a.");
1468    test_lit!(sfx_defeated3, suffixes, "a*b*c*");
1469    test_lit!(sfx_defeated4, suffixes, "a|.");
1470    test_lit!(sfx_defeated5, suffixes, ".|a");
1471    test_lit!(sfx_defeated6, suffixes, "a|^");
1472    test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c)).");
1473    test_lit!(sfx_defeated8, suffixes, "a^");
1474    test_lit!(sfx_defeated9, suffixes, "(?m)a$");
1475    test_lit!(sfx_defeated10, suffixes, r"a\b");
1476    test_lit!(sfx_defeated11, suffixes, r"a\B");
1477    test_lit!(sfx_defeated12, suffixes, "a^*");
1478    test_lit!(sfx_defeated13, suffixes, "a^+");
1479
1480    // These test use a much lower limit than the default so that we can
1481    // write test cases of reasonable size.
1482    test_exhausted!(sfx_exhausted1, suffixes, "[a-z]");
1483    test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*");
1484    test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z"));
1485    test_exhausted!(
1486        sfx_exhausted4,
1487        suffixes,
1488        "(?i-u)foobar",
1489        C("AR"),
1490        C("Ar"),
1491        C("aR"),
1492        C("ar")
1493    );
1494    test_exhausted!(
1495        sfx_exhausted5,
1496        suffixes,
1497        "(?:ab){100}",
1498        C("abababababababababab")
1499    );
1500    test_exhausted!(
1501        sfx_exhausted6,
1502        suffixes,
1503        "cd(?:(?:ab){100})*",
1504        C("ababababab"),
1505        M("cd")
1506    );
1507    test_exhausted!(
1508        sfx_exhausted7,
1509        suffixes,
1510        "cd(?:(?:ab){100})*z",
1511        C("abababababz"),
1512        M("cdz")
1513    );
1514    test_exhausted!(
1515        sfx_exhausted8,
1516        suffixes,
1517        "zaaaaaaaaaaaaaaaaaaaa",
1518        C("aaaaaaaaaaaaaaaaaaaa")
1519    );
1520
1521    // ************************************************************************
1522    // Tests for generating unambiguous literal sets.
1523    // ************************************************************************
1524
1525    macro_rules! test_unamb {
1526        ($name:ident, $given:expr, $expected:expr) => {
1527            #[test]
1528            fn $name() {
1529                let given: Vec<Literal> = $given
1530                    .into_iter()
1531                    .map(|ul| {
1532                        let cut = ul.is_cut();
1533                        Literal { v: ul.v.into_bytes(), cut: cut }
1534                    })
1535                    .collect();
1536                let lits = create_lits(given);
1537                let got = lits.unambiguous_prefixes();
1538                assert_eq!($expected, escape_lits(got.literals()));
1539            }
1540        };
1541    }
1542
1543    test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]);
1544    test_unamb!(
1545        unambiguous2,
1546        vec![M("zaaaaaa"), M("aa")],
1547        vec![C("aa"), C("z")]
1548    );
1549    test_unamb!(
1550        unambiguous3,
1551        vec![M("Sherlock"), M("Watson")],
1552        vec![M("Sherlock"), M("Watson")]
1553    );
1554    test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]);
1555    test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]);
1556    test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]);
1557    test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]);
1558    test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]);
1559    test_unamb!(
1560        unambiguous9,
1561        vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")],
1562        vec![C("a"), C("b"), C("c")]
1563    );
1564    test_unamb!(
1565        unambiguous10,
1566        vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")],
1567        vec![C("Mo"), C("Mu")]
1568    );
1569    test_unamb!(
1570        unambiguous11,
1571        vec![M("zazb"), M("azb")],
1572        vec![C("a"), C("z")]
1573    );
1574    test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]);
1575    test_unamb!(
1576        unambiguous13,
1577        vec![M("ABCX"), M("CDAX"), M("BCX")],
1578        vec![C("A"), C("BCX"), C("CD")]
1579    );
1580    test_unamb!(
1581        unambiguous14,
1582        vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")],
1583        vec![M("DSX"), C("I"), C("MGX"), C("MV")]
1584    );
1585    test_unamb!(
1586        unambiguous15,
1587        vec![M("IMG_"), M("MG_"), M("CIMG")],
1588        vec![C("C"), C("I"), C("MG_")]
1589    );
1590
1591    // ************************************************************************
1592    // Tests for suffix trimming.
1593    // ************************************************************************
1594    macro_rules! test_trim {
1595        ($name:ident, $trim:expr, $given:expr, $expected:expr) => {
1596            #[test]
1597            fn $name() {
1598                let given: Vec<Literal> = $given
1599                    .into_iter()
1600                    .map(|ul| {
1601                        let cut = ul.is_cut();
1602                        Literal { v: ul.v.into_bytes(), cut: cut }
1603                    })
1604                    .collect();
1605                let lits = create_lits(given);
1606                let got = lits.trim_suffix($trim).unwrap();
1607                assert_eq!($expected, escape_lits(got.literals()));
1608            }
1609        };
1610    }
1611
1612    test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]);
1613    test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]);
1614    test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]);
1615    test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]);
1616
1617    // ************************************************************************
1618    // Tests for longest common prefix.
1619    // ************************************************************************
1620
1621    macro_rules! test_lcp {
1622        ($name:ident, $given:expr, $expected:expr) => {
1623            #[test]
1624            fn $name() {
1625                let given: Vec<Literal> = $given
1626                    .into_iter()
1627                    .map(|s: &str| Literal {
1628                        v: s.to_owned().into_bytes(),
1629                        cut: false,
1630                    })
1631                    .collect();
1632                let lits = create_lits(given);
1633                let got = lits.longest_common_prefix();
1634                assert_eq!($expected, escape_bytes(got));
1635            }
1636        };
1637    }
1638
1639    test_lcp!(lcp1, vec!["a"], "a");
1640    test_lcp!(lcp2, vec![], "");
1641    test_lcp!(lcp3, vec!["a", "b"], "");
1642    test_lcp!(lcp4, vec!["ab", "ab"], "ab");
1643    test_lcp!(lcp5, vec!["ab", "a"], "a");
1644    test_lcp!(lcp6, vec!["a", "ab"], "a");
1645    test_lcp!(lcp7, vec!["ab", "b"], "");
1646    test_lcp!(lcp8, vec!["b", "ab"], "");
1647    test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba");
1648    test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], "");
1649    test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], "");
1650    test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f");
1651
1652    // ************************************************************************
1653    // Tests for longest common suffix.
1654    // ************************************************************************
1655
1656    macro_rules! test_lcs {
1657        ($name:ident, $given:expr, $expected:expr) => {
1658            #[test]
1659            fn $name() {
1660                let given: Vec<Literal> = $given
1661                    .into_iter()
1662                    .map(|s: &str| Literal {
1663                        v: s.to_owned().into_bytes(),
1664                        cut: false,
1665                    })
1666                    .collect();
1667                let lits = create_lits(given);
1668                let got = lits.longest_common_suffix();
1669                assert_eq!($expected, escape_bytes(got));
1670            }
1671        };
1672    }
1673
1674    test_lcs!(lcs1, vec!["a"], "a");
1675    test_lcs!(lcs2, vec![], "");
1676    test_lcs!(lcs3, vec!["a", "b"], "");
1677    test_lcs!(lcs4, vec!["ab", "ab"], "ab");
1678    test_lcs!(lcs5, vec!["ab", "a"], "");
1679    test_lcs!(lcs6, vec!["a", "ab"], "");
1680    test_lcs!(lcs7, vec!["ab", "b"], "b");
1681    test_lcs!(lcs8, vec!["b", "ab"], "b");
1682    test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo");
1683    test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], "");
1684    test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], "");
1685    test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b");
1686}