regex_syntax/hir/
mod.rs

1/*!
2Defines a high-level intermediate representation for regular expressions.
3*/
4use std::char;
5use std::cmp;
6use std::error;
7use std::fmt;
8use std::result;
9use std::u8;
10
11use crate::ast::Span;
12use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
13use crate::unicode;
14
15pub use crate::hir::visitor::{visit, Visitor};
16pub use crate::unicode::CaseFoldError;
17
18mod interval;
19pub mod literal;
20pub mod print;
21pub mod translate;
22mod visitor;
23
24/// An error that can occur while translating an `Ast` to a `Hir`.
25#[derive(Clone, Debug, Eq, PartialEq)]
26pub struct Error {
27    /// The kind of error.
28    kind: ErrorKind,
29    /// The original pattern that the translator's Ast was parsed from. Every
30    /// span in an error is a valid range into this string.
31    pattern: String,
32    /// The span of this error, derived from the Ast given to the translator.
33    span: Span,
34}
35
36impl Error {
37    /// Return the type of this error.
38    pub fn kind(&self) -> &ErrorKind {
39        &self.kind
40    }
41
42    /// The original pattern string in which this error occurred.
43    ///
44    /// Every span reported by this error is reported in terms of this string.
45    pub fn pattern(&self) -> &str {
46        &self.pattern
47    }
48
49    /// Return the span at which this error occurred.
50    pub fn span(&self) -> &Span {
51        &self.span
52    }
53}
54
55/// The type of an error that occurred while building an `Hir`.
56#[derive(Clone, Debug, Eq, PartialEq)]
57pub enum ErrorKind {
58    /// This error occurs when a Unicode feature is used when Unicode
59    /// support is disabled. For example `(?-u:\pL)` would trigger this error.
60    UnicodeNotAllowed,
61    /// This error occurs when translating a pattern that could match a byte
62    /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
63    InvalidUtf8,
64    /// This occurs when an unrecognized Unicode property name could not
65    /// be found.
66    UnicodePropertyNotFound,
67    /// This occurs when an unrecognized Unicode property value could not
68    /// be found.
69    UnicodePropertyValueNotFound,
70    /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
71    /// `\d`) could not be found. This can occur when the `unicode-perl`
72    /// crate feature is not enabled.
73    UnicodePerlClassNotFound,
74    /// This occurs when the Unicode simple case mapping tables are not
75    /// available, and the regular expression required Unicode aware case
76    /// insensitivity.
77    UnicodeCaseUnavailable,
78    /// This occurs when the translator attempts to construct a character class
79    /// that is empty.
80    ///
81    /// Note that this restriction in the translator may be removed in the
82    /// future.
83    EmptyClassNotAllowed,
84    /// Hints that destructuring should not be exhaustive.
85    ///
86    /// This enum may grow additional variants, so this makes sure clients
87    /// don't count on exhaustive matching. (Otherwise, adding a new variant
88    /// could break existing code.)
89    #[doc(hidden)]
90    __Nonexhaustive,
91}
92
93impl ErrorKind {
94    // TODO: Remove this method entirely on the next breaking semver release.
95    #[allow(deprecated)]
96    fn description(&self) -> &str {
97        use self::ErrorKind::*;
98        match *self {
99            UnicodeNotAllowed => "Unicode not allowed here",
100            InvalidUtf8 => "pattern can match invalid UTF-8",
101            UnicodePropertyNotFound => "Unicode property not found",
102            UnicodePropertyValueNotFound => "Unicode property value not found",
103            UnicodePerlClassNotFound => {
104                "Unicode-aware Perl class not found \
105                 (make sure the unicode-perl feature is enabled)"
106            }
107            UnicodeCaseUnavailable => {
108                "Unicode-aware case insensitivity matching is not available \
109                 (make sure the unicode-case feature is enabled)"
110            }
111            EmptyClassNotAllowed => "empty character classes are not allowed",
112            __Nonexhaustive => unreachable!(),
113        }
114    }
115}
116
117impl error::Error for Error {
118    // TODO: Remove this method entirely on the next breaking semver release.
119    #[allow(deprecated)]
120    fn description(&self) -> &str {
121        self.kind.description()
122    }
123}
124
125impl fmt::Display for Error {
126    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127        crate::error::Formatter::from(self).fmt(f)
128    }
129}
130
131impl fmt::Display for ErrorKind {
132    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133        // TODO: Remove this on the next breaking semver release.
134        #[allow(deprecated)]
135        f.write_str(self.description())
136    }
137}
138
139/// A high-level intermediate representation (HIR) for a regular expression.
140///
141/// The HIR of a regular expression represents an intermediate step between its
142/// abstract syntax (a structured description of the concrete syntax) and
143/// compiled byte codes. The purpose of HIR is to make regular expressions
144/// easier to analyze. In particular, the AST is much more complex than the
145/// HIR. For example, while an AST supports arbitrarily nested character
146/// classes, the HIR will flatten all nested classes into a single set. The HIR
147/// will also "compile away" every flag present in the concrete syntax. For
148/// example, users of HIR expressions never need to worry about case folding;
149/// it is handled automatically by the translator (e.g., by translating `(?i)A`
150/// to `[aA]`).
151///
152/// If the HIR was produced by a translator that disallows invalid UTF-8, then
153/// the HIR is guaranteed to match UTF-8 exclusively.
154///
155/// This type defines its own destructor that uses constant stack space and
156/// heap space proportional to the size of the HIR.
157///
158/// The specific type of an HIR expression can be accessed via its `kind`
159/// or `into_kind` methods. This extra level of indirection exists for two
160/// reasons:
161///
162/// 1. Construction of an HIR expression *must* use the constructor methods
163///    on this `Hir` type instead of building the `HirKind` values directly.
164///    This permits construction to enforce invariants like "concatenations
165///    always consist of two or more sub-expressions."
166/// 2. Every HIR expression contains attributes that are defined inductively,
167///    and can be computed cheaply during the construction process. For
168///    example, one such attribute is whether the expression must match at the
169///    beginning of the text.
170///
171/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
172/// expression pattern string, and uses constant stack space and heap space
173/// proportional to the size of the `Hir`.
174#[derive(Clone, Debug, Eq, PartialEq)]
175pub struct Hir {
176    /// The underlying HIR kind.
177    kind: HirKind,
178    /// Analysis info about this HIR, computed during construction.
179    info: HirInfo,
180}
181
182/// The kind of an arbitrary `Hir` expression.
183#[derive(Clone, Debug, Eq, PartialEq)]
184pub enum HirKind {
185    /// The empty regular expression, which matches everything, including the
186    /// empty string.
187    Empty,
188    /// A single literal character that matches exactly this character.
189    Literal(Literal),
190    /// A single character class that matches any of the characters in the
191    /// class. A class can either consist of Unicode scalar values as
192    /// characters, or it can use bytes.
193    Class(Class),
194    /// An anchor assertion. An anchor assertion match always has zero length.
195    Anchor(Anchor),
196    /// A word boundary assertion, which may or may not be Unicode aware. A
197    /// word boundary assertion match always has zero length.
198    WordBoundary(WordBoundary),
199    /// A repetition operation applied to a child expression.
200    Repetition(Repetition),
201    /// A possibly capturing group, which contains a child expression.
202    Group(Group),
203    /// A concatenation of expressions. A concatenation always has at least two
204    /// child expressions.
205    ///
206    /// A concatenation matches only if each of its child expression matches
207    /// one after the other.
208    Concat(Vec<Hir>),
209    /// An alternation of expressions. An alternation always has at least two
210    /// child expressions.
211    ///
212    /// An alternation matches only if at least one of its child expression
213    /// matches. If multiple expressions match, then the leftmost is preferred.
214    Alternation(Vec<Hir>),
215}
216
217impl Hir {
218    /// Returns a reference to the underlying HIR kind.
219    pub fn kind(&self) -> &HirKind {
220        &self.kind
221    }
222
223    /// Consumes ownership of this HIR expression and returns its underlying
224    /// `HirKind`.
225    pub fn into_kind(mut self) -> HirKind {
226        use std::mem;
227        mem::replace(&mut self.kind, HirKind::Empty)
228    }
229
230    /// Returns an empty HIR expression.
231    ///
232    /// An empty HIR expression always matches, including the empty string.
233    pub fn empty() -> Hir {
234        let mut info = HirInfo::new();
235        info.set_always_utf8(true);
236        info.set_all_assertions(true);
237        info.set_anchored_start(false);
238        info.set_anchored_end(false);
239        info.set_line_anchored_start(false);
240        info.set_line_anchored_end(false);
241        info.set_any_anchored_start(false);
242        info.set_any_anchored_end(false);
243        info.set_match_empty(true);
244        info.set_literal(false);
245        info.set_alternation_literal(false);
246        Hir { kind: HirKind::Empty, info }
247    }
248
249    /// Creates a literal HIR expression.
250    ///
251    /// If the given literal has a `Byte` variant with an ASCII byte, then this
252    /// method panics. This enforces the invariant that `Byte` variants are
253    /// only used to express matching of invalid UTF-8.
254    pub fn literal(lit: Literal) -> Hir {
255        if let Literal::Byte(b) = lit {
256            assert!(b > 0x7F);
257        }
258
259        let mut info = HirInfo::new();
260        info.set_always_utf8(lit.is_unicode());
261        info.set_all_assertions(false);
262        info.set_anchored_start(false);
263        info.set_anchored_end(false);
264        info.set_line_anchored_start(false);
265        info.set_line_anchored_end(false);
266        info.set_any_anchored_start(false);
267        info.set_any_anchored_end(false);
268        info.set_match_empty(false);
269        info.set_literal(true);
270        info.set_alternation_literal(true);
271        Hir { kind: HirKind::Literal(lit), info }
272    }
273
274    /// Creates a class HIR expression.
275    pub fn class(class: Class) -> Hir {
276        let mut info = HirInfo::new();
277        info.set_always_utf8(class.is_always_utf8());
278        info.set_all_assertions(false);
279        info.set_anchored_start(false);
280        info.set_anchored_end(false);
281        info.set_line_anchored_start(false);
282        info.set_line_anchored_end(false);
283        info.set_any_anchored_start(false);
284        info.set_any_anchored_end(false);
285        info.set_match_empty(false);
286        info.set_literal(false);
287        info.set_alternation_literal(false);
288        Hir { kind: HirKind::Class(class), info }
289    }
290
291    /// Creates an anchor assertion HIR expression.
292    pub fn anchor(anchor: Anchor) -> Hir {
293        let mut info = HirInfo::new();
294        info.set_always_utf8(true);
295        info.set_all_assertions(true);
296        info.set_anchored_start(false);
297        info.set_anchored_end(false);
298        info.set_line_anchored_start(false);
299        info.set_line_anchored_end(false);
300        info.set_any_anchored_start(false);
301        info.set_any_anchored_end(false);
302        info.set_match_empty(true);
303        info.set_literal(false);
304        info.set_alternation_literal(false);
305        if let Anchor::StartText = anchor {
306            info.set_anchored_start(true);
307            info.set_line_anchored_start(true);
308            info.set_any_anchored_start(true);
309        }
310        if let Anchor::EndText = anchor {
311            info.set_anchored_end(true);
312            info.set_line_anchored_end(true);
313            info.set_any_anchored_end(true);
314        }
315        if let Anchor::StartLine = anchor {
316            info.set_line_anchored_start(true);
317        }
318        if let Anchor::EndLine = anchor {
319            info.set_line_anchored_end(true);
320        }
321        Hir { kind: HirKind::Anchor(anchor), info }
322    }
323
324    /// Creates a word boundary assertion HIR expression.
325    pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
326        let mut info = HirInfo::new();
327        info.set_always_utf8(true);
328        info.set_all_assertions(true);
329        info.set_anchored_start(false);
330        info.set_anchored_end(false);
331        info.set_line_anchored_start(false);
332        info.set_line_anchored_end(false);
333        info.set_any_anchored_start(false);
334        info.set_any_anchored_end(false);
335        info.set_literal(false);
336        info.set_alternation_literal(false);
337        // A negated word boundary matches '', so that's fine. But \b does not
338        // match \b, so why do we say it can match the empty string? Well,
339        // because, if you search for \b against 'a', it will report [0, 0) and
340        // [1, 1) as matches, and both of those matches correspond to the empty
341        // string. Thus, only *certain* empty strings match \b, which similarly
342        // applies to \B.
343        info.set_match_empty(true);
344        // Negated ASCII word boundaries can match invalid UTF-8.
345        if let WordBoundary::AsciiNegate = word_boundary {
346            info.set_always_utf8(false);
347        }
348        Hir { kind: HirKind::WordBoundary(word_boundary), info }
349    }
350
351    /// Creates a repetition HIR expression.
352    pub fn repetition(rep: Repetition) -> Hir {
353        let mut info = HirInfo::new();
354        info.set_always_utf8(rep.hir.is_always_utf8());
355        info.set_all_assertions(rep.hir.is_all_assertions());
356        // If this operator can match the empty string, then it can never
357        // be anchored.
358        info.set_anchored_start(
359            !rep.is_match_empty() && rep.hir.is_anchored_start(),
360        );
361        info.set_anchored_end(
362            !rep.is_match_empty() && rep.hir.is_anchored_end(),
363        );
364        info.set_line_anchored_start(
365            !rep.is_match_empty() && rep.hir.is_anchored_start(),
366        );
367        info.set_line_anchored_end(
368            !rep.is_match_empty() && rep.hir.is_anchored_end(),
369        );
370        info.set_any_anchored_start(rep.hir.is_any_anchored_start());
371        info.set_any_anchored_end(rep.hir.is_any_anchored_end());
372        info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
373        info.set_literal(false);
374        info.set_alternation_literal(false);
375        Hir { kind: HirKind::Repetition(rep), info }
376    }
377
378    /// Creates a group HIR expression.
379    pub fn group(group: Group) -> Hir {
380        let mut info = HirInfo::new();
381        info.set_always_utf8(group.hir.is_always_utf8());
382        info.set_all_assertions(group.hir.is_all_assertions());
383        info.set_anchored_start(group.hir.is_anchored_start());
384        info.set_anchored_end(group.hir.is_anchored_end());
385        info.set_line_anchored_start(group.hir.is_line_anchored_start());
386        info.set_line_anchored_end(group.hir.is_line_anchored_end());
387        info.set_any_anchored_start(group.hir.is_any_anchored_start());
388        info.set_any_anchored_end(group.hir.is_any_anchored_end());
389        info.set_match_empty(group.hir.is_match_empty());
390        info.set_literal(false);
391        info.set_alternation_literal(false);
392        Hir { kind: HirKind::Group(group), info }
393    }
394
395    /// Returns the concatenation of the given expressions.
396    ///
397    /// This flattens the concatenation as appropriate.
398    pub fn concat(mut exprs: Vec<Hir>) -> Hir {
399        match exprs.len() {
400            0 => Hir::empty(),
401            1 => exprs.pop().unwrap(),
402            _ => {
403                let mut info = HirInfo::new();
404                info.set_always_utf8(true);
405                info.set_all_assertions(true);
406                info.set_any_anchored_start(false);
407                info.set_any_anchored_end(false);
408                info.set_match_empty(true);
409                info.set_literal(true);
410                info.set_alternation_literal(true);
411
412                // Some attributes require analyzing all sub-expressions.
413                for e in &exprs {
414                    let x = info.is_always_utf8() && e.is_always_utf8();
415                    info.set_always_utf8(x);
416
417                    let x = info.is_all_assertions() && e.is_all_assertions();
418                    info.set_all_assertions(x);
419
420                    let x = info.is_any_anchored_start()
421                        || e.is_any_anchored_start();
422                    info.set_any_anchored_start(x);
423
424                    let x =
425                        info.is_any_anchored_end() || e.is_any_anchored_end();
426                    info.set_any_anchored_end(x);
427
428                    let x = info.is_match_empty() && e.is_match_empty();
429                    info.set_match_empty(x);
430
431                    let x = info.is_literal() && e.is_literal();
432                    info.set_literal(x);
433
434                    let x = info.is_alternation_literal()
435                        && e.is_alternation_literal();
436                    info.set_alternation_literal(x);
437                }
438                // Anchored attributes require something slightly more
439                // sophisticated. Normally, WLOG, to determine whether an
440                // expression is anchored to the start, we'd only need to check
441                // the first expression of a concatenation. However,
442                // expressions like `$\b^` are still anchored to the start,
443                // but the first expression in the concatenation *isn't*
444                // anchored to the start. So the "first" expression to look at
445                // is actually one that is either not an assertion or is
446                // specifically the StartText assertion.
447                info.set_anchored_start(
448                    exprs
449                        .iter()
450                        .take_while(|e| {
451                            e.is_anchored_start() || e.is_all_assertions()
452                        })
453                        .any(|e| e.is_anchored_start()),
454                );
455                // Similarly for the end anchor, but in reverse.
456                info.set_anchored_end(
457                    exprs
458                        .iter()
459                        .rev()
460                        .take_while(|e| {
461                            e.is_anchored_end() || e.is_all_assertions()
462                        })
463                        .any(|e| e.is_anchored_end()),
464                );
465                // Repeat the process for line anchors.
466                info.set_line_anchored_start(
467                    exprs
468                        .iter()
469                        .take_while(|e| {
470                            e.is_line_anchored_start() || e.is_all_assertions()
471                        })
472                        .any(|e| e.is_line_anchored_start()),
473                );
474                info.set_line_anchored_end(
475                    exprs
476                        .iter()
477                        .rev()
478                        .take_while(|e| {
479                            e.is_line_anchored_end() || e.is_all_assertions()
480                        })
481                        .any(|e| e.is_line_anchored_end()),
482                );
483                Hir { kind: HirKind::Concat(exprs), info }
484            }
485        }
486    }
487
488    /// Returns the alternation of the given expressions.
489    ///
490    /// This flattens the alternation as appropriate.
491    pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
492        match exprs.len() {
493            0 => Hir::empty(),
494            1 => exprs.pop().unwrap(),
495            _ => {
496                let mut info = HirInfo::new();
497                info.set_always_utf8(true);
498                info.set_all_assertions(true);
499                info.set_anchored_start(true);
500                info.set_anchored_end(true);
501                info.set_line_anchored_start(true);
502                info.set_line_anchored_end(true);
503                info.set_any_anchored_start(false);
504                info.set_any_anchored_end(false);
505                info.set_match_empty(false);
506                info.set_literal(false);
507                info.set_alternation_literal(true);
508
509                // Some attributes require analyzing all sub-expressions.
510                for e in &exprs {
511                    let x = info.is_always_utf8() && e.is_always_utf8();
512                    info.set_always_utf8(x);
513
514                    let x = info.is_all_assertions() && e.is_all_assertions();
515                    info.set_all_assertions(x);
516
517                    let x = info.is_anchored_start() && e.is_anchored_start();
518                    info.set_anchored_start(x);
519
520                    let x = info.is_anchored_end() && e.is_anchored_end();
521                    info.set_anchored_end(x);
522
523                    let x = info.is_line_anchored_start()
524                        && e.is_line_anchored_start();
525                    info.set_line_anchored_start(x);
526
527                    let x = info.is_line_anchored_end()
528                        && e.is_line_anchored_end();
529                    info.set_line_anchored_end(x);
530
531                    let x = info.is_any_anchored_start()
532                        || e.is_any_anchored_start();
533                    info.set_any_anchored_start(x);
534
535                    let x =
536                        info.is_any_anchored_end() || e.is_any_anchored_end();
537                    info.set_any_anchored_end(x);
538
539                    let x = info.is_match_empty() || e.is_match_empty();
540                    info.set_match_empty(x);
541
542                    let x = info.is_alternation_literal() && e.is_literal();
543                    info.set_alternation_literal(x);
544                }
545                Hir { kind: HirKind::Alternation(exprs), info }
546            }
547        }
548    }
549
550    /// Build an HIR expression for `.`.
551    ///
552    /// A `.` expression matches any character except for `\n`. To build an
553    /// expression that matches any character, including `\n`, use the `any`
554    /// method.
555    ///
556    /// If `bytes` is `true`, then this assumes characters are limited to a
557    /// single byte.
558    pub fn dot(bytes: bool) -> Hir {
559        if bytes {
560            let mut cls = ClassBytes::empty();
561            cls.push(ClassBytesRange::new(b'\0', b'\x09'));
562            cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
563            Hir::class(Class::Bytes(cls))
564        } else {
565            let mut cls = ClassUnicode::empty();
566            cls.push(ClassUnicodeRange::new('\0', '\x09'));
567            cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
568            Hir::class(Class::Unicode(cls))
569        }
570    }
571
572    /// Build an HIR expression for `(?s).`.
573    ///
574    /// A `(?s).` expression matches any character, including `\n`. To build an
575    /// expression that matches any character except for `\n`, then use the
576    /// `dot` method.
577    ///
578    /// If `bytes` is `true`, then this assumes characters are limited to a
579    /// single byte.
580    pub fn any(bytes: bool) -> Hir {
581        if bytes {
582            let mut cls = ClassBytes::empty();
583            cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
584            Hir::class(Class::Bytes(cls))
585        } else {
586            let mut cls = ClassUnicode::empty();
587            cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
588            Hir::class(Class::Unicode(cls))
589        }
590    }
591
592    /// Return true if and only if this HIR will always match valid UTF-8.
593    ///
594    /// When this returns false, then it is possible for this HIR expression
595    /// to match invalid UTF-8.
596    pub fn is_always_utf8(&self) -> bool {
597        self.info.is_always_utf8()
598    }
599
600    /// Returns true if and only if this entire HIR expression is made up of
601    /// zero-width assertions.
602    ///
603    /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
604    /// not `^a`.
605    pub fn is_all_assertions(&self) -> bool {
606        self.info.is_all_assertions()
607    }
608
609    /// Return true if and only if this HIR is required to match from the
610    /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
611    /// `^foo|^bar` but not `^foo|bar`.
612    pub fn is_anchored_start(&self) -> bool {
613        self.info.is_anchored_start()
614    }
615
616    /// Return true if and only if this HIR is required to match at the end
617    /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
618    /// `foo$|bar$` but not `foo$|bar`.
619    pub fn is_anchored_end(&self) -> bool {
620        self.info.is_anchored_end()
621    }
622
623    /// Return true if and only if this HIR is required to match from the
624    /// beginning of text or the beginning of a line. This includes expressions
625    /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar`
626    /// but not `^foo|bar` or `(?m)^foo|bar`.
627    ///
628    /// Note that if `is_anchored_start` is `true`, then
629    /// `is_line_anchored_start` will also be `true`. The reverse implication
630    /// is not true. For example, `(?m)^foo` is line anchored, but not
631    /// `is_anchored_start`.
632    pub fn is_line_anchored_start(&self) -> bool {
633        self.info.is_line_anchored_start()
634    }
635
636    /// Return true if and only if this HIR is required to match at the
637    /// end of text or the end of a line. This includes expressions like
638    /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
639    /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`.
640    ///
641    /// Note that if `is_anchored_end` is `true`, then
642    /// `is_line_anchored_end` will also be `true`. The reverse implication
643    /// is not true. For example, `(?m)foo$` is line anchored, but not
644    /// `is_anchored_end`.
645    pub fn is_line_anchored_end(&self) -> bool {
646        self.info.is_line_anchored_end()
647    }
648
649    /// Return true if and only if this HIR contains any sub-expression that
650    /// is required to match at the beginning of text. Specifically, this
651    /// returns true if the `^` symbol (when multiline mode is disabled) or the
652    /// `\A` escape appear anywhere in the regex.
653    pub fn is_any_anchored_start(&self) -> bool {
654        self.info.is_any_anchored_start()
655    }
656
657    /// Return true if and only if this HIR contains any sub-expression that is
658    /// required to match at the end of text. Specifically, this returns true
659    /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
660    /// appear anywhere in the regex.
661    pub fn is_any_anchored_end(&self) -> bool {
662        self.info.is_any_anchored_end()
663    }
664
665    /// Return true if and only if the empty string is part of the language
666    /// matched by this regular expression.
667    ///
668    /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b`
669    /// and `\B`, but not `a` or `a+`.
670    pub fn is_match_empty(&self) -> bool {
671        self.info.is_match_empty()
672    }
673
674    /// Return true if and only if this HIR is a simple literal. This is only
675    /// true when this HIR expression is either itself a `Literal` or a
676    /// concatenation of only `Literal`s.
677    ///
678    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
679    /// `` are not (even though that contain sub-expressions that are literals).
680    pub fn is_literal(&self) -> bool {
681        self.info.is_literal()
682    }
683
684    /// Return true if and only if this HIR is either a simple literal or an
685    /// alternation of simple literals. This is only
686    /// true when this HIR expression is either itself a `Literal` or a
687    /// concatenation of only `Literal`s or an alternation of only `Literal`s.
688    ///
689    /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
690    /// literals, but `f+`, `(foo)`, `foo()`, ``
691    /// are not (even though that contain sub-expressions that are literals).
692    pub fn is_alternation_literal(&self) -> bool {
693        self.info.is_alternation_literal()
694    }
695}
696
697impl HirKind {
698    /// Return true if and only if this HIR is the empty regular expression.
699    ///
700    /// Note that this is not defined inductively. That is, it only tests if
701    /// this kind is the `Empty` variant. To get the inductive definition,
702    /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
703    pub fn is_empty(&self) -> bool {
704        match *self {
705            HirKind::Empty => true,
706            _ => false,
707        }
708    }
709
710    /// Returns true if and only if this kind has any (including possibly
711    /// empty) subexpressions.
712    pub fn has_subexprs(&self) -> bool {
713        match *self {
714            HirKind::Empty
715            | HirKind::Literal(_)
716            | HirKind::Class(_)
717            | HirKind::Anchor(_)
718            | HirKind::WordBoundary(_) => false,
719            HirKind::Group(_)
720            | HirKind::Repetition(_)
721            | HirKind::Concat(_)
722            | HirKind::Alternation(_) => true,
723        }
724    }
725}
726
727/// Print a display representation of this Hir.
728///
729/// The result of this is a valid regular expression pattern string.
730///
731/// This implementation uses constant stack space and heap space proportional
732/// to the size of the `Hir`.
733impl fmt::Display for Hir {
734    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
735        use crate::hir::print::Printer;
736        Printer::new().print(self, f)
737    }
738}
739
740/// The high-level intermediate representation of a literal.
741///
742/// A literal corresponds to a single character, where a character is either
743/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
744/// are preferred whenever possible. In particular, a `Byte` variant is only
745/// ever produced when it could match invalid UTF-8.
746#[derive(Clone, Debug, Eq, PartialEq)]
747pub enum Literal {
748    /// A single character represented by a Unicode scalar value.
749    Unicode(char),
750    /// A single character represented by an arbitrary byte.
751    Byte(u8),
752}
753
754impl Literal {
755    /// Returns true if and only if this literal corresponds to a Unicode
756    /// scalar value.
757    pub fn is_unicode(&self) -> bool {
758        match *self {
759            Literal::Unicode(_) => true,
760            Literal::Byte(b) if b <= 0x7F => true,
761            Literal::Byte(_) => false,
762        }
763    }
764}
765
766/// The high-level intermediate representation of a character class.
767///
768/// A character class corresponds to a set of characters. A character is either
769/// defined by a Unicode scalar value or a byte. Unicode characters are used
770/// by default, while bytes are used when Unicode mode (via the `u` flag) is
771/// disabled.
772///
773/// A character class, regardless of its character type, is represented by a
774/// sequence of non-overlapping non-adjacent ranges of characters.
775///
776/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
777/// be produced even when it exclusively matches valid UTF-8. This is because
778/// a `Bytes` variant represents an intention by the author of the regular
779/// expression to disable Unicode mode, which in turn impacts the semantics of
780/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
781/// match the same set of strings.
782#[derive(Clone, Debug, Eq, PartialEq)]
783pub enum Class {
784    /// A set of characters represented by Unicode scalar values.
785    Unicode(ClassUnicode),
786    /// A set of characters represented by arbitrary bytes (one byte per
787    /// character).
788    Bytes(ClassBytes),
789}
790
791impl Class {
792    /// Apply Unicode simple case folding to this character class, in place.
793    /// The character class will be expanded to include all simple case folded
794    /// character variants.
795    ///
796    /// If this is a byte oriented character class, then this will be limited
797    /// to the ASCII ranges `A-Z` and `a-z`.
798    pub fn case_fold_simple(&mut self) {
799        match *self {
800            Class::Unicode(ref mut x) => x.case_fold_simple(),
801            Class::Bytes(ref mut x) => x.case_fold_simple(),
802        }
803    }
804
805    /// Negate this character class in place.
806    ///
807    /// After completion, this character class will contain precisely the
808    /// characters that weren't previously in the class.
809    pub fn negate(&mut self) {
810        match *self {
811            Class::Unicode(ref mut x) => x.negate(),
812            Class::Bytes(ref mut x) => x.negate(),
813        }
814    }
815
816    /// Returns true if and only if this character class will only ever match
817    /// valid UTF-8.
818    ///
819    /// A character class can match invalid UTF-8 only when the following
820    /// conditions are met:
821    ///
822    /// 1. The translator was configured to permit generating an expression
823    ///    that can match invalid UTF-8. (By default, this is disabled.)
824    /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
825    ///    syntax or in the parser builder. By default, Unicode mode is
826    ///    enabled.
827    pub fn is_always_utf8(&self) -> bool {
828        match *self {
829            Class::Unicode(_) => true,
830            Class::Bytes(ref x) => x.is_all_ascii(),
831        }
832    }
833}
834
835/// A set of characters represented by Unicode scalar values.
836#[derive(Clone, Debug, Eq, PartialEq)]
837pub struct ClassUnicode {
838    set: IntervalSet<ClassUnicodeRange>,
839}
840
841impl ClassUnicode {
842    /// Create a new class from a sequence of ranges.
843    ///
844    /// The given ranges do not need to be in any specific order, and ranges
845    /// may overlap.
846    pub fn new<I>(ranges: I) -> ClassUnicode
847    where
848        I: IntoIterator<Item = ClassUnicodeRange>,
849    {
850        ClassUnicode { set: IntervalSet::new(ranges) }
851    }
852
853    /// Create a new class with no ranges.
854    pub fn empty() -> ClassUnicode {
855        ClassUnicode::new(vec![])
856    }
857
858    /// Add a new range to this set.
859    pub fn push(&mut self, range: ClassUnicodeRange) {
860        self.set.push(range);
861    }
862
863    /// Return an iterator over all ranges in this class.
864    ///
865    /// The iterator yields ranges in ascending order.
866    pub fn iter(&self) -> ClassUnicodeIter<'_> {
867        ClassUnicodeIter(self.set.iter())
868    }
869
870    /// Return the underlying ranges as a slice.
871    pub fn ranges(&self) -> &[ClassUnicodeRange] {
872        self.set.intervals()
873    }
874
875    /// Expand this character class such that it contains all case folded
876    /// characters, according to Unicode's "simple" mapping. For example, if
877    /// this class consists of the range `a-z`, then applying case folding will
878    /// result in the class containing both the ranges `a-z` and `A-Z`.
879    ///
880    /// # Panics
881    ///
882    /// This routine panics when the case mapping data necessary for this
883    /// routine to complete is unavailable. This occurs when the `unicode-case`
884    /// feature is not enabled.
885    ///
886    /// Callers should prefer using `try_case_fold_simple` instead, which will
887    /// return an error instead of panicking.
888    pub fn case_fold_simple(&mut self) {
889        self.set
890            .case_fold_simple()
891            .expect("unicode-case feature must be enabled");
892    }
893
894    /// Expand this character class such that it contains all case folded
895    /// characters, according to Unicode's "simple" mapping. For example, if
896    /// this class consists of the range `a-z`, then applying case folding will
897    /// result in the class containing both the ranges `a-z` and `A-Z`.
898    ///
899    /// # Error
900    ///
901    /// This routine returns an error when the case mapping data necessary
902    /// for this routine to complete is unavailable. This occurs when the
903    /// `unicode-case` feature is not enabled.
904    pub fn try_case_fold_simple(
905        &mut self,
906    ) -> result::Result<(), CaseFoldError> {
907        self.set.case_fold_simple()
908    }
909
910    /// Negate this character class.
911    ///
912    /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
913    /// set, then it will not be in this set after negation.
914    pub fn negate(&mut self) {
915        self.set.negate();
916    }
917
918    /// Union this character class with the given character class, in place.
919    pub fn union(&mut self, other: &ClassUnicode) {
920        self.set.union(&other.set);
921    }
922
923    /// Intersect this character class with the given character class, in
924    /// place.
925    pub fn intersect(&mut self, other: &ClassUnicode) {
926        self.set.intersect(&other.set);
927    }
928
929    /// Subtract the given character class from this character class, in place.
930    pub fn difference(&mut self, other: &ClassUnicode) {
931        self.set.difference(&other.set);
932    }
933
934    /// Compute the symmetric difference of the given character classes, in
935    /// place.
936    ///
937    /// This computes the symmetric difference of two character classes. This
938    /// removes all elements in this class that are also in the given class,
939    /// but all adds all elements from the given class that aren't in this
940    /// class. That is, the class will contain all elements in either class,
941    /// but will not contain any elements that are in both classes.
942    pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
943        self.set.symmetric_difference(&other.set);
944    }
945
946    /// Returns true if and only if this character class will either match
947    /// nothing or only ASCII bytes. Stated differently, this returns false
948    /// if and only if this class contains a non-ASCII codepoint.
949    pub fn is_all_ascii(&self) -> bool {
950        self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
951    }
952}
953
954/// An iterator over all ranges in a Unicode character class.
955///
956/// The lifetime `'a` refers to the lifetime of the underlying class.
957#[derive(Debug)]
958pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
959
960impl<'a> Iterator for ClassUnicodeIter<'a> {
961    type Item = &'a ClassUnicodeRange;
962
963    fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
964        self.0.next()
965    }
966}
967
968/// A single range of characters represented by Unicode scalar values.
969///
970/// The range is closed. That is, the start and end of the range are included
971/// in the range.
972#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
973pub struct ClassUnicodeRange {
974    start: char,
975    end: char,
976}
977
978impl fmt::Debug for ClassUnicodeRange {
979    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
980        let start = if !self.start.is_whitespace() && !self.start.is_control()
981        {
982            self.start.to_string()
983        } else {
984            format!("0x{:X}", self.start as u32)
985        };
986        let end = if !self.end.is_whitespace() && !self.end.is_control() {
987            self.end.to_string()
988        } else {
989            format!("0x{:X}", self.end as u32)
990        };
991        f.debug_struct("ClassUnicodeRange")
992            .field("start", &start)
993            .field("end", &end)
994            .finish()
995    }
996}
997
998impl Interval for ClassUnicodeRange {
999    type Bound = char;
1000
1001    #[inline]
1002    fn lower(&self) -> char {
1003        self.start
1004    }
1005    #[inline]
1006    fn upper(&self) -> char {
1007        self.end
1008    }
1009    #[inline]
1010    fn set_lower(&mut self, bound: char) {
1011        self.start = bound;
1012    }
1013    #[inline]
1014    fn set_upper(&mut self, bound: char) {
1015        self.end = bound;
1016    }
1017
1018    /// Apply simple case folding to this Unicode scalar value range.
1019    ///
1020    /// Additional ranges are appended to the given vector. Canonical ordering
1021    /// is *not* maintained in the given vector.
1022    fn case_fold_simple(
1023        &self,
1024        ranges: &mut Vec<ClassUnicodeRange>,
1025    ) -> Result<(), unicode::CaseFoldError> {
1026        if !unicode::contains_simple_case_mapping(self.start, self.end)? {
1027            return Ok(());
1028        }
1029        let start = self.start as u32;
1030        let end = (self.end as u32).saturating_add(1);
1031        let mut next_simple_cp = None;
1032        for cp in (start..end).filter_map(char::from_u32) {
1033            if next_simple_cp.map_or(false, |next| cp < next) {
1034                continue;
1035            }
1036            let it = match unicode::simple_fold(cp)? {
1037                Ok(it) => it,
1038                Err(next) => {
1039                    next_simple_cp = next;
1040                    continue;
1041                }
1042            };
1043            for cp_folded in it {
1044                ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1045            }
1046        }
1047        Ok(())
1048    }
1049}
1050
1051impl ClassUnicodeRange {
1052    /// Create a new Unicode scalar value range for a character class.
1053    ///
1054    /// The returned range is always in a canonical form. That is, the range
1055    /// returned always satisfies the invariant that `start <= end`.
1056    pub fn new(start: char, end: char) -> ClassUnicodeRange {
1057        ClassUnicodeRange::create(start, end)
1058    }
1059
1060    /// Return the start of this range.
1061    ///
1062    /// The start of a range is always less than or equal to the end of the
1063    /// range.
1064    pub fn start(&self) -> char {
1065        self.start
1066    }
1067
1068    /// Return the end of this range.
1069    ///
1070    /// The end of a range is always greater than or equal to the start of the
1071    /// range.
1072    pub fn end(&self) -> char {
1073        self.end
1074    }
1075}
1076
1077/// A set of characters represented by arbitrary bytes (where one byte
1078/// corresponds to one character).
1079#[derive(Clone, Debug, Eq, PartialEq)]
1080pub struct ClassBytes {
1081    set: IntervalSet<ClassBytesRange>,
1082}
1083
1084impl ClassBytes {
1085    /// Create a new class from a sequence of ranges.
1086    ///
1087    /// The given ranges do not need to be in any specific order, and ranges
1088    /// may overlap.
1089    pub fn new<I>(ranges: I) -> ClassBytes
1090    where
1091        I: IntoIterator<Item = ClassBytesRange>,
1092    {
1093        ClassBytes { set: IntervalSet::new(ranges) }
1094    }
1095
1096    /// Create a new class with no ranges.
1097    pub fn empty() -> ClassBytes {
1098        ClassBytes::new(vec![])
1099    }
1100
1101    /// Add a new range to this set.
1102    pub fn push(&mut self, range: ClassBytesRange) {
1103        self.set.push(range);
1104    }
1105
1106    /// Return an iterator over all ranges in this class.
1107    ///
1108    /// The iterator yields ranges in ascending order.
1109    pub fn iter(&self) -> ClassBytesIter<'_> {
1110        ClassBytesIter(self.set.iter())
1111    }
1112
1113    /// Return the underlying ranges as a slice.
1114    pub fn ranges(&self) -> &[ClassBytesRange] {
1115        self.set.intervals()
1116    }
1117
1118    /// Expand this character class such that it contains all case folded
1119    /// characters. For example, if this class consists of the range `a-z`,
1120    /// then applying case folding will result in the class containing both the
1121    /// ranges `a-z` and `A-Z`.
1122    ///
1123    /// Note that this only applies ASCII case folding, which is limited to the
1124    /// characters `a-z` and `A-Z`.
1125    pub fn case_fold_simple(&mut self) {
1126        self.set.case_fold_simple().expect("ASCII case folding never fails");
1127    }
1128
1129    /// Negate this byte class.
1130    ///
1131    /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1132    /// will not be in this set after negation.
1133    pub fn negate(&mut self) {
1134        self.set.negate();
1135    }
1136
1137    /// Union this byte class with the given byte class, in place.
1138    pub fn union(&mut self, other: &ClassBytes) {
1139        self.set.union(&other.set);
1140    }
1141
1142    /// Intersect this byte class with the given byte class, in place.
1143    pub fn intersect(&mut self, other: &ClassBytes) {
1144        self.set.intersect(&other.set);
1145    }
1146
1147    /// Subtract the given byte class from this byte class, in place.
1148    pub fn difference(&mut self, other: &ClassBytes) {
1149        self.set.difference(&other.set);
1150    }
1151
1152    /// Compute the symmetric difference of the given byte classes, in place.
1153    ///
1154    /// This computes the symmetric difference of two byte classes. This
1155    /// removes all elements in this class that are also in the given class,
1156    /// but all adds all elements from the given class that aren't in this
1157    /// class. That is, the class will contain all elements in either class,
1158    /// but will not contain any elements that are in both classes.
1159    pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1160        self.set.symmetric_difference(&other.set);
1161    }
1162
1163    /// Returns true if and only if this character class will either match
1164    /// nothing or only ASCII bytes. Stated differently, this returns false
1165    /// if and only if this class contains a non-ASCII byte.
1166    pub fn is_all_ascii(&self) -> bool {
1167        self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1168    }
1169}
1170
1171/// An iterator over all ranges in a byte character class.
1172///
1173/// The lifetime `'a` refers to the lifetime of the underlying class.
1174#[derive(Debug)]
1175pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1176
1177impl<'a> Iterator for ClassBytesIter<'a> {
1178    type Item = &'a ClassBytesRange;
1179
1180    fn next(&mut self) -> Option<&'a ClassBytesRange> {
1181        self.0.next()
1182    }
1183}
1184
1185/// A single range of characters represented by arbitrary bytes.
1186///
1187/// The range is closed. That is, the start and end of the range are included
1188/// in the range.
1189#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1190pub struct ClassBytesRange {
1191    start: u8,
1192    end: u8,
1193}
1194
1195impl Interval for ClassBytesRange {
1196    type Bound = u8;
1197
1198    #[inline]
1199    fn lower(&self) -> u8 {
1200        self.start
1201    }
1202    #[inline]
1203    fn upper(&self) -> u8 {
1204        self.end
1205    }
1206    #[inline]
1207    fn set_lower(&mut self, bound: u8) {
1208        self.start = bound;
1209    }
1210    #[inline]
1211    fn set_upper(&mut self, bound: u8) {
1212        self.end = bound;
1213    }
1214
1215    /// Apply simple case folding to this byte range. Only ASCII case mappings
1216    /// (for a-z) are applied.
1217    ///
1218    /// Additional ranges are appended to the given vector. Canonical ordering
1219    /// is *not* maintained in the given vector.
1220    fn case_fold_simple(
1221        &self,
1222        ranges: &mut Vec<ClassBytesRange>,
1223    ) -> Result<(), unicode::CaseFoldError> {
1224        if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1225            let lower = cmp::max(self.start, b'a');
1226            let upper = cmp::min(self.end, b'z');
1227            ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1228        }
1229        if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1230            let lower = cmp::max(self.start, b'A');
1231            let upper = cmp::min(self.end, b'Z');
1232            ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1233        }
1234        Ok(())
1235    }
1236}
1237
1238impl ClassBytesRange {
1239    /// Create a new byte range for a character class.
1240    ///
1241    /// The returned range is always in a canonical form. That is, the range
1242    /// returned always satisfies the invariant that `start <= end`.
1243    pub fn new(start: u8, end: u8) -> ClassBytesRange {
1244        ClassBytesRange::create(start, end)
1245    }
1246
1247    /// Return the start of this range.
1248    ///
1249    /// The start of a range is always less than or equal to the end of the
1250    /// range.
1251    pub fn start(&self) -> u8 {
1252        self.start
1253    }
1254
1255    /// Return the end of this range.
1256    ///
1257    /// The end of a range is always greater than or equal to the start of the
1258    /// range.
1259    pub fn end(&self) -> u8 {
1260        self.end
1261    }
1262}
1263
1264impl fmt::Debug for ClassBytesRange {
1265    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1266        let mut debug = f.debug_struct("ClassBytesRange");
1267        if self.start <= 0x7F {
1268            debug.field("start", &(self.start as char));
1269        } else {
1270            debug.field("start", &self.start);
1271        }
1272        if self.end <= 0x7F {
1273            debug.field("end", &(self.end as char));
1274        } else {
1275            debug.field("end", &self.end);
1276        }
1277        debug.finish()
1278    }
1279}
1280
1281/// The high-level intermediate representation for an anchor assertion.
1282///
1283/// A matching anchor assertion is always zero-length.
1284#[derive(Clone, Debug, Eq, PartialEq)]
1285pub enum Anchor {
1286    /// Match the beginning of a line or the beginning of text. Specifically,
1287    /// this matches at the starting position of the input, or at the position
1288    /// immediately following a `\n` character.
1289    StartLine,
1290    /// Match the end of a line or the end of text. Specifically,
1291    /// this matches at the end position of the input, or at the position
1292    /// immediately preceding a `\n` character.
1293    EndLine,
1294    /// Match the beginning of text. Specifically, this matches at the starting
1295    /// position of the input.
1296    StartText,
1297    /// Match the end of text. Specifically, this matches at the ending
1298    /// position of the input.
1299    EndText,
1300}
1301
1302/// The high-level intermediate representation for a word-boundary assertion.
1303///
1304/// A matching word boundary assertion is always zero-length.
1305#[derive(Clone, Debug, Eq, PartialEq)]
1306pub enum WordBoundary {
1307    /// Match a Unicode-aware word boundary. That is, this matches a position
1308    /// where the left adjacent character and right adjacent character
1309    /// correspond to a word and non-word or a non-word and word character.
1310    Unicode,
1311    /// Match a Unicode-aware negation of a word boundary.
1312    UnicodeNegate,
1313    /// Match an ASCII-only word boundary. That is, this matches a position
1314    /// where the left adjacent character and right adjacent character
1315    /// correspond to a word and non-word or a non-word and word character.
1316    Ascii,
1317    /// Match an ASCII-only negation of a word boundary.
1318    AsciiNegate,
1319}
1320
1321impl WordBoundary {
1322    /// Returns true if and only if this word boundary assertion is negated.
1323    pub fn is_negated(&self) -> bool {
1324        match *self {
1325            WordBoundary::Unicode | WordBoundary::Ascii => false,
1326            WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
1327        }
1328    }
1329}
1330
1331/// The high-level intermediate representation for a group.
1332///
1333/// This represents one of three possible group types:
1334///
1335/// 1. A non-capturing group (e.g., `(?:expr)`).
1336/// 2. A capturing group (e.g., `(expr)`).
1337/// 3. A named capturing group (e.g., `(?P<name>expr)`).
1338#[derive(Clone, Debug, Eq, PartialEq)]
1339pub struct Group {
1340    /// The kind of this group. If it is a capturing group, then the kind
1341    /// contains the capture group index (and the name, if it is a named
1342    /// group).
1343    pub kind: GroupKind,
1344    /// The expression inside the capturing group, which may be empty.
1345    pub hir: Box<Hir>,
1346}
1347
1348/// The kind of group.
1349#[derive(Clone, Debug, Eq, PartialEq)]
1350pub enum GroupKind {
1351    /// A normal unnamed capturing group.
1352    ///
1353    /// The value is the capture index of the group.
1354    CaptureIndex(u32),
1355    /// A named capturing group.
1356    CaptureName {
1357        /// The name of the group.
1358        name: String,
1359        /// The capture index of the group.
1360        index: u32,
1361    },
1362    /// A non-capturing group.
1363    NonCapturing,
1364}
1365
1366/// The high-level intermediate representation of a repetition operator.
1367///
1368/// A repetition operator permits the repetition of an arbitrary
1369/// sub-expression.
1370#[derive(Clone, Debug, Eq, PartialEq)]
1371pub struct Repetition {
1372    /// The kind of this repetition operator.
1373    pub kind: RepetitionKind,
1374    /// Whether this repetition operator is greedy or not. A greedy operator
1375    /// will match as much as it can. A non-greedy operator will match as
1376    /// little as it can.
1377    ///
1378    /// Typically, operators are greedy by default and are only non-greedy when
1379    /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1380    /// not. However, this can be inverted via the `U` "ungreedy" flag.
1381    pub greedy: bool,
1382    /// The expression being repeated.
1383    pub hir: Box<Hir>,
1384}
1385
1386impl Repetition {
1387    /// Returns true if and only if this repetition operator makes it possible
1388    /// to match the empty string.
1389    ///
1390    /// Note that this is not defined inductively. For example, while `a*`
1391    /// will report `true`, `()+` will not, even though `()` matches the empty
1392    /// string and one or more occurrences of something that matches the empty
1393    /// string will always match the empty string. In order to get the
1394    /// inductive definition, see the corresponding method on
1395    /// [`Hir`](struct.Hir.html).
1396    pub fn is_match_empty(&self) -> bool {
1397        match self.kind {
1398            RepetitionKind::ZeroOrOne => true,
1399            RepetitionKind::ZeroOrMore => true,
1400            RepetitionKind::OneOrMore => false,
1401            RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
1402            RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
1403            RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
1404        }
1405    }
1406}
1407
1408/// The kind of a repetition operator.
1409#[derive(Clone, Debug, Eq, PartialEq)]
1410pub enum RepetitionKind {
1411    /// Matches a sub-expression zero or one times.
1412    ZeroOrOne,
1413    /// Matches a sub-expression zero or more times.
1414    ZeroOrMore,
1415    /// Matches a sub-expression one or more times.
1416    OneOrMore,
1417    /// Matches a sub-expression within a bounded range of times.
1418    Range(RepetitionRange),
1419}
1420
1421/// The kind of a counted repetition operator.
1422#[derive(Clone, Debug, Eq, PartialEq)]
1423pub enum RepetitionRange {
1424    /// Matches a sub-expression exactly this many times.
1425    Exactly(u32),
1426    /// Matches a sub-expression at least this many times.
1427    AtLeast(u32),
1428    /// Matches a sub-expression at least `m` times and at most `n` times.
1429    Bounded(u32, u32),
1430}
1431
1432/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1433/// space but heap space proportional to the depth of the total `Hir`.
1434impl Drop for Hir {
1435    fn drop(&mut self) {
1436        use std::mem;
1437
1438        match *self.kind() {
1439            HirKind::Empty
1440            | HirKind::Literal(_)
1441            | HirKind::Class(_)
1442            | HirKind::Anchor(_)
1443            | HirKind::WordBoundary(_) => return,
1444            HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
1445            HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
1446            HirKind::Concat(ref x) if x.is_empty() => return,
1447            HirKind::Alternation(ref x) if x.is_empty() => return,
1448            _ => {}
1449        }
1450
1451        let mut stack = vec![mem::replace(self, Hir::empty())];
1452        while let Some(mut expr) = stack.pop() {
1453            match expr.kind {
1454                HirKind::Empty
1455                | HirKind::Literal(_)
1456                | HirKind::Class(_)
1457                | HirKind::Anchor(_)
1458                | HirKind::WordBoundary(_) => {}
1459                HirKind::Group(ref mut x) => {
1460                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
1461                }
1462                HirKind::Repetition(ref mut x) => {
1463                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
1464                }
1465                HirKind::Concat(ref mut x) => {
1466                    stack.extend(x.drain(..));
1467                }
1468                HirKind::Alternation(ref mut x) => {
1469                    stack.extend(x.drain(..));
1470                }
1471            }
1472        }
1473    }
1474}
1475
1476/// A type that documents various attributes of an HIR expression.
1477///
1478/// These attributes are typically defined inductively on the HIR.
1479#[derive(Clone, Debug, Eq, PartialEq)]
1480struct HirInfo {
1481    /// Represent yes/no questions by a bitfield to conserve space, since
1482    /// this is included in every HIR expression.
1483    ///
1484    /// If more attributes need to be added, it is OK to increase the size of
1485    /// this as appropriate.
1486    bools: u16,
1487}
1488
1489// A simple macro for defining bitfield accessors/mutators.
1490macro_rules! define_bool {
1491    ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
1492        fn $is_fn_name(&self) -> bool {
1493            self.bools & (0b1 << $bit) > 0
1494        }
1495
1496        fn $set_fn_name(&mut self, yes: bool) {
1497            if yes {
1498                self.bools |= 1 << $bit;
1499            } else {
1500                self.bools &= !(1 << $bit);
1501            }
1502        }
1503    };
1504}
1505
1506impl HirInfo {
1507    fn new() -> HirInfo {
1508        HirInfo { bools: 0 }
1509    }
1510
1511    define_bool!(0, is_always_utf8, set_always_utf8);
1512    define_bool!(1, is_all_assertions, set_all_assertions);
1513    define_bool!(2, is_anchored_start, set_anchored_start);
1514    define_bool!(3, is_anchored_end, set_anchored_end);
1515    define_bool!(4, is_line_anchored_start, set_line_anchored_start);
1516    define_bool!(5, is_line_anchored_end, set_line_anchored_end);
1517    define_bool!(6, is_any_anchored_start, set_any_anchored_start);
1518    define_bool!(7, is_any_anchored_end, set_any_anchored_end);
1519    define_bool!(8, is_match_empty, set_match_empty);
1520    define_bool!(9, is_literal, set_literal);
1521    define_bool!(10, is_alternation_literal, set_alternation_literal);
1522}
1523
1524#[cfg(test)]
1525mod tests {
1526    use super::*;
1527
1528    fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
1529        let ranges: Vec<ClassUnicodeRange> = ranges
1530            .iter()
1531            .map(|&(s, e)| ClassUnicodeRange::new(s, e))
1532            .collect();
1533        ClassUnicode::new(ranges)
1534    }
1535
1536    fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
1537        let ranges: Vec<ClassBytesRange> =
1538            ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
1539        ClassBytes::new(ranges)
1540    }
1541
1542    fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
1543        cls.iter().map(|x| (x.start(), x.end())).collect()
1544    }
1545
1546    #[cfg(feature = "unicode-case")]
1547    fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
1548        let mut cls_ = cls.clone();
1549        cls_.case_fold_simple();
1550        cls_
1551    }
1552
1553    fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1554        let mut cls_ = cls1.clone();
1555        cls_.union(cls2);
1556        cls_
1557    }
1558
1559    fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1560        let mut cls_ = cls1.clone();
1561        cls_.intersect(cls2);
1562        cls_
1563    }
1564
1565    fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1566        let mut cls_ = cls1.clone();
1567        cls_.difference(cls2);
1568        cls_
1569    }
1570
1571    fn usymdifference(
1572        cls1: &ClassUnicode,
1573        cls2: &ClassUnicode,
1574    ) -> ClassUnicode {
1575        let mut cls_ = cls1.clone();
1576        cls_.symmetric_difference(cls2);
1577        cls_
1578    }
1579
1580    fn unegate(cls: &ClassUnicode) -> ClassUnicode {
1581        let mut cls_ = cls.clone();
1582        cls_.negate();
1583        cls_
1584    }
1585
1586    fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
1587        cls.iter().map(|x| (x.start(), x.end())).collect()
1588    }
1589
1590    fn bcasefold(cls: &ClassBytes) -> ClassBytes {
1591        let mut cls_ = cls.clone();
1592        cls_.case_fold_simple();
1593        cls_
1594    }
1595
1596    fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1597        let mut cls_ = cls1.clone();
1598        cls_.union(cls2);
1599        cls_
1600    }
1601
1602    fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1603        let mut cls_ = cls1.clone();
1604        cls_.intersect(cls2);
1605        cls_
1606    }
1607
1608    fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1609        let mut cls_ = cls1.clone();
1610        cls_.difference(cls2);
1611        cls_
1612    }
1613
1614    fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1615        let mut cls_ = cls1.clone();
1616        cls_.symmetric_difference(cls2);
1617        cls_
1618    }
1619
1620    fn bnegate(cls: &ClassBytes) -> ClassBytes {
1621        let mut cls_ = cls.clone();
1622        cls_.negate();
1623        cls_
1624    }
1625
1626    #[test]
1627    fn class_range_canonical_unicode() {
1628        let range = ClassUnicodeRange::new('\u{00FF}', '\0');
1629        assert_eq!('\0', range.start());
1630        assert_eq!('\u{00FF}', range.end());
1631    }
1632
1633    #[test]
1634    fn class_range_canonical_bytes() {
1635        let range = ClassBytesRange::new(b'\xFF', b'\0');
1636        assert_eq!(b'\0', range.start());
1637        assert_eq!(b'\xFF', range.end());
1638    }
1639
1640    #[test]
1641    fn class_canonicalize_unicode() {
1642        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1643        let expected = vec![('a', 'c'), ('x', 'z')];
1644        assert_eq!(expected, uranges(&cls));
1645
1646        let cls = uclass(&[('x', 'z'), ('a', 'c')]);
1647        let expected = vec![('a', 'c'), ('x', 'z')];
1648        assert_eq!(expected, uranges(&cls));
1649
1650        let cls = uclass(&[('x', 'z'), ('w', 'y')]);
1651        let expected = vec![('w', 'z')];
1652        assert_eq!(expected, uranges(&cls));
1653
1654        let cls = uclass(&[
1655            ('c', 'f'),
1656            ('a', 'g'),
1657            ('d', 'j'),
1658            ('a', 'c'),
1659            ('m', 'p'),
1660            ('l', 's'),
1661        ]);
1662        let expected = vec![('a', 'j'), ('l', 's')];
1663        assert_eq!(expected, uranges(&cls));
1664
1665        let cls = uclass(&[('x', 'z'), ('u', 'w')]);
1666        let expected = vec![('u', 'z')];
1667        assert_eq!(expected, uranges(&cls));
1668
1669        let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1670        let expected = vec![('\x00', '\u{10FFFF}')];
1671        assert_eq!(expected, uranges(&cls));
1672
1673        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1674        let expected = vec![('a', 'b')];
1675        assert_eq!(expected, uranges(&cls));
1676    }
1677
1678    #[test]
1679    fn class_canonicalize_bytes() {
1680        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1681        let expected = vec![(b'a', b'c'), (b'x', b'z')];
1682        assert_eq!(expected, branges(&cls));
1683
1684        let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
1685        let expected = vec![(b'a', b'c'), (b'x', b'z')];
1686        assert_eq!(expected, branges(&cls));
1687
1688        let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
1689        let expected = vec![(b'w', b'z')];
1690        assert_eq!(expected, branges(&cls));
1691
1692        let cls = bclass(&[
1693            (b'c', b'f'),
1694            (b'a', b'g'),
1695            (b'd', b'j'),
1696            (b'a', b'c'),
1697            (b'm', b'p'),
1698            (b'l', b's'),
1699        ]);
1700        let expected = vec![(b'a', b'j'), (b'l', b's')];
1701        assert_eq!(expected, branges(&cls));
1702
1703        let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
1704        let expected = vec![(b'u', b'z')];
1705        assert_eq!(expected, branges(&cls));
1706
1707        let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
1708        let expected = vec![(b'\x00', b'\xFF')];
1709        assert_eq!(expected, branges(&cls));
1710
1711        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1712        let expected = vec![(b'a', b'b')];
1713        assert_eq!(expected, branges(&cls));
1714    }
1715
1716    #[test]
1717    #[cfg(feature = "unicode-case")]
1718    fn class_case_fold_unicode() {
1719        let cls = uclass(&[
1720            ('C', 'F'),
1721            ('A', 'G'),
1722            ('D', 'J'),
1723            ('A', 'C'),
1724            ('M', 'P'),
1725            ('L', 'S'),
1726            ('c', 'f'),
1727        ]);
1728        let expected = uclass(&[
1729            ('A', 'J'),
1730            ('L', 'S'),
1731            ('a', 'j'),
1732            ('l', 's'),
1733            ('\u{17F}', '\u{17F}'),
1734        ]);
1735        assert_eq!(expected, ucasefold(&cls));
1736
1737        let cls = uclass(&[('A', 'Z')]);
1738        let expected = uclass(&[
1739            ('A', 'Z'),
1740            ('a', 'z'),
1741            ('\u{17F}', '\u{17F}'),
1742            ('\u{212A}', '\u{212A}'),
1743        ]);
1744        assert_eq!(expected, ucasefold(&cls));
1745
1746        let cls = uclass(&[('a', 'z')]);
1747        let expected = uclass(&[
1748            ('A', 'Z'),
1749            ('a', 'z'),
1750            ('\u{17F}', '\u{17F}'),
1751            ('\u{212A}', '\u{212A}'),
1752        ]);
1753        assert_eq!(expected, ucasefold(&cls));
1754
1755        let cls = uclass(&[('A', 'A'), ('_', '_')]);
1756        let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
1757        assert_eq!(expected, ucasefold(&cls));
1758
1759        let cls = uclass(&[('A', 'A'), ('=', '=')]);
1760        let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
1761        assert_eq!(expected, ucasefold(&cls));
1762
1763        let cls = uclass(&[('\x00', '\x10')]);
1764        assert_eq!(cls, ucasefold(&cls));
1765
1766        let cls = uclass(&[('k', 'k')]);
1767        let expected =
1768            uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
1769        assert_eq!(expected, ucasefold(&cls));
1770
1771        let cls = uclass(&[('@', '@')]);
1772        assert_eq!(cls, ucasefold(&cls));
1773    }
1774
1775    #[test]
1776    #[cfg(not(feature = "unicode-case"))]
1777    fn class_case_fold_unicode_disabled() {
1778        let mut cls = uclass(&[
1779            ('C', 'F'),
1780            ('A', 'G'),
1781            ('D', 'J'),
1782            ('A', 'C'),
1783            ('M', 'P'),
1784            ('L', 'S'),
1785            ('c', 'f'),
1786        ]);
1787        assert!(cls.try_case_fold_simple().is_err());
1788    }
1789
1790    #[test]
1791    #[should_panic]
1792    #[cfg(not(feature = "unicode-case"))]
1793    fn class_case_fold_unicode_disabled_panics() {
1794        let mut cls = uclass(&[
1795            ('C', 'F'),
1796            ('A', 'G'),
1797            ('D', 'J'),
1798            ('A', 'C'),
1799            ('M', 'P'),
1800            ('L', 'S'),
1801            ('c', 'f'),
1802        ]);
1803        cls.case_fold_simple();
1804    }
1805
1806    #[test]
1807    fn class_case_fold_bytes() {
1808        let cls = bclass(&[
1809            (b'C', b'F'),
1810            (b'A', b'G'),
1811            (b'D', b'J'),
1812            (b'A', b'C'),
1813            (b'M', b'P'),
1814            (b'L', b'S'),
1815            (b'c', b'f'),
1816        ]);
1817        let expected =
1818            bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
1819        assert_eq!(expected, bcasefold(&cls));
1820
1821        let cls = bclass(&[(b'A', b'Z')]);
1822        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1823        assert_eq!(expected, bcasefold(&cls));
1824
1825        let cls = bclass(&[(b'a', b'z')]);
1826        let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1827        assert_eq!(expected, bcasefold(&cls));
1828
1829        let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1830        let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
1831        assert_eq!(expected, bcasefold(&cls));
1832
1833        let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1834        let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
1835        assert_eq!(expected, bcasefold(&cls));
1836
1837        let cls = bclass(&[(b'\x00', b'\x10')]);
1838        assert_eq!(cls, bcasefold(&cls));
1839
1840        let cls = bclass(&[(b'k', b'k')]);
1841        let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
1842        assert_eq!(expected, bcasefold(&cls));
1843
1844        let cls = bclass(&[(b'@', b'@')]);
1845        assert_eq!(cls, bcasefold(&cls));
1846    }
1847
1848    #[test]
1849    fn class_negate_unicode() {
1850        let cls = uclass(&[('a', 'a')]);
1851        let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
1852        assert_eq!(expected, unegate(&cls));
1853
1854        let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1855        let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
1856        assert_eq!(expected, unegate(&cls));
1857
1858        let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1859        let expected = uclass(&[
1860            ('\x00', '\x60'),
1861            ('\x64', '\x77'),
1862            ('\x7B', '\u{10FFFF}'),
1863        ]);
1864        assert_eq!(expected, unegate(&cls));
1865
1866        let cls = uclass(&[('\x00', 'a')]);
1867        let expected = uclass(&[('\x62', '\u{10FFFF}')]);
1868        assert_eq!(expected, unegate(&cls));
1869
1870        let cls = uclass(&[('a', '\u{10FFFF}')]);
1871        let expected = uclass(&[('\x00', '\x60')]);
1872        assert_eq!(expected, unegate(&cls));
1873
1874        let cls = uclass(&[('\x00', '\u{10FFFF}')]);
1875        let expected = uclass(&[]);
1876        assert_eq!(expected, unegate(&cls));
1877
1878        let cls = uclass(&[]);
1879        let expected = uclass(&[('\x00', '\u{10FFFF}')]);
1880        assert_eq!(expected, unegate(&cls));
1881
1882        let cls =
1883            uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
1884        let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
1885        assert_eq!(expected, unegate(&cls));
1886
1887        let cls = uclass(&[('\x00', '\u{D7FF}')]);
1888        let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1889        assert_eq!(expected, unegate(&cls));
1890
1891        let cls = uclass(&[('\x00', '\u{D7FE}')]);
1892        let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
1893        assert_eq!(expected, unegate(&cls));
1894
1895        let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1896        let expected = uclass(&[('\x00', '\u{D7FF}')]);
1897        assert_eq!(expected, unegate(&cls));
1898
1899        let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
1900        let expected = uclass(&[('\x00', '\u{E000}')]);
1901        assert_eq!(expected, unegate(&cls));
1902    }
1903
1904    #[test]
1905    fn class_negate_bytes() {
1906        let cls = bclass(&[(b'a', b'a')]);
1907        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
1908        assert_eq!(expected, bnegate(&cls));
1909
1910        let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1911        let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
1912        assert_eq!(expected, bnegate(&cls));
1913
1914        let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1915        let expected = bclass(&[
1916            (b'\x00', b'\x60'),
1917            (b'\x64', b'\x77'),
1918            (b'\x7B', b'\xFF'),
1919        ]);
1920        assert_eq!(expected, bnegate(&cls));
1921
1922        let cls = bclass(&[(b'\x00', b'a')]);
1923        let expected = bclass(&[(b'\x62', b'\xFF')]);
1924        assert_eq!(expected, bnegate(&cls));
1925
1926        let cls = bclass(&[(b'a', b'\xFF')]);
1927        let expected = bclass(&[(b'\x00', b'\x60')]);
1928        assert_eq!(expected, bnegate(&cls));
1929
1930        let cls = bclass(&[(b'\x00', b'\xFF')]);
1931        let expected = bclass(&[]);
1932        assert_eq!(expected, bnegate(&cls));
1933
1934        let cls = bclass(&[]);
1935        let expected = bclass(&[(b'\x00', b'\xFF')]);
1936        assert_eq!(expected, bnegate(&cls));
1937
1938        let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
1939        let expected = bclass(&[(b'\xFE', b'\xFE')]);
1940        assert_eq!(expected, bnegate(&cls));
1941    }
1942
1943    #[test]
1944    fn class_union_unicode() {
1945        let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
1946        let cls2 = uclass(&[('a', 'z')]);
1947        let expected = uclass(&[('a', 'z'), ('A', 'C')]);
1948        assert_eq!(expected, uunion(&cls1, &cls2));
1949    }
1950
1951    #[test]
1952    fn class_union_bytes() {
1953        let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
1954        let cls2 = bclass(&[(b'a', b'z')]);
1955        let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
1956        assert_eq!(expected, bunion(&cls1, &cls2));
1957    }
1958
1959    #[test]
1960    fn class_intersect_unicode() {
1961        let cls1 = uclass(&[]);
1962        let cls2 = uclass(&[('a', 'a')]);
1963        let expected = uclass(&[]);
1964        assert_eq!(expected, uintersect(&cls1, &cls2));
1965
1966        let cls1 = uclass(&[('a', 'a')]);
1967        let cls2 = uclass(&[('a', 'a')]);
1968        let expected = uclass(&[('a', 'a')]);
1969        assert_eq!(expected, uintersect(&cls1, &cls2));
1970
1971        let cls1 = uclass(&[('a', 'a')]);
1972        let cls2 = uclass(&[('b', 'b')]);
1973        let expected = uclass(&[]);
1974        assert_eq!(expected, uintersect(&cls1, &cls2));
1975
1976        let cls1 = uclass(&[('a', 'a')]);
1977        let cls2 = uclass(&[('a', 'c')]);
1978        let expected = uclass(&[('a', 'a')]);
1979        assert_eq!(expected, uintersect(&cls1, &cls2));
1980
1981        let cls1 = uclass(&[('a', 'b')]);
1982        let cls2 = uclass(&[('a', 'c')]);
1983        let expected = uclass(&[('a', 'b')]);
1984        assert_eq!(expected, uintersect(&cls1, &cls2));
1985
1986        let cls1 = uclass(&[('a', 'b')]);
1987        let cls2 = uclass(&[('b', 'c')]);
1988        let expected = uclass(&[('b', 'b')]);
1989        assert_eq!(expected, uintersect(&cls1, &cls2));
1990
1991        let cls1 = uclass(&[('a', 'b')]);
1992        let cls2 = uclass(&[('c', 'd')]);
1993        let expected = uclass(&[]);
1994        assert_eq!(expected, uintersect(&cls1, &cls2));
1995
1996        let cls1 = uclass(&[('b', 'c')]);
1997        let cls2 = uclass(&[('a', 'd')]);
1998        let expected = uclass(&[('b', 'c')]);
1999        assert_eq!(expected, uintersect(&cls1, &cls2));
2000
2001        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2002        let cls2 = uclass(&[('a', 'h')]);
2003        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2004        assert_eq!(expected, uintersect(&cls1, &cls2));
2005
2006        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2007        let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2008        let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2009        assert_eq!(expected, uintersect(&cls1, &cls2));
2010
2011        let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
2012        let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
2013        let expected = uclass(&[]);
2014        assert_eq!(expected, uintersect(&cls1, &cls2));
2015
2016        let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2017        let cls2 = uclass(&[('h', 'h')]);
2018        let expected = uclass(&[('h', 'h')]);
2019        assert_eq!(expected, uintersect(&cls1, &cls2));
2020
2021        let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
2022        let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
2023        let expected = uclass(&[]);
2024        assert_eq!(expected, uintersect(&cls1, &cls2));
2025
2026        let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
2027        let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
2028        let expected = uclass(&[('b', 'f')]);
2029        assert_eq!(expected, uintersect(&cls1, &cls2));
2030    }
2031
2032    #[test]
2033    fn class_intersect_bytes() {
2034        let cls1 = bclass(&[]);
2035        let cls2 = bclass(&[(b'a', b'a')]);
2036        let expected = bclass(&[]);
2037        assert_eq!(expected, bintersect(&cls1, &cls2));
2038
2039        let cls1 = bclass(&[(b'a', b'a')]);
2040        let cls2 = bclass(&[(b'a', b'a')]);
2041        let expected = bclass(&[(b'a', b'a')]);
2042        assert_eq!(expected, bintersect(&cls1, &cls2));
2043
2044        let cls1 = bclass(&[(b'a', b'a')]);
2045        let cls2 = bclass(&[(b'b', b'b')]);
2046        let expected = bclass(&[]);
2047        assert_eq!(expected, bintersect(&cls1, &cls2));
2048
2049        let cls1 = bclass(&[(b'a', b'a')]);
2050        let cls2 = bclass(&[(b'a', b'c')]);
2051        let expected = bclass(&[(b'a', b'a')]);
2052        assert_eq!(expected, bintersect(&cls1, &cls2));
2053
2054        let cls1 = bclass(&[(b'a', b'b')]);
2055        let cls2 = bclass(&[(b'a', b'c')]);
2056        let expected = bclass(&[(b'a', b'b')]);
2057        assert_eq!(expected, bintersect(&cls1, &cls2));
2058
2059        let cls1 = bclass(&[(b'a', b'b')]);
2060        let cls2 = bclass(&[(b'b', b'c')]);
2061        let expected = bclass(&[(b'b', b'b')]);
2062        assert_eq!(expected, bintersect(&cls1, &cls2));
2063
2064        let cls1 = bclass(&[(b'a', b'b')]);
2065        let cls2 = bclass(&[(b'c', b'd')]);
2066        let expected = bclass(&[]);
2067        assert_eq!(expected, bintersect(&cls1, &cls2));
2068
2069        let cls1 = bclass(&[(b'b', b'c')]);
2070        let cls2 = bclass(&[(b'a', b'd')]);
2071        let expected = bclass(&[(b'b', b'c')]);
2072        assert_eq!(expected, bintersect(&cls1, &cls2));
2073
2074        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2075        let cls2 = bclass(&[(b'a', b'h')]);
2076        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2077        assert_eq!(expected, bintersect(&cls1, &cls2));
2078
2079        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2080        let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2081        let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2082        assert_eq!(expected, bintersect(&cls1, &cls2));
2083
2084        let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
2085        let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
2086        let expected = bclass(&[]);
2087        assert_eq!(expected, bintersect(&cls1, &cls2));
2088
2089        let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2090        let cls2 = bclass(&[(b'h', b'h')]);
2091        let expected = bclass(&[(b'h', b'h')]);
2092        assert_eq!(expected, bintersect(&cls1, &cls2));
2093
2094        let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
2095        let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
2096        let expected = bclass(&[]);
2097        assert_eq!(expected, bintersect(&cls1, &cls2));
2098
2099        let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
2100        let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
2101        let expected = bclass(&[(b'b', b'f')]);
2102        assert_eq!(expected, bintersect(&cls1, &cls2));
2103    }
2104
2105    #[test]
2106    fn class_difference_unicode() {
2107        let cls1 = uclass(&[('a', 'a')]);
2108        let cls2 = uclass(&[('a', 'a')]);
2109        let expected = uclass(&[]);
2110        assert_eq!(expected, udifference(&cls1, &cls2));
2111
2112        let cls1 = uclass(&[('a', 'a')]);
2113        let cls2 = uclass(&[]);
2114        let expected = uclass(&[('a', 'a')]);
2115        assert_eq!(expected, udifference(&cls1, &cls2));
2116
2117        let cls1 = uclass(&[]);
2118        let cls2 = uclass(&[('a', 'a')]);
2119        let expected = uclass(&[]);
2120        assert_eq!(expected, udifference(&cls1, &cls2));
2121
2122        let cls1 = uclass(&[('a', 'z')]);
2123        let cls2 = uclass(&[('a', 'a')]);
2124        let expected = uclass(&[('b', 'z')]);
2125        assert_eq!(expected, udifference(&cls1, &cls2));
2126
2127        let cls1 = uclass(&[('a', 'z')]);
2128        let cls2 = uclass(&[('z', 'z')]);
2129        let expected = uclass(&[('a', 'y')]);
2130        assert_eq!(expected, udifference(&cls1, &cls2));
2131
2132        let cls1 = uclass(&[('a', 'z')]);
2133        let cls2 = uclass(&[('m', 'm')]);
2134        let expected = uclass(&[('a', 'l'), ('n', 'z')]);
2135        assert_eq!(expected, udifference(&cls1, &cls2));
2136
2137        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2138        let cls2 = uclass(&[('a', 'z')]);
2139        let expected = uclass(&[]);
2140        assert_eq!(expected, udifference(&cls1, &cls2));
2141
2142        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2143        let cls2 = uclass(&[('d', 'v')]);
2144        let expected = uclass(&[('a', 'c')]);
2145        assert_eq!(expected, udifference(&cls1, &cls2));
2146
2147        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2148        let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
2149        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2150        assert_eq!(expected, udifference(&cls1, &cls2));
2151
2152        let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2153        let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
2154        let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2155        assert_eq!(expected, udifference(&cls1, &cls2));
2156
2157        let cls1 = uclass(&[('x', 'z')]);
2158        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2159        let expected = uclass(&[('x', 'z')]);
2160        assert_eq!(expected, udifference(&cls1, &cls2));
2161
2162        let cls1 = uclass(&[('a', 'z')]);
2163        let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2164        let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
2165        assert_eq!(expected, udifference(&cls1, &cls2));
2166    }
2167
2168    #[test]
2169    fn class_difference_bytes() {
2170        let cls1 = bclass(&[(b'a', b'a')]);
2171        let cls2 = bclass(&[(b'a', b'a')]);
2172        let expected = bclass(&[]);
2173        assert_eq!(expected, bdifference(&cls1, &cls2));
2174
2175        let cls1 = bclass(&[(b'a', b'a')]);
2176        let cls2 = bclass(&[]);
2177        let expected = bclass(&[(b'a', b'a')]);
2178        assert_eq!(expected, bdifference(&cls1, &cls2));
2179
2180        let cls1 = bclass(&[]);
2181        let cls2 = bclass(&[(b'a', b'a')]);
2182        let expected = bclass(&[]);
2183        assert_eq!(expected, bdifference(&cls1, &cls2));
2184
2185        let cls1 = bclass(&[(b'a', b'z')]);
2186        let cls2 = bclass(&[(b'a', b'a')]);
2187        let expected = bclass(&[(b'b', b'z')]);
2188        assert_eq!(expected, bdifference(&cls1, &cls2));
2189
2190        let cls1 = bclass(&[(b'a', b'z')]);
2191        let cls2 = bclass(&[(b'z', b'z')]);
2192        let expected = bclass(&[(b'a', b'y')]);
2193        assert_eq!(expected, bdifference(&cls1, &cls2));
2194
2195        let cls1 = bclass(&[(b'a', b'z')]);
2196        let cls2 = bclass(&[(b'm', b'm')]);
2197        let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
2198        assert_eq!(expected, bdifference(&cls1, &cls2));
2199
2200        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2201        let cls2 = bclass(&[(b'a', b'z')]);
2202        let expected = bclass(&[]);
2203        assert_eq!(expected, bdifference(&cls1, &cls2));
2204
2205        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2206        let cls2 = bclass(&[(b'd', b'v')]);
2207        let expected = bclass(&[(b'a', b'c')]);
2208        assert_eq!(expected, bdifference(&cls1, &cls2));
2209
2210        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2211        let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
2212        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2213        assert_eq!(expected, bdifference(&cls1, &cls2));
2214
2215        let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2216        let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
2217        let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2218        assert_eq!(expected, bdifference(&cls1, &cls2));
2219
2220        let cls1 = bclass(&[(b'x', b'z')]);
2221        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2222        let expected = bclass(&[(b'x', b'z')]);
2223        assert_eq!(expected, bdifference(&cls1, &cls2));
2224
2225        let cls1 = bclass(&[(b'a', b'z')]);
2226        let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2227        let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
2228        assert_eq!(expected, bdifference(&cls1, &cls2));
2229    }
2230
2231    #[test]
2232    fn class_symmetric_difference_unicode() {
2233        let cls1 = uclass(&[('a', 'm')]);
2234        let cls2 = uclass(&[('g', 't')]);
2235        let expected = uclass(&[('a', 'f'), ('n', 't')]);
2236        assert_eq!(expected, usymdifference(&cls1, &cls2));
2237    }
2238
2239    #[test]
2240    fn class_symmetric_difference_bytes() {
2241        let cls1 = bclass(&[(b'a', b'm')]);
2242        let cls2 = bclass(&[(b'g', b't')]);
2243        let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
2244        assert_eq!(expected, bsymdifference(&cls1, &cls2));
2245    }
2246
2247    #[test]
2248    #[should_panic]
2249    fn hir_byte_literal_non_ascii() {
2250        Hir::literal(Literal::Byte(b'a'));
2251    }
2252
2253    // We use a thread with an explicit stack size to test that our destructor
2254    // for Hir can handle arbitrarily sized expressions in constant stack
2255    // space. In case we run on a platform without threads (WASM?), we limit
2256    // this test to Windows/Unix.
2257    #[test]
2258    #[cfg(any(unix, windows))]
2259    fn no_stack_overflow_on_drop() {
2260        use std::thread;
2261
2262        let run = || {
2263            let mut expr = Hir::empty();
2264            for _ in 0..100 {
2265                expr = Hir::group(Group {
2266                    kind: GroupKind::NonCapturing,
2267                    hir: Box::new(expr),
2268                });
2269                expr = Hir::repetition(Repetition {
2270                    kind: RepetitionKind::ZeroOrOne,
2271                    greedy: true,
2272                    hir: Box::new(expr),
2273                });
2274
2275                expr = Hir {
2276                    kind: HirKind::Concat(vec![expr]),
2277                    info: HirInfo::new(),
2278                };
2279                expr = Hir {
2280                    kind: HirKind::Alternation(vec![expr]),
2281                    info: HirInfo::new(),
2282                };
2283            }
2284            assert!(!expr.kind.is_empty());
2285        };
2286
2287        // We run our test on a thread with a small stack size so we can
2288        // force the issue more easily.
2289        thread::Builder::new()
2290            .stack_size(1 << 10)
2291            .spawn(run)
2292            .unwrap()
2293            .join()
2294            .unwrap();
2295    }
2296}
regex_syntax/hir/mod.rs

regex_syntax/hir/
mod.rs