regex_syntax/ast/
mod.rs

1/*!
2Defines an abstract syntax for regular expressions.
3*/
4
5use std::cmp::Ordering;
6use std::error;
7use std::fmt;
8
9pub use crate::ast::visitor::{visit, Visitor};
10
11pub mod parse;
12pub mod print;
13mod visitor;
14
15/// An error that occurred while parsing a regular expression into an abstract
16/// syntax tree.
17///
18/// Note that not all ASTs represents a valid regular expression. For example,
19/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
20/// valid Unicode property name. That particular error is reported when
21/// translating an AST to the high-level intermediate representation (`HIR`).
22#[derive(Clone, Debug, Eq, PartialEq)]
23pub struct Error {
24    /// The kind of error.
25    kind: ErrorKind,
26    /// The original pattern that the parser generated the error from. Every
27    /// span in an error is a valid range into this string.
28    pattern: String,
29    /// The span of this error.
30    span: Span,
31}
32
33impl Error {
34    /// Return the type of this error.
35    pub fn kind(&self) -> &ErrorKind {
36        &self.kind
37    }
38
39    /// The original pattern string in which this error occurred.
40    ///
41    /// Every span reported by this error is reported in terms of this string.
42    pub fn pattern(&self) -> &str {
43        &self.pattern
44    }
45
46    /// Return the span at which this error occurred.
47    pub fn span(&self) -> &Span {
48        &self.span
49    }
50
51    /// Return an auxiliary span. This span exists only for some errors that
52    /// benefit from being able to point to two locations in the original
53    /// regular expression. For example, "duplicate" errors will have the
54    /// main error position set to the duplicate occurrence while its
55    /// auxiliary span will be set to the initial occurrence.
56    pub fn auxiliary_span(&self) -> Option<&Span> {
57        use self::ErrorKind::*;
58        match self.kind {
59            FlagDuplicate { ref original } => Some(original),
60            FlagRepeatedNegation { ref original, .. } => Some(original),
61            GroupNameDuplicate { ref original, .. } => Some(original),
62            _ => None,
63        }
64    }
65}
66
67/// The type of an error that occurred while building an AST.
68#[derive(Clone, Debug, Eq, PartialEq)]
69pub enum ErrorKind {
70    /// The capturing group limit was exceeded.
71    ///
72    /// Note that this represents a limit on the total number of capturing
73    /// groups in a regex and not necessarily the number of nested capturing
74    /// groups. That is, the nest limit can be low and it is still possible for
75    /// this error to occur.
76    CaptureLimitExceeded,
77    /// An invalid escape sequence was found in a character class set.
78    ClassEscapeInvalid,
79    /// An invalid character class range was found. An invalid range is any
80    /// range where the start is greater than the end.
81    ClassRangeInvalid,
82    /// An invalid range boundary was found in a character class. Range
83    /// boundaries must be a single literal codepoint, but this error indicates
84    /// that something else was found, such as a nested class.
85    ClassRangeLiteral,
86    /// An opening `[` was found with no corresponding closing `]`.
87    ClassUnclosed,
88    /// Note that this error variant is no longer used. Namely, a decimal
89    /// number can only appear as a repetition quantifier. When the number
90    /// in a repetition quantifier is empty, then it gets its own specialized
91    /// error, `RepetitionCountDecimalEmpty`.
92    DecimalEmpty,
93    /// An invalid decimal number was given where one was expected.
94    DecimalInvalid,
95    /// A bracketed hex literal was empty.
96    EscapeHexEmpty,
97    /// A bracketed hex literal did not correspond to a Unicode scalar value.
98    EscapeHexInvalid,
99    /// An invalid hexadecimal digit was found.
100    EscapeHexInvalidDigit,
101    /// EOF was found before an escape sequence was completed.
102    EscapeUnexpectedEof,
103    /// An unrecognized escape sequence.
104    EscapeUnrecognized,
105    /// A dangling negation was used when setting flags, e.g., `i-`.
106    FlagDanglingNegation,
107    /// A flag was used twice, e.g., `i-i`.
108    FlagDuplicate {
109        /// The position of the original flag. The error position
110        /// points to the duplicate flag.
111        original: Span,
112    },
113    /// The negation operator was used twice, e.g., `-i-s`.
114    FlagRepeatedNegation {
115        /// The position of the original negation operator. The error position
116        /// points to the duplicate negation operator.
117        original: Span,
118    },
119    /// Expected a flag but got EOF, e.g., `(?`.
120    FlagUnexpectedEof,
121    /// Unrecognized flag, e.g., `a`.
122    FlagUnrecognized,
123    /// A duplicate capture name was found.
124    GroupNameDuplicate {
125        /// The position of the initial occurrence of the capture name. The
126        /// error position itself points to the duplicate occurrence.
127        original: Span,
128    },
129    /// A capture group name is empty, e.g., `(?P<>abc)`.
130    GroupNameEmpty,
131    /// An invalid character was seen for a capture group name. This includes
132    /// errors where the first character is a digit (even though subsequent
133    /// characters are allowed to be digits).
134    GroupNameInvalid,
135    /// A closing `>` could not be found for a capture group name.
136    GroupNameUnexpectedEof,
137    /// An unclosed group, e.g., `(ab`.
138    ///
139    /// The span of this error corresponds to the unclosed parenthesis.
140    GroupUnclosed,
141    /// An unopened group, e.g., `ab)`.
142    GroupUnopened,
143    /// The nest limit was exceeded. The limit stored here is the limit
144    /// configured in the parser.
145    NestLimitExceeded(u32),
146    /// The range provided in a counted repetition operator is invalid. The
147    /// range is invalid if the start is greater than the end.
148    RepetitionCountInvalid,
149    /// An opening `{` was not followed by a valid decimal value.
150    /// For example, `x{}` or `x{]}` would fail.
151    RepetitionCountDecimalEmpty,
152    /// An opening `{` was found with no corresponding closing `}`.
153    RepetitionCountUnclosed,
154    /// A repetition operator was applied to a missing sub-expression. This
155    /// occurs, for example, in the regex consisting of just a `*` or even
156    /// `(?i)*`. It is, however, possible to create a repetition operating on
157    /// an empty sub-expression. For example, `()*` is still considered valid.
158    RepetitionMissing,
159    /// The Unicode class is not valid. This typically occurs when a `\p` is
160    /// followed by something other than a `{`.
161    UnicodeClassInvalid,
162    /// When octal support is disabled, this error is produced when an octal
163    /// escape is used. The octal escape is assumed to be an invocation of
164    /// a backreference, which is the common case.
165    UnsupportedBackreference,
166    /// When syntax similar to PCRE's look-around is used, this error is
167    /// returned. Some example syntaxes that are rejected include, but are
168    /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
169    /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
170    /// error is used to improve the user experience.
171    UnsupportedLookAround,
172    /// Hints that destructuring should not be exhaustive.
173    ///
174    /// This enum may grow additional variants, so this makes sure clients
175    /// don't count on exhaustive matching. (Otherwise, adding a new variant
176    /// could break existing code.)
177    #[doc(hidden)]
178    __Nonexhaustive,
179}
180
181impl error::Error for Error {
182    // TODO: Remove this method entirely on the next breaking semver release.
183    #[allow(deprecated)]
184    fn description(&self) -> &str {
185        use self::ErrorKind::*;
186        match self.kind {
187            CaptureLimitExceeded => "capture group limit exceeded",
188            ClassEscapeInvalid => "invalid escape sequence in character class",
189            ClassRangeInvalid => "invalid character class range",
190            ClassRangeLiteral => "invalid range boundary, must be a literal",
191            ClassUnclosed => "unclosed character class",
192            DecimalEmpty => "empty decimal literal",
193            DecimalInvalid => "invalid decimal literal",
194            EscapeHexEmpty => "empty hexadecimal literal",
195            EscapeHexInvalid => "invalid hexadecimal literal",
196            EscapeHexInvalidDigit => "invalid hexadecimal digit",
197            EscapeUnexpectedEof => "unexpected eof (escape sequence)",
198            EscapeUnrecognized => "unrecognized escape sequence",
199            FlagDanglingNegation => "dangling flag negation operator",
200            FlagDuplicate { .. } => "duplicate flag",
201            FlagRepeatedNegation { .. } => "repeated negation",
202            FlagUnexpectedEof => "unexpected eof (flag)",
203            FlagUnrecognized => "unrecognized flag",
204            GroupNameDuplicate { .. } => "duplicate capture group name",
205            GroupNameEmpty => "empty capture group name",
206            GroupNameInvalid => "invalid capture group name",
207            GroupNameUnexpectedEof => "unclosed capture group name",
208            GroupUnclosed => "unclosed group",
209            GroupUnopened => "unopened group",
210            NestLimitExceeded(_) => "nest limit exceeded",
211            RepetitionCountInvalid => "invalid repetition count range",
212            RepetitionCountUnclosed => "unclosed counted repetition",
213            RepetitionMissing => "repetition operator missing expression",
214            UnicodeClassInvalid => "invalid Unicode character class",
215            UnsupportedBackreference => "backreferences are not supported",
216            UnsupportedLookAround => "look-around is not supported",
217            _ => unreachable!(),
218        }
219    }
220}
221
222impl fmt::Display for Error {
223    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
224        crate::error::Formatter::from(self).fmt(f)
225    }
226}
227
228impl fmt::Display for ErrorKind {
229    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
230        use self::ErrorKind::*;
231        match *self {
232            CaptureLimitExceeded => write!(
233                f,
234                "exceeded the maximum number of \
235                 capturing groups ({})",
236                ::std::u32::MAX
237            ),
238            ClassEscapeInvalid => {
239                write!(f, "invalid escape sequence found in character class")
240            }
241            ClassRangeInvalid => write!(
242                f,
243                "invalid character class range, \
244                 the start must be <= the end"
245            ),
246            ClassRangeLiteral => {
247                write!(f, "invalid range boundary, must be a literal")
248            }
249            ClassUnclosed => write!(f, "unclosed character class"),
250            DecimalEmpty => write!(f, "decimal literal empty"),
251            DecimalInvalid => write!(f, "decimal literal invalid"),
252            EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
253            EscapeHexInvalid => {
254                write!(f, "hexadecimal literal is not a Unicode scalar value")
255            }
256            EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
257            EscapeUnexpectedEof => write!(
258                f,
259                "incomplete escape sequence, \
260                 reached end of pattern prematurely"
261            ),
262            EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
263            FlagDanglingNegation => {
264                write!(f, "dangling flag negation operator")
265            }
266            FlagDuplicate { .. } => write!(f, "duplicate flag"),
267            FlagRepeatedNegation { .. } => {
268                write!(f, "flag negation operator repeated")
269            }
270            FlagUnexpectedEof => {
271                write!(f, "expected flag but got end of regex")
272            }
273            FlagUnrecognized => write!(f, "unrecognized flag"),
274            GroupNameDuplicate { .. } => {
275                write!(f, "duplicate capture group name")
276            }
277            GroupNameEmpty => write!(f, "empty capture group name"),
278            GroupNameInvalid => write!(f, "invalid capture group character"),
279            GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
280            GroupUnclosed => write!(f, "unclosed group"),
281            GroupUnopened => write!(f, "unopened group"),
282            NestLimitExceeded(limit) => write!(
283                f,
284                "exceed the maximum number of \
285                 nested parentheses/brackets ({})",
286                limit
287            ),
288            RepetitionCountInvalid => write!(
289                f,
290                "invalid repetition count range, \
291                 the start must be <= the end"
292            ),
293            RepetitionCountDecimalEmpty => {
294                write!(f, "repetition quantifier expects a valid decimal")
295            }
296            RepetitionCountUnclosed => {
297                write!(f, "unclosed counted repetition")
298            }
299            RepetitionMissing => {
300                write!(f, "repetition operator missing expression")
301            }
302            UnicodeClassInvalid => {
303                write!(f, "invalid Unicode character class")
304            }
305            UnsupportedBackreference => {
306                write!(f, "backreferences are not supported")
307            }
308            UnsupportedLookAround => write!(
309                f,
310                "look-around, including look-ahead and look-behind, \
311                 is not supported"
312            ),
313            _ => unreachable!(),
314        }
315    }
316}
317
318/// Span represents the position information of a single AST item.
319///
320/// All span positions are absolute byte offsets that can be used on the
321/// original regular expression that was parsed.
322#[derive(Clone, Copy, Eq, PartialEq)]
323pub struct Span {
324    /// The start byte offset.
325    pub start: Position,
326    /// The end byte offset.
327    pub end: Position,
328}
329
330impl fmt::Debug for Span {
331    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
332        write!(f, "Span({:?}, {:?})", self.start, self.end)
333    }
334}
335
336impl Ord for Span {
337    fn cmp(&self, other: &Span) -> Ordering {
338        (&self.start, &self.end).cmp(&(&other.start, &other.end))
339    }
340}
341
342impl PartialOrd for Span {
343    fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
344        Some(self.cmp(other))
345    }
346}
347
348/// A single position in a regular expression.
349///
350/// A position encodes one half of a span, and include the byte offset, line
351/// number and column number.
352#[derive(Clone, Copy, Eq, PartialEq)]
353pub struct Position {
354    /// The absolute offset of this position, starting at `0` from the
355    /// beginning of the regular expression pattern string.
356    pub offset: usize,
357    /// The line number, starting at `1`.
358    pub line: usize,
359    /// The approximate column number, starting at `1`.
360    pub column: usize,
361}
362
363impl fmt::Debug for Position {
364    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
365        write!(
366            f,
367            "Position(o: {:?}, l: {:?}, c: {:?})",
368            self.offset, self.line, self.column
369        )
370    }
371}
372
373impl Ord for Position {
374    fn cmp(&self, other: &Position) -> Ordering {
375        self.offset.cmp(&other.offset)
376    }
377}
378
379impl PartialOrd for Position {
380    fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
381        Some(self.cmp(other))
382    }
383}
384
385impl Span {
386    /// Create a new span with the given positions.
387    pub fn new(start: Position, end: Position) -> Span {
388        Span { start, end }
389    }
390
391    /// Create a new span using the given position as the start and end.
392    pub fn splat(pos: Position) -> Span {
393        Span::new(pos, pos)
394    }
395
396    /// Create a new span by replacing the starting the position with the one
397    /// given.
398    pub fn with_start(self, pos: Position) -> Span {
399        Span { start: pos, ..self }
400    }
401
402    /// Create a new span by replacing the ending the position with the one
403    /// given.
404    pub fn with_end(self, pos: Position) -> Span {
405        Span { end: pos, ..self }
406    }
407
408    /// Returns true if and only if this span occurs on a single line.
409    pub fn is_one_line(&self) -> bool {
410        self.start.line == self.end.line
411    }
412
413    /// Returns true if and only if this span is empty. That is, it points to
414    /// a single position in the concrete syntax of a regular expression.
415    pub fn is_empty(&self) -> bool {
416        self.start.offset == self.end.offset
417    }
418}
419
420impl Position {
421    /// Create a new position with the given information.
422    ///
423    /// `offset` is the absolute offset of the position, starting at `0` from
424    /// the beginning of the regular expression pattern string.
425    ///
426    /// `line` is the line number, starting at `1`.
427    ///
428    /// `column` is the approximate column number, starting at `1`.
429    pub fn new(offset: usize, line: usize, column: usize) -> Position {
430        Position { offset, line, column }
431    }
432}
433
434/// An abstract syntax tree for a singular expression along with comments
435/// found.
436///
437/// Comments are not stored in the tree itself to avoid complexity. Each
438/// comment contains a span of precisely where it occurred in the original
439/// regular expression.
440#[derive(Clone, Debug, Eq, PartialEq)]
441pub struct WithComments {
442    /// The actual ast.
443    pub ast: Ast,
444    /// All comments found in the original regular expression.
445    pub comments: Vec<Comment>,
446}
447
448/// A comment from a regular expression with an associated span.
449///
450/// A regular expression can only contain comments when the `x` flag is
451/// enabled.
452#[derive(Clone, Debug, Eq, PartialEq)]
453pub struct Comment {
454    /// The span of this comment, including the beginning `#` and ending `\n`.
455    pub span: Span,
456    /// The comment text, starting with the first character following the `#`
457    /// and ending with the last character preceding the `\n`.
458    pub comment: String,
459}
460
461/// An abstract syntax tree for a single regular expression.
462///
463/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
464/// space proportional to the size of the `Ast`.
465///
466/// This type defines its own destructor that uses constant stack space and
467/// heap space proportional to the size of the `Ast`.
468#[derive(Clone, Debug, Eq, PartialEq)]
469pub enum Ast {
470    /// An empty regex that matches everything.
471    Empty(Span),
472    /// A set of flags, e.g., `(?is)`.
473    Flags(SetFlags),
474    /// A single character literal, which includes escape sequences.
475    Literal(Literal),
476    /// The "any character" class.
477    Dot(Span),
478    /// A single zero-width assertion.
479    Assertion(Assertion),
480    /// A single character class. This includes all forms of character classes
481    /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
482    Class(Class),
483    /// A repetition operator applied to an arbitrary regular expression.
484    Repetition(Repetition),
485    /// A grouped regular expression.
486    Group(Group),
487    /// An alternation of regular expressions.
488    Alternation(Alternation),
489    /// A concatenation of regular expressions.
490    Concat(Concat),
491}
492
493impl Ast {
494    /// Return the span of this abstract syntax tree.
495    pub fn span(&self) -> &Span {
496        match *self {
497            Ast::Empty(ref span) => span,
498            Ast::Flags(ref x) => &x.span,
499            Ast::Literal(ref x) => &x.span,
500            Ast::Dot(ref span) => span,
501            Ast::Assertion(ref x) => &x.span,
502            Ast::Class(ref x) => x.span(),
503            Ast::Repetition(ref x) => &x.span,
504            Ast::Group(ref x) => &x.span,
505            Ast::Alternation(ref x) => &x.span,
506            Ast::Concat(ref x) => &x.span,
507        }
508    }
509
510    /// Return true if and only if this Ast is empty.
511    pub fn is_empty(&self) -> bool {
512        match *self {
513            Ast::Empty(_) => true,
514            _ => false,
515        }
516    }
517
518    /// Returns true if and only if this AST has any (including possibly empty)
519    /// subexpressions.
520    fn has_subexprs(&self) -> bool {
521        match *self {
522            Ast::Empty(_)
523            | Ast::Flags(_)
524            | Ast::Literal(_)
525            | Ast::Dot(_)
526            | Ast::Assertion(_) => false,
527            Ast::Class(_)
528            | Ast::Repetition(_)
529            | Ast::Group(_)
530            | Ast::Alternation(_)
531            | Ast::Concat(_) => true,
532        }
533    }
534}
535
536/// Print a display representation of this Ast.
537///
538/// This does not preserve any of the original whitespace formatting that may
539/// have originally been present in the concrete syntax from which this Ast
540/// was generated.
541///
542/// This implementation uses constant stack space and heap space proportional
543/// to the size of the `Ast`.
544impl fmt::Display for Ast {
545    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
546        use crate::ast::print::Printer;
547        Printer::new().print(self, f)
548    }
549}
550
551/// An alternation of regular expressions.
552#[derive(Clone, Debug, Eq, PartialEq)]
553pub struct Alternation {
554    /// The span of this alternation.
555    pub span: Span,
556    /// The alternate regular expressions.
557    pub asts: Vec<Ast>,
558}
559
560impl Alternation {
561    /// Return this alternation as an AST.
562    ///
563    /// If this alternation contains zero ASTs, then Ast::Empty is
564    /// returned. If this alternation contains exactly 1 AST, then the
565    /// corresponding AST is returned. Otherwise, Ast::Alternation is returned.
566    pub fn into_ast(mut self) -> Ast {
567        match self.asts.len() {
568            0 => Ast::Empty(self.span),
569            1 => self.asts.pop().unwrap(),
570            _ => Ast::Alternation(self),
571        }
572    }
573}
574
575/// A concatenation of regular expressions.
576#[derive(Clone, Debug, Eq, PartialEq)]
577pub struct Concat {
578    /// The span of this concatenation.
579    pub span: Span,
580    /// The concatenation regular expressions.
581    pub asts: Vec<Ast>,
582}
583
584impl Concat {
585    /// Return this concatenation as an AST.
586    ///
587    /// If this concatenation contains zero ASTs, then Ast::Empty is
588    /// returned. If this concatenation contains exactly 1 AST, then the
589    /// corresponding AST is returned. Otherwise, Ast::Concat is returned.
590    pub fn into_ast(mut self) -> Ast {
591        match self.asts.len() {
592            0 => Ast::Empty(self.span),
593            1 => self.asts.pop().unwrap(),
594            _ => Ast::Concat(self),
595        }
596    }
597}
598
599/// A single literal expression.
600///
601/// A literal corresponds to a single Unicode scalar value. Literals may be
602/// represented in their literal form, e.g., `a` or in their escaped form,
603/// e.g., `\x61`.
604#[derive(Clone, Debug, Eq, PartialEq)]
605pub struct Literal {
606    /// The span of this literal.
607    pub span: Span,
608    /// The kind of this literal.
609    pub kind: LiteralKind,
610    /// The Unicode scalar value corresponding to this literal.
611    pub c: char,
612}
613
614impl Literal {
615    /// If this literal was written as a `\x` hex escape, then this returns
616    /// the corresponding byte value. Otherwise, this returns `None`.
617    pub fn byte(&self) -> Option<u8> {
618        let short_hex = LiteralKind::HexFixed(HexLiteralKind::X);
619        if self.c as u32 <= 255 && self.kind == short_hex {
620            Some(self.c as u8)
621        } else {
622            None
623        }
624    }
625}
626
627/// The kind of a single literal expression.
628#[derive(Clone, Debug, Eq, PartialEq)]
629pub enum LiteralKind {
630    /// The literal is written verbatim, e.g., `a` or `☃`.
631    Verbatim,
632    /// The literal is written as an escape because it is punctuation, e.g.,
633    /// `\*` or `\[`.
634    Punctuation,
635    /// The literal is written as an octal escape, e.g., `\141`.
636    Octal,
637    /// The literal is written as a hex code with a fixed number of digits
638    /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or
639    /// `\U00000061`.
640    HexFixed(HexLiteralKind),
641    /// The literal is written as a hex code with a bracketed number of
642    /// digits. The only restriction is that the bracketed hex code must refer
643    /// to a valid Unicode scalar value.
644    HexBrace(HexLiteralKind),
645    /// The literal is written as a specially recognized escape, e.g., `\f`
646    /// or `\n`.
647    Special(SpecialLiteralKind),
648}
649
650/// The type of a special literal.
651///
652/// A special literal is a special escape sequence recognized by the regex
653/// parser, e.g., `\f` or `\n`.
654#[derive(Clone, Debug, Eq, PartialEq)]
655pub enum SpecialLiteralKind {
656    /// Bell, spelled `\a` (`\x07`).
657    Bell,
658    /// Form feed, spelled `\f` (`\x0C`).
659    FormFeed,
660    /// Tab, spelled `\t` (`\x09`).
661    Tab,
662    /// Line feed, spelled `\n` (`\x0A`).
663    LineFeed,
664    /// Carriage return, spelled `\r` (`\x0D`).
665    CarriageReturn,
666    /// Vertical tab, spelled `\v` (`\x0B`).
667    VerticalTab,
668    /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
669    /// parsing in verbose mode.
670    Space,
671}
672
673/// The type of a Unicode hex literal.
674///
675/// Note that all variants behave the same when used with brackets. They only
676/// differ when used without brackets in the number of hex digits that must
677/// follow.
678#[derive(Clone, Debug, Eq, PartialEq)]
679pub enum HexLiteralKind {
680    /// A `\x` prefix. When used without brackets, this form is limited to
681    /// two digits.
682    X,
683    /// A `\u` prefix. When used without brackets, this form is limited to
684    /// four digits.
685    UnicodeShort,
686    /// A `\U` prefix. When used without brackets, this form is limited to
687    /// eight digits.
688    UnicodeLong,
689}
690
691impl HexLiteralKind {
692    /// The number of digits that must be used with this literal form when
693    /// used without brackets. When used with brackets, there is no
694    /// restriction on the number of digits.
695    pub fn digits(&self) -> u32 {
696        match *self {
697            HexLiteralKind::X => 2,
698            HexLiteralKind::UnicodeShort => 4,
699            HexLiteralKind::UnicodeLong => 8,
700        }
701    }
702}
703
704/// A single character class expression.
705#[derive(Clone, Debug, Eq, PartialEq)]
706pub enum Class {
707    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
708    Unicode(ClassUnicode),
709    /// A perl character class, e.g., `\d` or `\W`.
710    Perl(ClassPerl),
711    /// A bracketed character class set, which may contain zero or more
712    /// character ranges and/or zero or more nested classes. e.g.,
713    /// `[a-zA-Z\pL]`.
714    Bracketed(ClassBracketed),
715}
716
717impl Class {
718    /// Return the span of this character class.
719    pub fn span(&self) -> &Span {
720        match *self {
721            Class::Perl(ref x) => &x.span,
722            Class::Unicode(ref x) => &x.span,
723            Class::Bracketed(ref x) => &x.span,
724        }
725    }
726}
727
728/// A Perl character class.
729#[derive(Clone, Debug, Eq, PartialEq)]
730pub struct ClassPerl {
731    /// The span of this class.
732    pub span: Span,
733    /// The kind of Perl class.
734    pub kind: ClassPerlKind,
735    /// Whether the class is negated or not. e.g., `\d` is not negated but
736    /// `\D` is.
737    pub negated: bool,
738}
739
740/// The available Perl character classes.
741#[derive(Clone, Debug, Eq, PartialEq)]
742pub enum ClassPerlKind {
743    /// Decimal numbers.
744    Digit,
745    /// Whitespace.
746    Space,
747    /// Word characters.
748    Word,
749}
750
751/// An ASCII character class.
752#[derive(Clone, Debug, Eq, PartialEq)]
753pub struct ClassAscii {
754    /// The span of this class.
755    pub span: Span,
756    /// The kind of ASCII class.
757    pub kind: ClassAsciiKind,
758    /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
759    /// but `[[:^alpha:]]` is.
760    pub negated: bool,
761}
762
763/// The available ASCII character classes.
764#[derive(Clone, Debug, Eq, PartialEq)]
765pub enum ClassAsciiKind {
766    /// `[0-9A-Za-z]`
767    Alnum,
768    /// `[A-Za-z]`
769    Alpha,
770    /// `[\x00-\x7F]`
771    Ascii,
772    /// `[ \t]`
773    Blank,
774    /// `[\x00-\x1F\x7F]`
775    Cntrl,
776    /// `[0-9]`
777    Digit,
778    /// `[!-~]`
779    Graph,
780    /// `[a-z]`
781    Lower,
782    /// `[ -~]`
783    Print,
784    /// `[!-/:-@\[-`{-~]`
785    Punct,
786    /// `[\t\n\v\f\r ]`
787    Space,
788    /// `[A-Z]`
789    Upper,
790    /// `[0-9A-Za-z_]`
791    Word,
792    /// `[0-9A-Fa-f]`
793    Xdigit,
794}
795
796impl ClassAsciiKind {
797    /// Return the corresponding ClassAsciiKind variant for the given name.
798    ///
799    /// The name given should correspond to the lowercase version of the
800    /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
801    ///
802    /// If no variant with the corresponding name exists, then `None` is
803    /// returned.
804    pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
805        use self::ClassAsciiKind::*;
806        match name {
807            "alnum" => Some(Alnum),
808            "alpha" => Some(Alpha),
809            "ascii" => Some(Ascii),
810            "blank" => Some(Blank),
811            "cntrl" => Some(Cntrl),
812            "digit" => Some(Digit),
813            "graph" => Some(Graph),
814            "lower" => Some(Lower),
815            "print" => Some(Print),
816            "punct" => Some(Punct),
817            "space" => Some(Space),
818            "upper" => Some(Upper),
819            "word" => Some(Word),
820            "xdigit" => Some(Xdigit),
821            _ => None,
822        }
823    }
824}
825
826/// A Unicode character class.
827#[derive(Clone, Debug, Eq, PartialEq)]
828pub struct ClassUnicode {
829    /// The span of this class.
830    pub span: Span,
831    /// Whether this class is negated or not.
832    ///
833    /// Note: be careful when using this attribute. This specifically refers
834    /// to whether the class is written as `\p` or `\P`, where the latter
835    /// is `negated = true`. However, it also possible to write something like
836    /// `\P{scx!=Katakana}` which is actually equivalent to
837    /// `\p{scx=Katakana}` and is therefore not actually negated even though
838    /// `negated = true` here. To test whether this class is truly negated
839    /// or not, use the `is_negated` method.
840    pub negated: bool,
841    /// The kind of Unicode class.
842    pub kind: ClassUnicodeKind,
843}
844
845impl ClassUnicode {
846    /// Returns true if this class has been negated.
847    ///
848    /// Note that this takes the Unicode op into account, if it's present.
849    /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
850    pub fn is_negated(&self) -> bool {
851        match self.kind {
852            ClassUnicodeKind::NamedValue {
853                op: ClassUnicodeOpKind::NotEqual,
854                ..
855            } => !self.negated,
856            _ => self.negated,
857        }
858    }
859}
860
861/// The available forms of Unicode character classes.
862#[derive(Clone, Debug, Eq, PartialEq)]
863pub enum ClassUnicodeKind {
864    /// A one letter abbreviated class, e.g., `\pN`.
865    OneLetter(char),
866    /// A binary property, general category or script. The string may be
867    /// empty.
868    Named(String),
869    /// A property name and an associated value.
870    NamedValue {
871        /// The type of Unicode op used to associate `name` with `value`.
872        op: ClassUnicodeOpKind,
873        /// The property name (which may be empty).
874        name: String,
875        /// The property value (which may be empty).
876        value: String,
877    },
878}
879
880/// The type of op used in a Unicode character class.
881#[derive(Clone, Debug, Eq, PartialEq)]
882pub enum ClassUnicodeOpKind {
883    /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
884    Equal,
885    /// A property set to a specific value using a colon, e.g.,
886    /// `\p{scx:Katakana}`.
887    Colon,
888    /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
889    NotEqual,
890}
891
892impl ClassUnicodeOpKind {
893    /// Whether the op is an equality op or not.
894    pub fn is_equal(&self) -> bool {
895        match *self {
896            ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true,
897            _ => false,
898        }
899    }
900}
901
902/// A bracketed character class, e.g., `[a-z0-9]`.
903#[derive(Clone, Debug, Eq, PartialEq)]
904pub struct ClassBracketed {
905    /// The span of this class.
906    pub span: Span,
907    /// Whether this class is negated or not. e.g., `[a]` is not negated but
908    /// `[^a]` is.
909    pub negated: bool,
910    /// The type of this set. A set is either a normal union of things, e.g.,
911    /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
912    pub kind: ClassSet,
913}
914
915/// A character class set.
916///
917/// This type corresponds to the internal structure of a bracketed character
918/// class. That is, every bracketed character is one of two types: a union of
919/// items (literals, ranges, other bracketed classes) or a tree of binary set
920/// operations.
921#[derive(Clone, Debug, Eq, PartialEq)]
922pub enum ClassSet {
923    /// An item, which can be a single literal, range, nested character class
924    /// or a union of items.
925    Item(ClassSetItem),
926    /// A single binary operation (i.e., &&, -- or ~~).
927    BinaryOp(ClassSetBinaryOp),
928}
929
930impl ClassSet {
931    /// Build a set from a union.
932    pub fn union(ast: ClassSetUnion) -> ClassSet {
933        ClassSet::Item(ClassSetItem::Union(ast))
934    }
935
936    /// Return the span of this character class set.
937    pub fn span(&self) -> &Span {
938        match *self {
939            ClassSet::Item(ref x) => x.span(),
940            ClassSet::BinaryOp(ref x) => &x.span,
941        }
942    }
943
944    /// Return true if and only if this class set is empty.
945    fn is_empty(&self) -> bool {
946        match *self {
947            ClassSet::Item(ClassSetItem::Empty(_)) => true,
948            _ => false,
949        }
950    }
951}
952
953/// A single component of a character class set.
954#[derive(Clone, Debug, Eq, PartialEq)]
955pub enum ClassSetItem {
956    /// An empty item.
957    ///
958    /// Note that a bracketed character class cannot contain a single empty
959    /// item. Empty items can appear when using one of the binary operators.
960    /// For example, `[&&]` is the intersection of two empty classes.
961    Empty(Span),
962    /// A single literal.
963    Literal(Literal),
964    /// A range between two literals.
965    Range(ClassSetRange),
966    /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
967    Ascii(ClassAscii),
968    /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
969    Unicode(ClassUnicode),
970    /// A perl character class, e.g., `\d` or `\W`.
971    Perl(ClassPerl),
972    /// A bracketed character class set, which may contain zero or more
973    /// character ranges and/or zero or more nested classes. e.g.,
974    /// `[a-zA-Z\pL]`.
975    Bracketed(Box<ClassBracketed>),
976    /// A union of items.
977    Union(ClassSetUnion),
978}
979
980impl ClassSetItem {
981    /// Return the span of this character class set item.
982    pub fn span(&self) -> &Span {
983        match *self {
984            ClassSetItem::Empty(ref span) => span,
985            ClassSetItem::Literal(ref x) => &x.span,
986            ClassSetItem::Range(ref x) => &x.span,
987            ClassSetItem::Ascii(ref x) => &x.span,
988            ClassSetItem::Perl(ref x) => &x.span,
989            ClassSetItem::Unicode(ref x) => &x.span,
990            ClassSetItem::Bracketed(ref x) => &x.span,
991            ClassSetItem::Union(ref x) => &x.span,
992        }
993    }
994}
995
996/// A single character class range in a set.
997#[derive(Clone, Debug, Eq, PartialEq)]
998pub struct ClassSetRange {
999    /// The span of this range.
1000    pub span: Span,
1001    /// The start of this range.
1002    pub start: Literal,
1003    /// The end of this range.
1004    pub end: Literal,
1005}
1006
1007impl ClassSetRange {
1008    /// Returns true if and only if this character class range is valid.
1009    ///
1010    /// The only case where a range is invalid is if its start is greater than
1011    /// its end.
1012    pub fn is_valid(&self) -> bool {
1013        self.start.c <= self.end.c
1014    }
1015}
1016
1017/// A union of items inside a character class set.
1018#[derive(Clone, Debug, Eq, PartialEq)]
1019pub struct ClassSetUnion {
1020    /// The span of the items in this operation. e.g., the `a-z0-9` in
1021    /// `[^a-z0-9]`
1022    pub span: Span,
1023    /// The sequence of items that make up this union.
1024    pub items: Vec<ClassSetItem>,
1025}
1026
1027impl ClassSetUnion {
1028    /// Push a new item in this union.
1029    ///
1030    /// The ending position of this union's span is updated to the ending
1031    /// position of the span of the item given. If the union is empty, then
1032    /// the starting position of this union is set to the starting position
1033    /// of this item.
1034    ///
1035    /// In other words, if you only use this method to add items to a union
1036    /// and you set the spans on each item correctly, then you should never
1037    /// need to adjust the span of the union directly.
1038    pub fn push(&mut self, item: ClassSetItem) {
1039        if self.items.is_empty() {
1040            self.span.start = item.span().start;
1041        }
1042        self.span.end = item.span().end;
1043        self.items.push(item);
1044    }
1045
1046    /// Return this union as a character class set item.
1047    ///
1048    /// If this union contains zero items, then an empty union is
1049    /// returned. If this concatenation contains exactly 1 item, then the
1050    /// corresponding item is returned. Otherwise, ClassSetItem::Union is
1051    /// returned.
1052    pub fn into_item(mut self) -> ClassSetItem {
1053        match self.items.len() {
1054            0 => ClassSetItem::Empty(self.span),
1055            1 => self.items.pop().unwrap(),
1056            _ => ClassSetItem::Union(self),
1057        }
1058    }
1059}
1060
1061/// A Unicode character class set operation.
1062#[derive(Clone, Debug, Eq, PartialEq)]
1063pub struct ClassSetBinaryOp {
1064    /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
1065    pub span: Span,
1066    /// The type of this set operation.
1067    pub kind: ClassSetBinaryOpKind,
1068    /// The left hand side of the operation.
1069    pub lhs: Box<ClassSet>,
1070    /// The right hand side of the operation.
1071    pub rhs: Box<ClassSet>,
1072}
1073
1074/// The type of a Unicode character class set operation.
1075///
1076/// Note that this doesn't explicitly represent union since there is no
1077/// explicit union operator. Concatenation inside a character class corresponds
1078/// to the union operation.
1079#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1080pub enum ClassSetBinaryOpKind {
1081    /// The intersection of two sets, e.g., `\pN&&[a-z]`.
1082    Intersection,
1083    /// The difference of two sets, e.g., `\pN--[0-9]`.
1084    Difference,
1085    /// The symmetric difference of two sets. The symmetric difference is the
1086    /// set of elements belonging to one but not both sets.
1087    /// e.g., `[\pL~~[:ascii:]]`.
1088    SymmetricDifference,
1089}
1090
1091/// A single zero-width assertion.
1092#[derive(Clone, Debug, Eq, PartialEq)]
1093pub struct Assertion {
1094    /// The span of this assertion.
1095    pub span: Span,
1096    /// The assertion kind, e.g., `\b` or `^`.
1097    pub kind: AssertionKind,
1098}
1099
1100/// An assertion kind.
1101#[derive(Clone, Debug, Eq, PartialEq)]
1102pub enum AssertionKind {
1103    /// `^`
1104    StartLine,
1105    /// `$`
1106    EndLine,
1107    /// `\A`
1108    StartText,
1109    /// `\z`
1110    EndText,
1111    /// `\b`
1112    WordBoundary,
1113    /// `\B`
1114    NotWordBoundary,
1115}
1116
1117/// A repetition operation applied to a regular expression.
1118#[derive(Clone, Debug, Eq, PartialEq)]
1119pub struct Repetition {
1120    /// The span of this operation.
1121    pub span: Span,
1122    /// The actual operation.
1123    pub op: RepetitionOp,
1124    /// Whether this operation was applied greedily or not.
1125    pub greedy: bool,
1126    /// The regular expression under repetition.
1127    pub ast: Box<Ast>,
1128}
1129
1130/// The repetition operator itself.
1131#[derive(Clone, Debug, Eq, PartialEq)]
1132pub struct RepetitionOp {
1133    /// The span of this operator. This includes things like `+`, `*?` and
1134    /// `{m,n}`.
1135    pub span: Span,
1136    /// The type of operation.
1137    pub kind: RepetitionKind,
1138}
1139
1140/// The kind of a repetition operator.
1141#[derive(Clone, Debug, Eq, PartialEq)]
1142pub enum RepetitionKind {
1143    /// `?`
1144    ZeroOrOne,
1145    /// `*`
1146    ZeroOrMore,
1147    /// `+`
1148    OneOrMore,
1149    /// `{m,n}`
1150    Range(RepetitionRange),
1151}
1152
1153/// A range repetition operator.
1154#[derive(Clone, Debug, Eq, PartialEq)]
1155pub enum RepetitionRange {
1156    /// `{m}`
1157    Exactly(u32),
1158    /// `{m,}`
1159    AtLeast(u32),
1160    /// `{m,n}`
1161    Bounded(u32, u32),
1162}
1163
1164impl RepetitionRange {
1165    /// Returns true if and only if this repetition range is valid.
1166    ///
1167    /// The only case where a repetition range is invalid is if it is bounded
1168    /// and its start is greater than its end.
1169    pub fn is_valid(&self) -> bool {
1170        match *self {
1171            RepetitionRange::Bounded(s, e) if s > e => false,
1172            _ => true,
1173        }
1174    }
1175}
1176
1177/// A grouped regular expression.
1178///
1179/// This includes both capturing and non-capturing groups. This does **not**
1180/// include flag-only groups like `(?is)`, but does contain any group that
1181/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
1182/// `(?is:a)`.
1183#[derive(Clone, Debug, Eq, PartialEq)]
1184pub struct Group {
1185    /// The span of this group.
1186    pub span: Span,
1187    /// The kind of this group.
1188    pub kind: GroupKind,
1189    /// The regular expression in this group.
1190    pub ast: Box<Ast>,
1191}
1192
1193impl Group {
1194    /// If this group is non-capturing, then this returns the (possibly empty)
1195    /// set of flags. Otherwise, `None` is returned.
1196    pub fn flags(&self) -> Option<&Flags> {
1197        match self.kind {
1198            GroupKind::NonCapturing(ref flags) => Some(flags),
1199            _ => None,
1200        }
1201    }
1202
1203    /// Returns true if and only if this group is capturing.
1204    pub fn is_capturing(&self) -> bool {
1205        match self.kind {
1206            GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true,
1207            GroupKind::NonCapturing(_) => false,
1208        }
1209    }
1210
1211    /// Returns the capture index of this group, if this is a capturing group.
1212    ///
1213    /// This returns a capture index precisely when `is_capturing` is `true`.
1214    pub fn capture_index(&self) -> Option<u32> {
1215        match self.kind {
1216            GroupKind::CaptureIndex(i) => Some(i),
1217            GroupKind::CaptureName(ref x) => Some(x.index),
1218            GroupKind::NonCapturing(_) => None,
1219        }
1220    }
1221}
1222
1223/// The kind of a group.
1224#[derive(Clone, Debug, Eq, PartialEq)]
1225pub enum GroupKind {
1226    /// `(a)`
1227    CaptureIndex(u32),
1228    /// `(?P<name>a)`
1229    CaptureName(CaptureName),
1230    /// `(?:a)` and `(?i:a)`
1231    NonCapturing(Flags),
1232}
1233
1234/// A capture name.
1235///
1236/// This corresponds to the name itself between the angle brackets in, e.g.,
1237/// `(?P<foo>expr)`.
1238#[derive(Clone, Debug, Eq, PartialEq)]
1239pub struct CaptureName {
1240    /// The span of this capture name.
1241    pub span: Span,
1242    /// The capture name.
1243    pub name: String,
1244    /// The capture index.
1245    pub index: u32,
1246}
1247
1248/// A group of flags that is not applied to a particular regular expression.
1249#[derive(Clone, Debug, Eq, PartialEq)]
1250pub struct SetFlags {
1251    /// The span of these flags, including the grouping parentheses.
1252    pub span: Span,
1253    /// The actual sequence of flags.
1254    pub flags: Flags,
1255}
1256
1257/// A group of flags.
1258///
1259/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
1260#[derive(Clone, Debug, Eq, PartialEq)]
1261pub struct Flags {
1262    /// The span of this group of flags.
1263    pub span: Span,
1264    /// A sequence of flag items. Each item is either a flag or a negation
1265    /// operator.
1266    pub items: Vec<FlagsItem>,
1267}
1268
1269impl Flags {
1270    /// Add the given item to this sequence of flags.
1271    ///
1272    /// If the item was added successfully, then `None` is returned. If the
1273    /// given item is a duplicate, then `Some(i)` is returned, where
1274    /// `items[i].kind == item.kind`.
1275    pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
1276        for (i, x) in self.items.iter().enumerate() {
1277            if x.kind == item.kind {
1278                return Some(i);
1279            }
1280        }
1281        self.items.push(item);
1282        None
1283    }
1284
1285    /// Returns the state of the given flag in this set.
1286    ///
1287    /// If the given flag is in the set but is negated, then `Some(false)` is
1288    /// returned.
1289    ///
1290    /// If the given flag is in the set and is not negated, then `Some(true)`
1291    /// is returned.
1292    ///
1293    /// Otherwise, `None` is returned.
1294    pub fn flag_state(&self, flag: Flag) -> Option<bool> {
1295        let mut negated = false;
1296        for x in &self.items {
1297            match x.kind {
1298                FlagsItemKind::Negation => {
1299                    negated = true;
1300                }
1301                FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
1302                    return Some(!negated);
1303                }
1304                _ => {}
1305            }
1306        }
1307        None
1308    }
1309}
1310
1311/// A single item in a group of flags.
1312#[derive(Clone, Debug, Eq, PartialEq)]
1313pub struct FlagsItem {
1314    /// The span of this item.
1315    pub span: Span,
1316    /// The kind of this item.
1317    pub kind: FlagsItemKind,
1318}
1319
1320/// The kind of an item in a group of flags.
1321#[derive(Clone, Debug, Eq, PartialEq)]
1322pub enum FlagsItemKind {
1323    /// A negation operator applied to all subsequent flags in the enclosing
1324    /// group.
1325    Negation,
1326    /// A single flag in a group.
1327    Flag(Flag),
1328}
1329
1330impl FlagsItemKind {
1331    /// Returns true if and only if this item is a negation operator.
1332    pub fn is_negation(&self) -> bool {
1333        match *self {
1334            FlagsItemKind::Negation => true,
1335            _ => false,
1336        }
1337    }
1338}
1339
1340/// A single flag.
1341#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1342pub enum Flag {
1343    /// `i`
1344    CaseInsensitive,
1345    /// `m`
1346    MultiLine,
1347    /// `s`
1348    DotMatchesNewLine,
1349    /// `U`
1350    SwapGreed,
1351    /// `u`
1352    Unicode,
1353    /// `x`
1354    IgnoreWhitespace,
1355}
1356
1357/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
1358/// space but heap space proportional to the depth of the `Ast`.
1359impl Drop for Ast {
1360    fn drop(&mut self) {
1361        use std::mem;
1362
1363        match *self {
1364            Ast::Empty(_)
1365            | Ast::Flags(_)
1366            | Ast::Literal(_)
1367            | Ast::Dot(_)
1368            | Ast::Assertion(_)
1369            // Classes are recursive, so they get their own Drop impl.
1370            | Ast::Class(_) => return,
1371            Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
1372            Ast::Group(ref x) if !x.ast.has_subexprs() => return,
1373            Ast::Alternation(ref x) if x.asts.is_empty() => return,
1374            Ast::Concat(ref x) if x.asts.is_empty() => return,
1375            _ => {}
1376        }
1377
1378        let empty_span = || Span::splat(Position::new(0, 0, 0));
1379        let empty_ast = || Ast::Empty(empty_span());
1380        let mut stack = vec![mem::replace(self, empty_ast())];
1381        while let Some(mut ast) = stack.pop() {
1382            match ast {
1383                Ast::Empty(_)
1384                | Ast::Flags(_)
1385                | Ast::Literal(_)
1386                | Ast::Dot(_)
1387                | Ast::Assertion(_)
1388                // Classes are recursive, so they get their own Drop impl.
1389                | Ast::Class(_) => {}
1390                Ast::Repetition(ref mut x) => {
1391                    stack.push(mem::replace(&mut x.ast, empty_ast()));
1392                }
1393                Ast::Group(ref mut x) => {
1394                    stack.push(mem::replace(&mut x.ast, empty_ast()));
1395                }
1396                Ast::Alternation(ref mut x) => {
1397                    stack.extend(x.asts.drain(..));
1398                }
1399                Ast::Concat(ref mut x) => {
1400                    stack.extend(x.asts.drain(..));
1401                }
1402            }
1403        }
1404    }
1405}
1406
1407/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
1408/// stack space but heap space proportional to the depth of the `ClassSet`.
1409impl Drop for ClassSet {
1410    fn drop(&mut self) {
1411        use std::mem;
1412
1413        match *self {
1414            ClassSet::Item(ref item) => match *item {
1415                ClassSetItem::Empty(_)
1416                | ClassSetItem::Literal(_)
1417                | ClassSetItem::Range(_)
1418                | ClassSetItem::Ascii(_)
1419                | ClassSetItem::Unicode(_)
1420                | ClassSetItem::Perl(_) => return,
1421                ClassSetItem::Bracketed(ref x) => {
1422                    if x.kind.is_empty() {
1423                        return;
1424                    }
1425                }
1426                ClassSetItem::Union(ref x) => {
1427                    if x.items.is_empty() {
1428                        return;
1429                    }
1430                }
1431            },
1432            ClassSet::BinaryOp(ref op) => {
1433                if op.lhs.is_empty() && op.rhs.is_empty() {
1434                    return;
1435                }
1436            }
1437        }
1438
1439        let empty_span = || Span::splat(Position::new(0, 0, 0));
1440        let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
1441        let mut stack = vec![mem::replace(self, empty_set())];
1442        while let Some(mut set) = stack.pop() {
1443            match set {
1444                ClassSet::Item(ref mut item) => match *item {
1445                    ClassSetItem::Empty(_)
1446                    | ClassSetItem::Literal(_)
1447                    | ClassSetItem::Range(_)
1448                    | ClassSetItem::Ascii(_)
1449                    | ClassSetItem::Unicode(_)
1450                    | ClassSetItem::Perl(_) => {}
1451                    ClassSetItem::Bracketed(ref mut x) => {
1452                        stack.push(mem::replace(&mut x.kind, empty_set()));
1453                    }
1454                    ClassSetItem::Union(ref mut x) => {
1455                        stack.extend(x.items.drain(..).map(ClassSet::Item));
1456                    }
1457                },
1458                ClassSet::BinaryOp(ref mut op) => {
1459                    stack.push(mem::replace(&mut op.lhs, empty_set()));
1460                    stack.push(mem::replace(&mut op.rhs, empty_set()));
1461                }
1462            }
1463        }
1464    }
1465}
1466
1467#[cfg(test)]
1468mod tests {
1469    use super::*;
1470
1471    // We use a thread with an explicit stack size to test that our destructor
1472    // for Ast can handle arbitrarily sized expressions in constant stack
1473    // space. In case we run on a platform without threads (WASM?), we limit
1474    // this test to Windows/Unix.
1475    #[test]
1476    #[cfg(any(unix, windows))]
1477    fn no_stack_overflow_on_drop() {
1478        use std::thread;
1479
1480        let run = || {
1481            let span = || Span::splat(Position::new(0, 0, 0));
1482            let mut ast = Ast::Empty(span());
1483            for i in 0..200 {
1484                ast = Ast::Group(Group {
1485                    span: span(),
1486                    kind: GroupKind::CaptureIndex(i),
1487                    ast: Box::new(ast),
1488                });
1489            }
1490            assert!(!ast.is_empty());
1491        };
1492
1493        // We run our test on a thread with a small stack size so we can
1494        // force the issue more easily.
1495        thread::Builder::new()
1496            .stack_size(1 << 10)
1497            .spawn(run)
1498            .unwrap()
1499            .join()
1500            .unwrap();
1501    }
1502}
regex_syntax/ast/mod.rs

regex_syntax/ast/
mod.rs