regex_syntax/hir/
print.rs

1/*!
2This module provides a regular expression printer for `Hir`.
3*/
4
5use std::fmt;
6
7use crate::hir::visitor::{self, Visitor};
8use crate::hir::{self, Hir, HirKind};
9use crate::is_meta_character;
10
/// Builder used to construct a [`Printer`].
///
/// The printer currently has no configuration options, so this builder
/// carries no state and is deliberately kept private to this module.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Zero-sized placeholder field; the builder holds no configuration.
    _priv: (),
}
19
20impl Default for PrinterBuilder {
21    fn default() -> PrinterBuilder {
22        PrinterBuilder::new()
23    }
24}
25
26impl PrinterBuilder {
27    fn new() -> PrinterBuilder {
28        PrinterBuilder { _priv: () }
29    }
30
31    fn build(&self) -> Printer {
32        Printer { _priv: () }
33    }
34}
35
/// A printer for the high-level intermediate representation (HIR) of a
/// regular expression.
///
/// Given an HIR, a printer produces a regular expression pattern string.
/// This particular printer uses constant stack space and heap space
/// proportional to the size of the HIR.
///
/// Because only the HIR is consulted, the printed pattern will likely look
/// nothing like the pattern the HIR originally came from. For example, a
/// pattern such as `\pL` is printed with its entire class written out.
///
/// The intended use is to make it possible to mutate an HIR and then build a
/// regular expression from the mutated result. (A regex library could offer
/// a constructor that accepts this HIR directly, but doing so would create
/// an unnecessary public coupling between the regex library and this
/// specific HIR representation.)
#[derive(Debug)]
pub struct Printer {
    // Zero-sized placeholder field; the printer holds no state of its own.
    _priv: (),
}
56
57impl Printer {
58    /// Create a new printer.
59    pub fn new() -> Printer {
60        PrinterBuilder::new().build()
61    }
62
63    /// Print the given `Ast` to the given writer. The writer must implement
64    /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
65    /// here are a `fmt::Formatter` (which is available in `fmt::Display`
66    /// implementations) or a `&mut String`.
67    pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
68        visitor::visit(hir, Writer { wtr })
69    }
70}
71
/// Visitor state used while printing an HIR.
///
/// It only wraps the destination writer; every piece of the printed pattern
/// is written to `wtr`.
#[derive(Debug)]
struct Writer<W> {
    /// Destination that receives the printed pattern.
    wtr: W,
}
76
77impl<W: fmt::Write> Visitor for Writer<W> {
78    type Output = ();
79    type Err = fmt::Error;
80
81    fn finish(self) -> fmt::Result {
82        Ok(())
83    }
84
85    fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
86        match *hir.kind() {
87            HirKind::Empty
88            | HirKind::Repetition(_)
89            | HirKind::Concat(_)
90            | HirKind::Alternation(_) => {}
91            HirKind::Literal(hir::Literal::Unicode(c)) => {
92                self.write_literal_char(c)?;
93            }
94            HirKind::Literal(hir::Literal::Byte(b)) => {
95                self.write_literal_byte(b)?;
96            }
97            HirKind::Class(hir::Class::Unicode(ref cls)) => {
98                self.wtr.write_str("[")?;
99                for range in cls.iter() {
100                    if range.start() == range.end() {
101                        self.write_literal_char(range.start())?;
102                    } else {
103                        self.write_literal_char(range.start())?;
104                        self.wtr.write_str("-")?;
105                        self.write_literal_char(range.end())?;
106                    }
107                }
108                self.wtr.write_str("]")?;
109            }
110            HirKind::Class(hir::Class::Bytes(ref cls)) => {
111                self.wtr.write_str("(?-u:[")?;
112                for range in cls.iter() {
113                    if range.start() == range.end() {
114                        self.write_literal_class_byte(range.start())?;
115                    } else {
116                        self.write_literal_class_byte(range.start())?;
117                        self.wtr.write_str("-")?;
118                        self.write_literal_class_byte(range.end())?;
119                    }
120                }
121                self.wtr.write_str("])")?;
122            }
123            HirKind::Anchor(hir::Anchor::StartLine) => {
124                self.wtr.write_str("(?m:^)")?;
125            }
126            HirKind::Anchor(hir::Anchor::EndLine) => {
127                self.wtr.write_str("(?m:$)")?;
128            }
129            HirKind::Anchor(hir::Anchor::StartText) => {
130                self.wtr.write_str(r"\A")?;
131            }
132            HirKind::Anchor(hir::Anchor::EndText) => {
133                self.wtr.write_str(r"\z")?;
134            }
135            HirKind::WordBoundary(hir::WordBoundary::Unicode) => {
136                self.wtr.write_str(r"\b")?;
137            }
138            HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => {
139                self.wtr.write_str(r"\B")?;
140            }
141            HirKind::WordBoundary(hir::WordBoundary::Ascii) => {
142                self.wtr.write_str(r"(?-u:\b)")?;
143            }
144            HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => {
145                self.wtr.write_str(r"(?-u:\B)")?;
146            }
147            HirKind::Group(ref x) => match x.kind {
148                hir::GroupKind::CaptureIndex(_) => {
149                    self.wtr.write_str("(")?;
150                }
151                hir::GroupKind::CaptureName { ref name, .. } => {
152                    write!(self.wtr, "(?P<{}>", name)?;
153                }
154                hir::GroupKind::NonCapturing => {
155                    self.wtr.write_str("(?:")?;
156                }
157            },
158        }
159        Ok(())
160    }
161
162    fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
163        match *hir.kind() {
164            // Handled during visit_pre
165            HirKind::Empty
166            | HirKind::Literal(_)
167            | HirKind::Class(_)
168            | HirKind::Anchor(_)
169            | HirKind::WordBoundary(_)
170            | HirKind::Concat(_)
171            | HirKind::Alternation(_) => {}
172            HirKind::Repetition(ref x) => {
173                match x.kind {
174                    hir::RepetitionKind::ZeroOrOne => {
175                        self.wtr.write_str("?")?;
176                    }
177                    hir::RepetitionKind::ZeroOrMore => {
178                        self.wtr.write_str("*")?;
179                    }
180                    hir::RepetitionKind::OneOrMore => {
181                        self.wtr.write_str("+")?;
182                    }
183                    hir::RepetitionKind::Range(ref x) => match *x {
184                        hir::RepetitionRange::Exactly(m) => {
185                            write!(self.wtr, "{{{}}}", m)?;
186                        }
187                        hir::RepetitionRange::AtLeast(m) => {
188                            write!(self.wtr, "{{{},}}", m)?;
189                        }
190                        hir::RepetitionRange::Bounded(m, n) => {
191                            write!(self.wtr, "{{{},{}}}", m, n)?;
192                        }
193                    },
194                }
195                if !x.greedy {
196                    self.wtr.write_str("?")?;
197                }
198            }
199            HirKind::Group(_) => {
200                self.wtr.write_str(")")?;
201            }
202        }
203        Ok(())
204    }
205
206    fn visit_alternation_in(&mut self) -> fmt::Result {
207        self.wtr.write_str("|")
208    }
209}
210
211impl<W: fmt::Write> Writer<W> {
212    fn write_literal_char(&mut self, c: char) -> fmt::Result {
213        if is_meta_character(c) {
214            self.wtr.write_str("\\")?;
215        }
216        self.wtr.write_char(c)
217    }
218
219    fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
220        let c = b as char;
221        if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
222            self.write_literal_char(c)
223        } else {
224            write!(self.wtr, "(?-u:\\x{:02X})", b)
225        }
226    }
227
228    fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
229        let c = b as char;
230        if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() {
231            self.write_literal_char(c)
232        } else {
233            write!(self.wtr, "\\x{:02X}", b)
234        }
235    }
236}
237
#[cfg(test)]
mod tests {
    use super::Printer;
    use crate::ParserBuilder;

    /// Parses `given` with a default parser, prints the resulting HIR and
    /// checks that the output equals `expected`.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    /// Like `roundtrip`, but with `allow_invalid_utf8(true)` set on the
    /// parser builder.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected);
    }

    /// Shared driver: configures a parser builder through `f`, parses
    /// `given`, prints the HIR, verifies that the printed pattern parses
    /// with the same builder, and finally compares it against `expected`.
    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut dst = String::new();
        Printer::new().print(&hir, &mut dst).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();

        assert_eq!(expected, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        let unicode_cases = [
            (r"[a]", r"[a]"),
            (r"[a-z]", r"[a-z]"),
            (r"[a-z--b-c--x-y]", r"[ad-wz]"),
            (r"[^\x01-\u{10FFFF}]", "[\u{0}]"),
            (r"[-]", r"[\-]"),
            (r"[☃-⛄]", r"[☃-⛄]"),
        ];
        for &(given, expected) in unicode_cases.iter() {
            roundtrip(given, expected);
        }

        roundtrip(r"(?-u)[a]", r"(?-u:[a])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // The following test that the printer escapes meta characters
        // in character classes.
        let meta_cases = [
            (r"[\[]", r"[\[]"),
            (r"[Z-_]", r"[Z-_]"),
            (r"[Z-_--Z]", r"[\[-_]"),
        ];
        for &(given, expected) in meta_cases.iter() {
            roundtrip(given, expected);
        }

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        let byte_meta_cases = [
            (r"(?-u)[\[]", r"(?-u:[\[])"),
            (r"(?-u)[Z-_]", r"(?-u:[Z-_])"),
            (r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"),
        ];
        for &(given, expected) in byte_meta_cases.iter() {
            roundtrip_bytes(given, expected);
        }
    }

    #[test]
    fn print_anchor() {
        let cases = [
            (r"^", r"\A"),
            (r"$", r"\z"),
            (r"(?m)^", r"(?m:^)"),
            (r"(?m)$", r"(?m:$)"),
        ];
        for &(given, expected) in cases.iter() {
            roundtrip(given, expected);
        }
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        let cases = [
            // Zero or one.
            ("a?", "a?"),
            ("a??", "a??"),
            ("(?U)a?", "a??"),
            // Zero or more.
            ("a*", "a*"),
            ("a*?", "a*?"),
            ("(?U)a*", "a*?"),
            // One or more.
            ("a+", "a+"),
            ("a+?", "a+?"),
            ("(?U)a+", "a+?"),
            // Counted repetitions.
            ("a{1}", "a{1}"),
            ("a{1,}", "a{1,}"),
            ("a{1,5}", "a{1,5}"),
            ("a{1}?", "a{1}?"),
            ("a{1,}?", "a{1,}?"),
            ("a{1,5}?", "a{1,5}?"),
            ("(?U)a{1}", "a{1}?"),
            ("(?U)a{1,}", "a{1,}?"),
            ("(?U)a{1,5}", "a{1,5}?"),
        ];
        for &(given, expected) in cases.iter() {
            roundtrip(given, expected);
        }
    }

    #[test]
    fn print_group() {
        let cases = [
            // Empty groups.
            ("()", "()"),
            ("(?P<foo>)", "(?P<foo>)"),
            ("(?:)", "(?:)"),
            // Groups with content.
            ("(a)", "(a)"),
            ("(?P<foo>a)", "(?P<foo>a)"),
            ("(?:a)", "(?:a)"),
            // Nested groups.
            ("((((a))))", "((((a))))"),
        ];
        for &(given, expected) in cases.iter() {
            roundtrip(given, expected);
        }
    }

    #[test]
    fn print_alternation() {
        let cases = [
            // Alternations with empty branches.
            ("|", "|"),
            ("||", "||"),
            // Alternations with non-empty branches.
            ("a|b", "a|b"),
            ("a|b|c", "a|b|c"),
            ("foo|bar|quux", "foo|bar|quux"),
        ];
        for &(given, expected) in cases.iter() {
            roundtrip(given, expected);
        }
    }
}