proptest/
string.rs

1//-
2// Copyright 2017 Jason Lingle
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! Strategies for generating strings and byte strings from regular
11//! expressions.
12
13use crate::std_facade::{Box, Cow, String, ToOwned, Vec};
14use core::fmt;
15use core::mem;
16use core::ops::RangeInclusive;
17use core::u32;
18
19use regex_syntax::hir::{
20    self, Hir,
21    HirKind::*,
22    Literal::*,
23    RepetitionKind::{self, *},
24    RepetitionRange::*,
25};
26use regex_syntax::{Error as ParseError, Parser};
27
28use crate::bool;
29use crate::char;
30use crate::collection::{size_range, vec, SizeRange};
31use crate::strategy::*;
32use crate::test_runner::*;
33
34/// Wraps the regex that forms the `Strategy` for `String` so that a sensible
35/// `Default` can be given. The default is a string of non-control characters.
36#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
37pub struct StringParam(&'static str);
38
39impl From<StringParam> for &'static str {
40    fn from(x: StringParam) -> Self {
41        x.0
42    }
43}
44
45impl From<&'static str> for StringParam {
46    fn from(x: &'static str) -> Self {
47        StringParam(x)
48    }
49}
50
51impl Default for StringParam {
52    fn default() -> Self {
53        StringParam("\\PC*")
54    }
55}
56
57// quick_error! uses bare trait objects, so we enclose its invocation here in a
58// module so the lint can be disabled just for it.
59#[allow(bare_trait_objects)]
60mod error_container {
61    use super::*;
62
63    quick_error! {
64        /// Errors which may occur when preparing a regular expression for use with
65        /// string generation.
66        #[derive(Debug)]
67        pub enum Error {
68            /// The string passed as the regex was not syntactically valid.
69            RegexSyntax(err: ParseError) {
70                from()
71                    source(err)
72                    display("{}", err)
73            }
74            /// The regex was syntactically valid, but contains elements not
75            /// supported by proptest.
76            UnsupportedRegex(message: &'static str) {
77                display("{}", message)
78            }
79        }
80    }
81}
82
83pub use self::error_container::Error;
84
85opaque_strategy_wrapper! {
86    /// Strategy which generates values (i.e., `String` or `Vec<u8>`) matching
87    /// a regular expression.
88    ///
89    /// Created by various functions in this module.
90    #[derive(Debug)]
91    pub struct RegexGeneratorStrategy[<T>][where T : fmt::Debug]
92        (SBoxedStrategy<T>) -> RegexGeneratorValueTree<T>;
93    /// `ValueTree` corresponding to `RegexGeneratorStrategy`.
94    pub struct RegexGeneratorValueTree[<T>][where T : fmt::Debug]
95        (Box<dyn ValueTree<Value = T>>) -> T;
96}
97
98impl Strategy for str {
99    type Tree = RegexGeneratorValueTree<String>;
100    type Value = String;
101
102    fn new_tree(&self, runner: &mut TestRunner) -> NewTree<Self> {
103        string_regex(self).unwrap().new_tree(runner)
104    }
105}
106
107type ParseResult<T> = Result<RegexGeneratorStrategy<T>, Error>;
108
109#[doc(hidden)]
110/// A type which knows how to produce a `Strategy` from a regular expression
111/// generating the type.
112///
113/// This trait exists for the benefit of `#[proptest(regex = "...")]`.
114/// It is semver exempt, so use at your own risk.
115/// If you found a use for the trait beyond `Vec<u8>` and `String`,
116/// please file an issue at https://github.com/AltSysrq/proptest.
117pub trait StrategyFromRegex: Sized + fmt::Debug {
118    type Strategy: Strategy<Value = Self>;
119
120    /// Produce a strategy for `Self` from the `regex`.
121    fn from_regex(regex: &str) -> Self::Strategy;
122}
123
124impl StrategyFromRegex for String {
125    type Strategy = RegexGeneratorStrategy<Self>;
126
127    fn from_regex(regex: &str) -> Self::Strategy {
128        string_regex(regex).unwrap()
129    }
130}
131
132impl StrategyFromRegex for Vec<u8> {
133    type Strategy = RegexGeneratorStrategy<Self>;
134
135    fn from_regex(regex: &str) -> Self::Strategy {
136        bytes_regex(regex).unwrap()
137    }
138}
139
140/// Creates a strategy which generates strings matching the given regular
141/// expression.
142///
143/// If you don't need error handling and aren't limited by setup time, it is
144/// also possible to directly use a `&str` as a strategy with the same effect.
145pub fn string_regex(regex: &str) -> ParseResult<String> {
146    string_regex_parsed(&regex_to_hir(regex)?)
147}
148
149/// Like `string_regex()`, but allows providing a pre-parsed expression.
150pub fn string_regex_parsed(expr: &Hir) -> ParseResult<String> {
151    bytes_regex_parsed(expr)
152        .map(|v| {
153            v.prop_map(|bytes| {
154                String::from_utf8(bytes).expect("non-utf8 string")
155            })
156            .sboxed()
157        })
158        .map(RegexGeneratorStrategy)
159}
160
161/// Creates a strategy which generates byte strings matching the given regular
162/// expression.
163pub fn bytes_regex(regex: &str) -> ParseResult<Vec<u8>> {
164    bytes_regex_parsed(&regex_to_hir(regex)?)
165}
166
167/// Like `bytes_regex()`, but allows providing a pre-parsed expression.
168pub fn bytes_regex_parsed(expr: &Hir) -> ParseResult<Vec<u8>> {
169    match expr.kind() {
170        Empty => Ok(Just(vec![]).sboxed()),
171
172        Literal(lit) => Ok(Just(match lit {
173            Unicode(scalar) => to_bytes(*scalar),
174            Byte(byte) => vec![*byte],
175        })
176        .sboxed()),
177
178        Class(class) => Ok(match class {
179            hir::Class::Unicode(class) => {
180                unicode_class_strategy(class).prop_map(to_bytes).sboxed()
181            }
182            hir::Class::Bytes(class) => {
183                let subs = class.iter().map(|r| r.start()..=r.end());
184                Union::new(subs).prop_map(|b| vec![b]).sboxed()
185            }
186        }),
187
188        Repetition(rep) => Ok(vec(
189            bytes_regex_parsed(&rep.hir)?,
190            to_range(rep.kind.clone())?,
191        )
192        .prop_map(|parts| {
193            parts.into_iter().fold(vec![], |mut acc, child| {
194                acc.extend(child);
195                acc
196            })
197        })
198        .sboxed()),
199
200        Group(group) => bytes_regex_parsed(&group.hir).map(|v| v.0),
201
202        Concat(subs) => {
203            let subs = ConcatIter {
204                iter: subs.iter(),
205                buf: vec![],
206                next: None,
207            };
208            let ext = |(mut lhs, rhs): (Vec<_>, _)| {
209                lhs.extend(rhs);
210                lhs
211            };
212            Ok(subs
213                .fold(Ok(None), |accum: Result<_, Error>, rhs| {
214                    Ok(match accum? {
215                        None => Some(rhs?.sboxed()),
216                        Some(accum) => {
217                            Some((accum, rhs?).prop_map(ext).sboxed())
218                        }
219                    })
220                })?
221                .unwrap_or_else(|| Just(vec![]).sboxed()))
222        }
223
224        Alternation(subs) => {
225            Ok(Union::try_new(subs.iter().map(bytes_regex_parsed))?.sboxed())
226        }
227
228        Anchor(_) => {
229            unsupported("line/text anchors not supported for string generation")
230        }
231
232        WordBoundary(_) => unsupported(
233            "word boundary tests not supported for string generation",
234        ),
235    }
236    .map(RegexGeneratorStrategy)
237}
238
239fn unicode_class_strategy(
240    class: &hir::ClassUnicode,
241) -> char::CharStrategy<'static> {
242    static NONL_RANGES: &[RangeInclusive<char>] = &[
243        '\x00'..='\x09',
244        // Multiple instances of the latter range to partially make up
245        // for the bias of having such a tiny range in the control
246        // characters.
247        '\x0B'..=::core::char::MAX,
248        '\x0B'..=::core::char::MAX,
249        '\x0B'..=::core::char::MAX,
250        '\x0B'..=::core::char::MAX,
251        '\x0B'..=::core::char::MAX,
252    ];
253
254    let dotnnl = |x: &hir::ClassUnicodeRange, y: &hir::ClassUnicodeRange| {
255        x.start() == '\0'
256            && x.end() == '\x09'
257            && y.start() == '\x0B'
258            && y.end() == '\u{10FFFF}'
259    };
260
261    char::ranges(match class.ranges() {
262        [x, y] if dotnnl(x, y) || dotnnl(y, x) => Cow::Borrowed(NONL_RANGES),
263        _ => Cow::Owned(class.iter().map(|r| r.start()..=r.end()).collect()),
264    })
265}
266
267struct ConcatIter<'a, I> {
268    buf: Vec<u8>,
269    iter: I,
270    next: Option<&'a Hir>,
271}
272
273fn flush_lit_buf<I>(
274    it: &mut ConcatIter<'_, I>,
275) -> Option<ParseResult<Vec<u8>>> {
276    Some(Ok(RegexGeneratorStrategy(
277        Just(mem::replace(&mut it.buf, vec![])).sboxed(),
278    )))
279}
280
281impl<'a, I: Iterator<Item = &'a Hir>> Iterator for ConcatIter<'a, I> {
282    type Item = ParseResult<Vec<u8>>;
283
284    fn next(&mut self) -> Option<Self::Item> {
285        // A left-over node, process it first:
286        if let Some(next) = self.next.take() {
287            return Some(bytes_regex_parsed(next));
288        }
289
290        // Accumulate a literal sequence as long as we can:
291        while let Some(next) = self.iter.next() {
292            match next.kind() {
293                // A literal. Accumulate:
294                Literal(Unicode(scalar)) => self.buf.extend(to_bytes(*scalar)),
295                Literal(Byte(byte)) => self.buf.push(*byte),
296                // Encountered a non-literal.
297                _ => {
298                    return if !self.buf.is_empty() {
299                        // We've accumulated a literal from before, flush it out.
300                        // Store this node so we deal with it the next call.
301                        self.next = Some(next);
302                        flush_lit_buf(self)
303                    } else {
304                        // We didn't; just yield this node.
305                        Some(bytes_regex_parsed(next))
306                    };
307                }
308            }
309        }
310
311        // Flush out any accumulated literal from before.
312        if !self.buf.is_empty() {
313            flush_lit_buf(self)
314        } else {
315            self.next.take().map(bytes_regex_parsed)
316        }
317    }
318}
319
320fn to_range(kind: RepetitionKind) -> Result<SizeRange, Error> {
321    Ok(match kind {
322        ZeroOrOne => size_range(0..=1),
323        ZeroOrMore => size_range(0..=32),
324        OneOrMore => size_range(1..=32),
325        Range(range) => match range {
326            Exactly(count) if u32::MAX == count => {
327                return unsupported(
328                    "Cannot have repetition of exactly u32::MAX",
329                )
330            }
331            Exactly(count) => size_range(count as usize),
332            AtLeast(min) => {
333                let max = if min < u32::MAX as u32 / 2 {
334                    min as usize * 2
335                } else {
336                    u32::MAX as usize
337                };
338                size_range((min as usize)..max)
339            }
340            Bounded(_, max) if u32::MAX == max => {
341                return unsupported("Cannot have repetition max of u32::MAX")
342            }
343            Bounded(min, max) => size_range((min as usize)..(max as usize + 1)),
344        },
345    })
346}
347
348fn to_bytes(khar: char) -> Vec<u8> {
349    let mut buf = [0u8; 4];
350    khar.encode_utf8(&mut buf).as_bytes().to_owned()
351}
352
353fn regex_to_hir(pattern: &str) -> Result<Hir, Error> {
354    Ok(Parser::new().parse(pattern)?)
355}
356
357fn unsupported<T>(error: &'static str) -> Result<T, Error> {
358    Err(Error::UnsupportedRegex(error))
359}
360
361#[cfg(test)]
362mod test {
363    use std::collections::HashSet;
364
365    use regex::Regex;
366
367    use super::*;
368
369    fn do_test(
370        pattern: &str,
371        min_distinct: usize,
372        max_distinct: usize,
373        iterations: usize,
374    ) {
375        let generated = generate_values_matching_regex(pattern, iterations);
376        assert!(
377            generated.len() >= min_distinct,
378            "Expected to generate at least {} strings, but only \
379             generated {}",
380            min_distinct,
381            generated.len()
382        );
383        assert!(
384            generated.len() <= max_distinct,
385            "Expected to generate at most {} strings, but \
386             generated {}",
387            max_distinct,
388            generated.len()
389        );
390    }
391
392    fn generate_values_matching_regex(
393        pattern: &str,
394        iterations: usize,
395    ) -> HashSet<String> {
396        let rx = Regex::new(pattern).unwrap();
397        let mut generated = HashSet::new();
398
399        let strategy = string_regex(pattern).unwrap();
400        let mut runner = TestRunner::deterministic();
401        for _ in 0..iterations {
402            let mut value = strategy.new_tree(&mut runner).unwrap();
403
404            loop {
405                let s = value.current();
406                let ok = if let Some(matsch) = rx.find(&s) {
407                    0 == matsch.start() && s.len() == matsch.end()
408                } else {
409                    false
410                };
411                if !ok {
412                    panic!(
413                        "Generated string {:?} which does not match {:?}",
414                        s, pattern
415                    );
416                }
417
418                generated.insert(s);
419
420                if !value.simplify() {
421                    break;
422                }
423            }
424        }
425        generated
426    }
427
428    #[test]
429    fn test_case_insensitive_produces_all_available_values() {
430        let mut expected: HashSet<String> = HashSet::new();
431        expected.insert("a".into());
432        expected.insert("b".into());
433        expected.insert("A".into());
434        expected.insert("B".into());
435        assert_eq!(generate_values_matching_regex("(?i:a|B)", 64), expected);
436    }
437
438    #[test]
439    fn test_literal() {
440        do_test("foo", 1, 1, 8);
441    }
442
443    #[test]
444    fn test_casei_literal() {
445        do_test("(?i:fOo)", 8, 8, 64);
446    }
447
448    #[test]
449    fn test_alternation() {
450        do_test("foo|bar|baz", 3, 3, 16);
451    }
452
453    #[test]
454    fn test_repitition() {
455        do_test("a{0,8}", 9, 9, 64);
456    }
457
458    #[test]
459    fn test_question() {
460        do_test("a?", 2, 2, 16);
461    }
462
463    #[test]
464    fn test_star() {
465        do_test("a*", 33, 33, 256);
466    }
467
468    #[test]
469    fn test_plus() {
470        do_test("a+", 32, 32, 256);
471    }
472
473    #[test]
474    fn test_n_to_range() {
475        do_test("a{4,}", 4, 4, 64);
476    }
477
478    #[test]
479    fn test_concatenation() {
480        do_test("(foo|bar)(xyzzy|plugh)", 4, 4, 32);
481    }
482
483    #[test]
484    fn test_ascii_class() {
485        do_test("[[:digit:]]", 10, 10, 256);
486    }
487
488    #[test]
489    fn test_unicode_class() {
490        do_test("\\p{Greek}", 24, 512, 256);
491    }
492
493    #[test]
494    fn test_dot() {
495        do_test(".", 200, 65536, 256);
496    }
497
498    #[test]
499    fn test_dot_s() {
500        do_test("(?s).", 200, 65536, 256);
501    }
502
503    #[test]
504    fn test_backslash_d_plus() {
505        do_test("\\d+", 1, 65536, 256);
506    }
507
508    fn assert_send_and_sync<T: Send + Sync>(_: T) {}
509
510    #[test]
511    fn regex_strategy_is_send_and_sync() {
512        assert_send_and_sync(string_regex(".").unwrap());
513    }
514
515    macro_rules! consistent {
516        ($name:ident, $value:expr) => {
517            #[test]
518            fn $name() {
519                test_generates_matching_strings($value);
520            }
521        };
522    }
523
524    fn test_generates_matching_strings(pattern: &str) {
525        use std::time;
526
527        let mut runner = TestRunner::default();
528        let start = time::Instant::now();
529
530        // If we don't support this regex, just move on quietly
531        if let Ok(strategy) = string_regex(pattern) {
532            let rx = Regex::new(pattern).unwrap();
533
534            for _ in 0..1000 {
535                let mut val = strategy.new_tree(&mut runner).unwrap();
536                // No more than 1000 simplify steps to keep test time down
537                for _ in 0..1000 {
538                    let s = val.current();
539                    assert!(
540                        rx.is_match(&s),
541                        "Produced string {:?}, which does not match {:?}",
542                        s,
543                        pattern
544                    );
545
546                    if !val.simplify() {
547                        break;
548                    }
549                }
550
551                // Quietly stop testing if we've run for >10 s
552                if start.elapsed().as_secs() > 10 {
553                    break;
554                }
555            }
556        }
557    }
558
559    include!("regex-contrib/crates_regex.rs");
560}