regex_syntax/
unicode.rs

1use std::error;
2use std::fmt;
3use std::result;
4
5use crate::hir;
6
7/// A type alias for errors specific to Unicode handling of classes.
8pub type Result<T> = result::Result<T, Error>;
9
/// A table of inclusive codepoint ranges, each stored as a `(start, end)`
/// pair. The `'static` lifetime reflects that these tables come from
/// generated files.
type Range = &'static [(char, char)];
13
/// The ways in which a Unicode class lookup can fail.
///
/// The `Error` trait is deliberately not implemented: this type is not
/// exported, and every value is converted into some other public error
/// before it reaches a caller.
#[derive(Debug)]
pub enum Error {
    /// The requested property is unknown, or the data needed to look it up
    /// is not available.
    PropertyNotFound,
    /// The requested property value is unknown, or the data needed to look
    /// it up is not available.
    PropertyValueNotFound,
    // Never constructed when the unicode-perl feature is enabled, hence the
    // dead code allowance.
    #[allow(dead_code)]
    PerlClassNotFound,
}
26
27/// A type alias for errors specific to Unicode case folding.
28pub type FoldResult<T> = result::Result<T, CaseFoldError>;
29
/// The error reported when Unicode-aware simple case folding cannot be
/// performed.
///
/// The only cause is that the case mapping tables were not compiled in,
/// which happens when the `unicode-case` feature is disabled. (That feature
/// is enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

impl fmt::Display for CaseFoldError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message is a fixed string, so no formatting machinery is
        // required.
        f.write_str(
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)",
        )
    }
}

impl error::Error for CaseFoldError {}
49
/// The error reported when the Unicode-aware `\w` class cannot be used.
///
/// The only cause is that the data tables backing the Unicode-aware Perl
/// class `\w` were not compiled in, which happens when the `unicode-perl`
/// feature is disabled. (That feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

impl fmt::Display for UnicodeWordError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // The message is a fixed string, so no formatting machinery is
        // required.
        f.write_str(
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)",
        )
    }
}

impl error::Error for UnicodeWordError {}
69
70/// Return an iterator over the equivalence class of simple case mappings
71/// for the given codepoint. The equivalence class does not include the
72/// given codepoint.
73///
74/// If the equivalence class is empty, then this returns the next scalar
75/// value that has a non-empty equivalence class, if it exists. If no such
76/// scalar value exists, then `None` is returned. The point of this behavior
77/// is to permit callers to avoid calling `simple_fold` more than they need
78/// to, since there is some cost to fetching the equivalence class.
79///
80/// This returns an error if the Unicode case folding tables are not available.
81pub fn simple_fold(
82    c: char,
83) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>> {
84    #[cfg(not(feature = "unicode-case"))]
85    fn imp(
86        _: char,
87    ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
88    {
89        use std::option::IntoIter;
90        Err::<result::Result<IntoIter<char>, _>, _>(CaseFoldError(()))
91    }
92
93    #[cfg(feature = "unicode-case")]
94    fn imp(
95        c: char,
96    ) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
97    {
98        use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
99
100        Ok(CASE_FOLDING_SIMPLE
101            .binary_search_by_key(&c, |&(c1, _)| c1)
102            .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied())
103            .map_err(|i| {
104                if i >= CASE_FOLDING_SIMPLE.len() {
105                    None
106                } else {
107                    Some(CASE_FOLDING_SIMPLE[i].0)
108                }
109            }))
110    }
111
112    imp(c)
113}
114
115/// Returns true if and only if the given (inclusive) range contains at least
116/// one Unicode scalar value that has a non-empty non-trivial simple case
117/// mapping.
118///
119/// This function panics if `end < start`.
120///
121/// This returns an error if the Unicode case folding tables are not available.
122pub fn contains_simple_case_mapping(
123    start: char,
124    end: char,
125) -> FoldResult<bool> {
126    #[cfg(not(feature = "unicode-case"))]
127    fn imp(_: char, _: char) -> FoldResult<bool> {
128        Err(CaseFoldError(()))
129    }
130
131    #[cfg(feature = "unicode-case")]
132    fn imp(start: char, end: char) -> FoldResult<bool> {
133        use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
134        use std::cmp::Ordering;
135
136        assert!(start <= end);
137        Ok(CASE_FOLDING_SIMPLE
138            .binary_search_by(|&(c, _)| {
139                if start <= c && c <= end {
140                    Ordering::Equal
141                } else if c > end {
142                    Ordering::Greater
143                } else {
144                    Ordering::Less
145                }
146            })
147            .is_ok())
148    }
149
150    imp(start, end)
151}
152
/// A request for a character class defined by Unicode.
///
/// A class can be requested either by a property name alone or by a
/// property name paired with a property value. Name-only queries generally
/// refer to binary properties (see UTS#44, Table 8). As a special exception
/// (see UTS#18, Section 1.2), general categories (an enumeration) and
/// scripts (a catalog) may also be named directly, as if each of their
/// values were a binary property.
///
/// Property names and values are always normalized and canonicalized, so
/// `GC == gc == GeneralCategory == general_category`.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// A class for a Unicode binary property that is named by a single
    /// letter.
    OneLetter(char),
    /// A class for a Unicode binary property.
    ///
    /// By special exception (see UTS#18, Section 1.2), general category
    /// values and script values are also accepted here as if they were
    /// binary properties.
    Binary(&'a str),
    /// A class containing every codepoint whose property (named by
    /// `property_name`) has the given value (named by `property_value`).
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value of the property.
        property_value: &'a str,
    },
}
186
187impl<'a> ClassQuery<'a> {
188    fn canonicalize(&self) -> Result<CanonicalClassQuery> {
189        match *self {
190            ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
191            ClassQuery::Binary(name) => self.canonical_binary(name),
192            ClassQuery::ByValue { property_name, property_value } => {
193                let property_name = symbolic_name_normalize(property_name);
194                let property_value = symbolic_name_normalize(property_value);
195
196                let canon_name = match canonical_prop(&property_name)? {
197                    None => return Err(Error::PropertyNotFound),
198                    Some(canon_name) => canon_name,
199                };
200                Ok(match canon_name {
201                    "General_Category" => {
202                        let canon = match canonical_gencat(&property_value)? {
203                            None => return Err(Error::PropertyValueNotFound),
204                            Some(canon) => canon,
205                        };
206                        CanonicalClassQuery::GeneralCategory(canon)
207                    }
208                    "Script" => {
209                        let canon = match canonical_script(&property_value)? {
210                            None => return Err(Error::PropertyValueNotFound),
211                            Some(canon) => canon,
212                        };
213                        CanonicalClassQuery::Script(canon)
214                    }
215                    _ => {
216                        let vals = match property_values(canon_name)? {
217                            None => return Err(Error::PropertyValueNotFound),
218                            Some(vals) => vals,
219                        };
220                        let canon_val =
221                            match canonical_value(vals, &property_value) {
222                                None => {
223                                    return Err(Error::PropertyValueNotFound)
224                                }
225                                Some(canon_val) => canon_val,
226                            };
227                        CanonicalClassQuery::ByValue {
228                            property_name: canon_name,
229                            property_value: canon_val,
230                        }
231                    }
232                })
233            }
234        }
235    }
236
237    fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
238        let norm = symbolic_name_normalize(name);
239
240        // This is a special case where 'cf' refers to the 'Format' general
241        // category, but where the 'cf' abbreviation is also an abbreviation
242        // for the 'Case_Folding' property. But we want to treat it as
243        // a general category. (Currently, we don't even support the
244        // 'Case_Folding' property. But if we do in the future, users will be
245        // required to spell it out.)
246        if norm != "cf" {
247            if let Some(canon) = canonical_prop(&norm)? {
248                return Ok(CanonicalClassQuery::Binary(canon));
249            }
250        }
251        if let Some(canon) = canonical_gencat(&norm)? {
252            return Ok(CanonicalClassQuery::GeneralCategory(canon));
253        }
254        if let Some(canon) = canonical_script(&norm)? {
255            return Ok(CanonicalClassQuery::Script(canon));
256        }
257        Err(Error::PropertyNotFound)
258    }
259}
260
/// The canonicalized counterpart of `ClassQuery`.
///
/// Unlike `ClassQuery`, this keeps binary properties apart from general
/// categories and scripts that were written as if they were binary
/// properties.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, identified by its canonical name.
    Binary(&'static str),
    /// A general category, identified by its canonical name.
    GeneralCategory(&'static str),
    /// A script, identified by its canonical name.
    Script(&'static str),
    /// Any other pairing of a property with one of its values, where both
    /// names are canonical.
    ///
    /// By construction, the property name here is never General_Category
    /// or Script, since those are represented by the variants of the same
    /// name.
    ByValue {
        /// The property's canonical name.
        property_name: &'static str,
        /// The property value's canonical name.
        property_value: &'static str,
    },
}
285
286/// Looks up a Unicode class given a query. If one doesn't exist, then
287/// `None` is returned.
288pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode> {
289    use self::CanonicalClassQuery::*;
290
291    match query.canonicalize()? {
292        Binary(name) => bool_property(name),
293        GeneralCategory(name) => gencat(name),
294        Script(name) => script(name),
295        ByValue { property_name: "Age", property_value } => {
296            let mut class = hir::ClassUnicode::empty();
297            for set in ages(property_value)? {
298                class.union(&hir_class(set));
299            }
300            Ok(class)
301        }
302        ByValue { property_name: "Script_Extensions", property_value } => {
303            script_extension(property_value)
304        }
305        ByValue {
306            property_name: "Grapheme_Cluster_Break",
307            property_value,
308        } => gcb(property_value),
309        ByValue { property_name: "Sentence_Break", property_value } => {
310            sb(property_value)
311        }
312        ByValue { property_name: "Word_Break", property_value } => {
313            wb(property_value)
314        }
315        _ => {
316            // What else should we support?
317            Err(Error::PropertyNotFound)
318        }
319    }
320}
321
322/// Returns a Unicode aware class for \w.
323///
324/// This returns an error if the data is not available for \w.
325pub fn perl_word() -> Result<hir::ClassUnicode> {
326    #[cfg(not(feature = "unicode-perl"))]
327    fn imp() -> Result<hir::ClassUnicode> {
328        Err(Error::PerlClassNotFound)
329    }
330
331    #[cfg(feature = "unicode-perl")]
332    fn imp() -> Result<hir::ClassUnicode> {
333        use crate::unicode_tables::perl_word::PERL_WORD;
334        Ok(hir_class(PERL_WORD))
335    }
336
337    imp()
338}
339
340/// Returns a Unicode aware class for \s.
341///
342/// This returns an error if the data is not available for \s.
343pub fn perl_space() -> Result<hir::ClassUnicode> {
344    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))]
345    fn imp() -> Result<hir::ClassUnicode> {
346        Err(Error::PerlClassNotFound)
347    }
348
349    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
350    fn imp() -> Result<hir::ClassUnicode> {
351        use crate::unicode_tables::perl_space::WHITE_SPACE;
352        Ok(hir_class(WHITE_SPACE))
353    }
354
355    #[cfg(feature = "unicode-bool")]
356    fn imp() -> Result<hir::ClassUnicode> {
357        use crate::unicode_tables::property_bool::WHITE_SPACE;
358        Ok(hir_class(WHITE_SPACE))
359    }
360
361    imp()
362}
363
364/// Returns a Unicode aware class for \d.
365///
366/// This returns an error if the data is not available for \d.
367pub fn perl_digit() -> Result<hir::ClassUnicode> {
368    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
369    fn imp() -> Result<hir::ClassUnicode> {
370        Err(Error::PerlClassNotFound)
371    }
372
373    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
374    fn imp() -> Result<hir::ClassUnicode> {
375        use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
376        Ok(hir_class(DECIMAL_NUMBER))
377    }
378
379    #[cfg(feature = "unicode-gencat")]
380    fn imp() -> Result<hir::ClassUnicode> {
381        use crate::unicode_tables::general_category::DECIMAL_NUMBER;
382        Ok(hir_class(DECIMAL_NUMBER))
383    }
384
385    imp()
386}
387
388/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
389pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
390    let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
391        .iter()
392        .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
393        .collect();
394    hir::ClassUnicode::new(hir_ranges)
395}
396
397/// Returns true only if the given codepoint is in the `\w` character class.
398///
399/// If the `unicode-perl` feature is not enabled, then this returns an error.
400pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {
401    #[cfg(not(feature = "unicode-perl"))]
402    fn imp(_: char) -> result::Result<bool, UnicodeWordError> {
403        Err(UnicodeWordError(()))
404    }
405
406    #[cfg(feature = "unicode-perl")]
407    fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
408        use crate::is_word_byte;
409        use crate::unicode_tables::perl_word::PERL_WORD;
410        use std::cmp::Ordering;
411
412        if c <= 0x7F as char && is_word_byte(c as u8) {
413            return Ok(true);
414        }
415        Ok(PERL_WORD
416            .binary_search_by(|&(start, end)| {
417                if start <= c && c <= end {
418                    Ordering::Equal
419                } else if start > c {
420                    Ordering::Greater
421                } else {
422                    Ordering::Less
423                }
424            })
425            .is_ok())
426    }
427
428    imp(c)
429}
430
/// The table of values belonging to one particular property.
///
/// Each entry is a `(normalized, canonical)` pair: the first element is a
/// normalized property value and the second is the canonical property value
/// it stands for.
type PropertyValues = &'static [(&'static str, &'static str)];
437
438fn canonical_gencat(normalized_value: &str) -> Result<Option<&'static str>> {
439    Ok(match normalized_value {
440        "any" => Some("Any"),
441        "assigned" => Some("Assigned"),
442        "ascii" => Some("ASCII"),
443        _ => {
444            let gencats = property_values("General_Category")?.unwrap();
445            canonical_value(gencats, normalized_value)
446        }
447    })
448}
449
450fn canonical_script(normalized_value: &str) -> Result<Option<&'static str>> {
451    let scripts = property_values("Script")?.unwrap();
452    Ok(canonical_value(scripts, normalized_value))
453}
454
455/// Find the canonical property name for the given normalized property name.
456///
457/// If no such property exists, then `None` is returned.
458///
459/// The normalized property name must have been normalized according to
460/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
461///
462/// If the property names data is not available, then an error is returned.
463fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
464    #[cfg(not(any(
465        feature = "unicode-age",
466        feature = "unicode-bool",
467        feature = "unicode-gencat",
468        feature = "unicode-perl",
469        feature = "unicode-script",
470        feature = "unicode-segment",
471    )))]
472    fn imp(_: &str) -> Result<Option<&'static str>> {
473        Err(Error::PropertyNotFound)
474    }
475
476    #[cfg(any(
477        feature = "unicode-age",
478        feature = "unicode-bool",
479        feature = "unicode-gencat",
480        feature = "unicode-perl",
481        feature = "unicode-script",
482        feature = "unicode-segment",
483    ))]
484    fn imp(name: &str) -> Result<Option<&'static str>> {
485        use crate::unicode_tables::property_names::PROPERTY_NAMES;
486
487        Ok(PROPERTY_NAMES
488            .binary_search_by_key(&name, |&(n, _)| n)
489            .ok()
490            .map(|i| PROPERTY_NAMES[i].1))
491    }
492
493    imp(normalized_name)
494}
495
496/// Find the canonical property value for the given normalized property
497/// value.
498///
499/// The given property values should correspond to the values for the property
500/// under question, which can be found using `property_values`.
501///
502/// If no such property value exists, then `None` is returned.
503///
504/// The normalized property value must have been normalized according to
505/// UAX44 LM3, which can be done using `symbolic_name_normalize`.
506fn canonical_value(
507    vals: PropertyValues,
508    normalized_value: &str,
509) -> Option<&'static str> {
510    vals.binary_search_by_key(&normalized_value, |&(n, _)| n)
511        .ok()
512        .map(|i| vals[i].1)
513}
514
515/// Return the table of property values for the given property name.
516///
517/// If the property values data is not available, then an error is returned.
518fn property_values(
519    canonical_property_name: &'static str,
520) -> Result<Option<PropertyValues>> {
521    #[cfg(not(any(
522        feature = "unicode-age",
523        feature = "unicode-bool",
524        feature = "unicode-gencat",
525        feature = "unicode-perl",
526        feature = "unicode-script",
527        feature = "unicode-segment",
528    )))]
529    fn imp(_: &'static str) -> Result<Option<PropertyValues>> {
530        Err(Error::PropertyValueNotFound)
531    }
532
533    #[cfg(any(
534        feature = "unicode-age",
535        feature = "unicode-bool",
536        feature = "unicode-gencat",
537        feature = "unicode-perl",
538        feature = "unicode-script",
539        feature = "unicode-segment",
540    ))]
541    fn imp(name: &'static str) -> Result<Option<PropertyValues>> {
542        use crate::unicode_tables::property_values::PROPERTY_VALUES;
543
544        Ok(PROPERTY_VALUES
545            .binary_search_by_key(&name, |&(n, _)| n)
546            .ok()
547            .map(|i| PROPERTY_VALUES[i].1))
548    }
549
550    imp(canonical_property_name)
551}
552
553// This is only used in some cases, but small enough to just let it be dead
554// instead of figuring out (and maintaining) the right set of features.
555#[allow(dead_code)]
556fn property_set(
557    name_map: &'static [(&'static str, Range)],
558    canonical: &'static str,
559) -> Option<Range> {
560    name_map
561        .binary_search_by_key(&canonical, |x| x.0)
562        .ok()
563        .map(|i| name_map[i].1)
564}
565
566/// Returns an iterator over Unicode Age sets. Each item corresponds to a set
567/// of codepoints that were added in a particular revision of Unicode. The
568/// iterator yields items in chronological order.
569///
570/// If the given age value isn't valid or if the data isn't available, then an
571/// error is returned instead.
572fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
573    #[cfg(not(feature = "unicode-age"))]
574    fn imp(_: &str) -> Result<impl Iterator<Item = Range>> {
575        use std::option::IntoIter;
576        Err::<IntoIter<Range>, _>(Error::PropertyNotFound)
577    }
578
579    #[cfg(feature = "unicode-age")]
580    fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
581        use crate::unicode_tables::age;
582
583        const AGES: &[(&str, Range)] = &[
584            ("V1_1", age::V1_1),
585            ("V2_0", age::V2_0),
586            ("V2_1", age::V2_1),
587            ("V3_0", age::V3_0),
588            ("V3_1", age::V3_1),
589            ("V3_2", age::V3_2),
590            ("V4_0", age::V4_0),
591            ("V4_1", age::V4_1),
592            ("V5_0", age::V5_0),
593            ("V5_1", age::V5_1),
594            ("V5_2", age::V5_2),
595            ("V6_0", age::V6_0),
596            ("V6_1", age::V6_1),
597            ("V6_2", age::V6_2),
598            ("V6_3", age::V6_3),
599            ("V7_0", age::V7_0),
600            ("V8_0", age::V8_0),
601            ("V9_0", age::V9_0),
602            ("V10_0", age::V10_0),
603            ("V11_0", age::V11_0),
604            ("V12_0", age::V12_0),
605            ("V12_1", age::V12_1),
606            ("V13_0", age::V13_0),
607            ("V14_0", age::V14_0),
608        ];
609        assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
610
611        let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
612        match pos {
613            None => Err(Error::PropertyValueNotFound),
614            Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)),
615        }
616    }
617
618    imp(canonical_age)
619}
620
621/// Returns the Unicode HIR class corresponding to the given general category.
622///
623/// Name canonicalization is assumed to be performed by the caller.
624///
625/// If the given general category could not be found, or if the general
626/// category data is not available, then an error is returned.
627fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
628    #[cfg(not(feature = "unicode-gencat"))]
629    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
630        Err(Error::PropertyNotFound)
631    }
632
633    #[cfg(feature = "unicode-gencat")]
634    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
635        use crate::unicode_tables::general_category::BY_NAME;
636        match name {
637            "ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
638            "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),
639            "Assigned" => {
640                let mut cls = gencat("Unassigned")?;
641                cls.negate();
642                Ok(cls)
643            }
644            name => property_set(BY_NAME, name)
645                .map(hir_class)
646                .ok_or(Error::PropertyValueNotFound),
647        }
648    }
649
650    match canonical_name {
651        "Decimal_Number" => perl_digit(),
652        name => imp(name),
653    }
654}
655
656/// Returns the Unicode HIR class corresponding to the given script.
657///
658/// Name canonicalization is assumed to be performed by the caller.
659///
660/// If the given script could not be found, or if the script data is not
661/// available, then an error is returned.
662fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
663    #[cfg(not(feature = "unicode-script"))]
664    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
665        Err(Error::PropertyNotFound)
666    }
667
668    #[cfg(feature = "unicode-script")]
669    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
670        use crate::unicode_tables::script::BY_NAME;
671        property_set(BY_NAME, name)
672            .map(hir_class)
673            .ok_or(Error::PropertyValueNotFound)
674    }
675
676    imp(canonical_name)
677}
678
679/// Returns the Unicode HIR class corresponding to the given script extension.
680///
681/// Name canonicalization is assumed to be performed by the caller.
682///
683/// If the given script extension could not be found, or if the script data is
684/// not available, then an error is returned.
685fn script_extension(
686    canonical_name: &'static str,
687) -> Result<hir::ClassUnicode> {
688    #[cfg(not(feature = "unicode-script"))]
689    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
690        Err(Error::PropertyNotFound)
691    }
692
693    #[cfg(feature = "unicode-script")]
694    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
695        use crate::unicode_tables::script_extension::BY_NAME;
696        property_set(BY_NAME, name)
697            .map(hir_class)
698            .ok_or(Error::PropertyValueNotFound)
699    }
700
701    imp(canonical_name)
702}
703
704/// Returns the Unicode HIR class corresponding to the given Unicode boolean
705/// property.
706///
707/// Name canonicalization is assumed to be performed by the caller.
708///
709/// If the given boolean property could not be found, or if the boolean
710/// property data is not available, then an error is returned.
711fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
712    #[cfg(not(feature = "unicode-bool"))]
713    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
714        Err(Error::PropertyNotFound)
715    }
716
717    #[cfg(feature = "unicode-bool")]
718    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
719        use crate::unicode_tables::property_bool::BY_NAME;
720        property_set(BY_NAME, name)
721            .map(hir_class)
722            .ok_or(Error::PropertyNotFound)
723    }
724
725    match canonical_name {
726        "Decimal_Number" => perl_digit(),
727        "White_Space" => perl_space(),
728        name => imp(name),
729    }
730}
731
732/// Returns the Unicode HIR class corresponding to the given grapheme cluster
733/// break property.
734///
735/// Name canonicalization is assumed to be performed by the caller.
736///
737/// If the given property could not be found, or if the corresponding data is
738/// not available, then an error is returned.
739fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
740    #[cfg(not(feature = "unicode-segment"))]
741    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
742        Err(Error::PropertyNotFound)
743    }
744
745    #[cfg(feature = "unicode-segment")]
746    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
747        use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
748        property_set(BY_NAME, name)
749            .map(hir_class)
750            .ok_or(Error::PropertyValueNotFound)
751    }
752
753    imp(canonical_name)
754}
755
756/// Returns the Unicode HIR class corresponding to the given word break
757/// property.
758///
759/// Name canonicalization is assumed to be performed by the caller.
760///
761/// If the given property could not be found, or if the corresponding data is
762/// not available, then an error is returned.
763fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
764    #[cfg(not(feature = "unicode-segment"))]
765    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
766        Err(Error::PropertyNotFound)
767    }
768
769    #[cfg(feature = "unicode-segment")]
770    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
771        use crate::unicode_tables::word_break::BY_NAME;
772        property_set(BY_NAME, name)
773            .map(hir_class)
774            .ok_or(Error::PropertyValueNotFound)
775    }
776
777    imp(canonical_name)
778}
779
780/// Returns the Unicode HIR class corresponding to the given sentence
781/// break property.
782///
783/// Name canonicalization is assumed to be performed by the caller.
784///
785/// If the given property could not be found, or if the corresponding data is
786/// not available, then an error is returned.
787fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {
788    #[cfg(not(feature = "unicode-segment"))]
789    fn imp(_: &'static str) -> Result<hir::ClassUnicode> {
790        Err(Error::PropertyNotFound)
791    }
792
793    #[cfg(feature = "unicode-segment")]
794    fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
795        use crate::unicode_tables::sentence_break::BY_NAME;
796        property_set(BY_NAME, name)
797            .map(hir_class)
798            .ok_or(Error::PropertyValueNotFound)
799    }
800
801    imp(canonical_name)
802}
803
804/// Like symbolic_name_normalize_bytes, but operates on a string.
805fn symbolic_name_normalize(x: &str) -> String {
806    let mut tmp = x.as_bytes().to_vec();
807    let len = symbolic_name_normalize_bytes(&mut tmp).len();
808    tmp.truncate(len);
809    // This should always succeed because `symbolic_name_normalize_bytes`
810    // guarantees that `&tmp[..len]` is always valid UTF-8.
811    //
812    // N.B. We could avoid the additional UTF-8 check here, but it's unlikely
813    // to be worth skipping the additional safety check. A benchmark must
814    // justify it first.
815    String::from_utf8(tmp).unwrap()
816}
817
/// Normalize a symbolic name in place according to UAX44-LM3 and return
/// the normalized prefix of `slice`.
///
/// "Symbolic names" are typically property names and property value
/// aliases. This should not be applied to property string values.
///
/// Normalization drops a leading "is" (in any casing), drops spaces,
/// underscores, hyphens and all non-ASCII bytes, and lowercases ASCII
/// letters.
///
/// The returned slice is guaranteed to be valid UTF-8, no matter what
/// `slice` contains.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.

    // Any "is" prefix is ignored.
    let starts_with_is =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let start = if starts_with_is { 2 } else { 0 };

    let mut next_write = 0;
    for i in start..slice.len() {
        // VALIDITY ARGUMENT: Only ASCII bytes are ever written back, and
        // every non-ASCII byte is dropped, so the normalized prefix is
        // always valid UTF-8.
        let b = slice[i];
        match b {
            b' ' | b'_' | b'-' => {}
            _ if b.is_ascii() => {
                slice[next_write] = b.to_ascii_lowercase();
                next_write += 1;
            }
            _ => {}
        }
    }
    // Special case: ISO_Comment has a 'isc' abbreviation. Because 'is'
    // prefixes are dropped, 'isc' would otherwise turn into 'c', making 'c'
    // an alias for 'ISO_Comment' when it is actually an alias for the
    // 'Other' general category. So the full 'isc' is restored here.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
872
#[cfg(test)]
mod tests {
    use super::{
        contains_simple_case_mapping, simple_fold, symbolic_name_normalize,
        symbolic_name_normalize_bytes,
    };

    /// U+212A KELVIN SIGN. Written as an escape because its glyph cannot be
    /// told apart from the ASCII letter 'K' when reading the source.
    #[cfg(feature = "unicode-case")]
    const KELVIN: char = '\u{212A}';

    /// Folds `c` and returns the resulting iterator, panicking if either
    /// layer of the `simple_fold` result is an error.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_ok(c: char) -> impl Iterator<Item = char> {
        let lookup = simple_fold(c).unwrap();
        lookup.unwrap()
    }

    /// Folds `c` and returns the payload of the inner `Err`, panicking if
    /// `simple_fold` hands back an iterator instead.
    #[cfg(feature = "unicode-case")]
    fn simple_fold_err(c: char) -> Option<char> {
        match simple_fold(c).unwrap() {
            Err(next) => next,
            Ok(_) => unreachable!("simple_fold returned Ok iterator"),
        }
    }

    /// Unwrapping shorthand for `contains_simple_case_mapping`.
    #[cfg(feature = "unicode-case")]
    fn contains_case_map(start: char, end: char) -> bool {
        contains_simple_case_mapping(start, end).unwrap()
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_k() {
        // 'k', 'K' and the Kelvin sign each fold to the other two.
        let from_lower: Vec<char> = simple_fold_ok('k').collect();
        assert_eq!(from_lower, vec!['K', KELVIN]);

        let from_upper: Vec<char> = simple_fold_ok('K').collect();
        assert_eq!(from_upper, vec!['k', KELVIN]);

        let from_kelvin: Vec<char> = simple_fold_ok(KELVIN).collect();
        assert_eq!(from_kelvin, vec!['K', 'k']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_a() {
        let from_lower: Vec<char> = simple_fold_ok('a').collect();
        assert_eq!(from_lower, vec!['A']);

        let from_upper: Vec<char> = simple_fold_ok('A').collect();
        assert_eq!(from_upper, vec!['a']);
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_empty() {
        // (character without a fold entry, expected `Err` payload)
        let cases = [('?', 'A'), ('@', 'A'), ('[', 'a'), ('☃', 'Ⰰ')];
        for &(c, next) in cases.iter() {
            assert_eq!(Some(next), simple_fold_err(c));
        }
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn simple_fold_max() {
        // At the very top of the codepoint range the payload is `None`.
        for &c in ['\u{10FFFE}', '\u{10FFFF}'].iter() {
            assert_eq!(None, simple_fold_err(c));
        }
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn simple_fold_disabled() {
        let outcome = simple_fold('a');
        assert!(outcome.is_err());
    }

    #[test]
    #[cfg(feature = "unicode-case")]
    fn range_contains() {
        let with_mapping = [
            ('A', 'A'),
            ('Z', 'Z'),
            ('A', 'Z'),
            ('@', 'A'),
            ('Z', '['),
            ('☃', 'Ⰰ'),
        ];
        for &(start, end) in with_mapping.iter() {
            assert!(contains_case_map(start, end));
        }

        let without_mapping = [('[', '['), ('[', '`'), ('☃', '☃')];
        for &(start, end) in without_mapping.iter() {
            assert!(!contains_case_map(start, end));
        }
    }

    #[test]
    #[cfg(not(feature = "unicode-case"))]
    fn range_contains_disabled() {
        let outcome = contains_simple_case_mapping('a', 'a');
        assert!(outcome.is_err());
    }

    #[test]
    #[cfg(feature = "unicode-gencat")]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        // The one-letter class 'C' must canonicalize to the 'Other' general
        // category.
        let canonical = ClassQuery::OneLetter('C').canonicalize().unwrap();
        assert_eq!(canonical, CanonicalClassQuery::GeneralCategory("Other"));
    }

    #[test]
    fn sym_normalize() {
        // (input, expected normalized form)
        let cases = [
            ("Line_Break", "linebreak"),
            ("Line-break", "linebreak"),
            ("linebreak", "linebreak"),
            ("BA", "ba"),
            ("ba", "ba"),
            ("Greek", "greek"),
            ("isGreek", "greek"),
            ("IS_Greek", "greek"),
            ("isc", "isc"),
            ("is c", "isc"),
            ("is_c", "isc"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(symbolic_name_normalize(input), expected);
        }
    }

    #[test]
    fn valid_utf8_symbolic() {
        // The stray 0xFF byte is not ASCII and must be dropped.
        let mut buf = b"abc\xFFxyz".to_vec();
        let normalized = symbolic_name_normalize_bytes(&mut buf);
        assert_eq!(normalized, b"abcxyz");
    }
}
regex_syntax/unicode.rs

regex_syntax/
unicode.rs