From 19b3e24083eb0b1b5299e689d0bc5f1a6c4ebdcd Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 13 May 2025 10:40:26 +0200 Subject: slre: drop wrong "anchored" optimization The regex '^a|b' means "does the string start with a, or does it have a b anywhere", not "does the string start with a or b" (the latter should be spelled '^[ab]' or '^(a|b)'). It should match exactly the same strings as 'b|^a'. But the current implementation hard-codes an assumption that when the regex starts with a ^, the whole regex must match from the beginning, i.e. it only attempts at offset 0. It really should be completely symmetrical to 'b|c$' ("does it have a b anywhere or end with c?"), which is treated correctly. Another quirk is that currently the regex 'x*$', which should match all strings (because it just means "does the string end with 0 or more x'es"), does not, because in the unanchored case we never attempt to match at ofs==len. In the anchored case, '^x*$', this works correctly and matches exactly strings (including the empty string) consisting entirely of x'es. Fix both of these issues by dropping all use of the slre->anchored member and always test at all possible offsets. If the regex does have a ^ somewhere (including after a | branch character), that is correctly handled by the match engine by only matching when *ofs is 0. Reviewed-by: Simon Glass Signed-off-by: Rasmus Villemoes --- lib/slre.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/slre.c b/lib/slre.c index 277a59a03a7..4f455400d3a 100644 --- a/lib/slre.c +++ b/lib/slre.c @@ -413,10 +413,7 @@ int slre_compile(struct slre *r, const char *re) { r->err_str = NULL; - r->code_size = r->data_size = r->num_caps = r->anchored = 0; - - if (*re == '^') - r->anchored++; + r->code_size = r->data_size = r->num_caps = 0; emit(r, OPEN); /* This will capture what matches full RE */ emit(r, 0); @@ -650,13 +647,9 @@ slre_match(const struct slre *r, const char *buf, int len, { int i, ofs = 0, res = 0; - if (r->anchored) { + for (i = 0; i <= len && res == 0; i++) { + ofs = i; res = match(r, 0, buf, len, &ofs, caps); - } else { - for (i = 0; i < len && res == 0; i++) { - ofs = i; - res = match(r, 0, buf, len, &ofs, caps); - } } return res; -- cgit v1.3.1 From 457f19815c8f8b74c36d8cbef454a0b575ea7068 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 13 May 2025 10:40:29 +0200 Subject: slre: refactor is_any_but() As preparation for fixing the handling of backslash-escapes used inside a character class, refactor is_any_but() to be defined in terms of is_any_of() so we don't have to repeat the same logic in two places. Reviewed-by: Simon Glass Signed-off-by: Rasmus Villemoes --- lib/slre.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/lib/slre.c b/lib/slre.c index 4f455400d3a..5cb0d3ec7fa 100644 --- a/lib/slre.c +++ b/lib/slre.c @@ -484,17 +484,14 @@ is_any_of(const unsigned char *p, int len, const char *s, int *ofs) static int is_any_but(const unsigned char *p, int len, const char *s, int *ofs) { - int i, ch; - - ch = s[*ofs]; + int dummy = *ofs; - for (i = 0; i < len; i++) { - if (p[i] == ch) - return 0; + if (is_any_of(p, len, s, &dummy)) { + return 0; + } else { + (*ofs)++; + return 1; } - - (*ofs)++; - return 1; } static int -- cgit v1.3.1 From 5d3f91d6a82386e94c3178721641608563560871 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 13 May 2025 10:40:30 +0200 Subject: slre: fix matching of escape sequence used inside character class At the compile stage, the anyof() function clearly intends to handle escape sequences like \d (for digits) inside square brackets, since the logic emits a 0 byte followed by the code representing the character class (NONSPACE, SPACE or DIGIT). However, this is not handled in the corresponding match helper is_any_of(); it just naively loops over all the bytes in the ->data array emitted by anyof() and compares those directly to the current character. For example, this means that the string "\x11" (containing the single character with value 17) is matched by the regex "[#%\d]", because DIGIT happens to be 17. Fix that by recognizing a zero byte as indicating something special and act accordingly. In order not to repeat the "increment *ofs and return 1" in all places, put those two lines after a new match: label. Reviewed-by: Simon Glass Signed-off-by: Rasmus Villemoes --- lib/slre.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/slre.c b/lib/slre.c index 5cb0d3ec7fa..87dfde720e9 100644 --- a/lib/slre.c +++ b/lib/slre.c @@ -472,13 +472,33 @@ is_any_of(const unsigned char *p, int len, const char *s, int *ofs) ch = s[*ofs]; - for (i = 0; i < len; i++) - if (p[i] == ch) { - (*ofs)++; - return 1; + for (i = 0; i < len; i++) { + if (p[i] == '\0') { + switch (p[++i]) { + case NONSPACE: + if (!isspace(ch)) + goto match; + break; + case SPACE: + if (isspace(ch)) + goto match; + break; + case DIGIT: + if (isdigit(ch)) + goto match; + break; + } + continue; } + if (p[i] == ch) + goto match; + } return 0; + +match: + (*ofs)++; + return 1; } static int -- cgit v1.3.1 From fe4f21185050ef3f2cc847ba04701d1e714522e3 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 13 May 2025 10:40:32 +0200 Subject: slre: implement support for ranges in character classes When trying to use U-Boot's regex facility, it is a rather large gotcha that [a-z] range syntax is not supported. It doesn't require a lot of extra code to implement that; we just let the regular parsing emit the start and end literal symbols as usual, and add a new "escape" code RANGE. At match time, this means the code will first just see an 'a' and try to match that, and only then recognize that it's actually part of a range and then do the 'a' <= ch <= 'z' test. Of course, this means that a - in the middle of a [] pair no longer matches a literal dash, but I highly doubt anybody relies on that. Putting it first or last, or escaping it with \, as in most other RE engines, continues to work. Reviewed-by: Simon Glass Signed-off-by: Rasmus Villemoes --- lib/slre.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/slre.c b/lib/slre.c index 87dfde720e9..117815a6d60 100644 --- a/lib/slre.c +++ b/lib/slre.c @@ -30,7 +30,7 @@ #include enum {END, BRANCH, ANY, EXACT, ANYOF, ANYBUT, OPEN, CLOSE, BOL, EOL, - STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT}; + STAR, PLUS, STARQ, PLUSQ, QUEST, SPACE, NONSPACE, DIGIT, RANGE}; #ifdef SLRE_TEST static struct { @@ -55,7 +55,8 @@ static struct { {"QUEST", 1, "o"}, /* Match zero or one time, "?" */ {"SPACE", 0, ""}, /* Match whitespace, "\s" */ {"NONSPACE", 0, ""}, /* Match non-space, "\S" */ - {"DIGIT", 0, ""} /* Match digit, "\d" */ + {"DIGIT", 0, ""}, /* Match digit, "\d" */ + {"RANGE", 0, ""}, /* Range separator - */ }; #endif /* SLRE_TEST */ @@ -260,6 +261,15 @@ anyof(struct slre *r, const char **re) return; /* NOTREACHED */ break; + case '-': + if (r->data_size == old_data_size || **re == ']') { + /* First or last character, just match - itself. */ + store_char_in_data(r, '-'); + break; + } + store_char_in_data(r, 0); + store_char_in_data(r, RANGE); + break; case '\\': esc = get_escape_char(re); if ((esc & 0xff) == 0) { @@ -487,6 +497,14 @@ is_any_of(const unsigned char *p, int len, const char *s, int *ofs) if (isdigit(ch)) goto match; break; + case RANGE: + /* + * a-z is represented in the data array as {'a', \0, RANGE, 'z'} + */ + ++i; + if (p[i - 3] <= (unsigned char)ch && (unsigned char)ch <= p[i]) + goto match; + break; } continue; } -- cgit v1.3.1