From 3b86b9c35fe971c99d42c1e18691823f43bbab3f Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 17 Aug 2025 16:22:26 -0600 Subject: [PATCH 01/12] toke.c: S_intuit_more: Simplify a bit of code --- toke.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/toke.c b/toke.c index aef854a9e641..70bc026cba4f 100644 --- a/toke.c +++ b/toke.c @@ -4590,10 +4590,7 @@ S_intuit_more(pTHX_ char *s, char *e, * written, and regcurly never required a comma, as in {0}. Probably it is * ok as-is */ if (s[0] == '{') { - if (regcurly(s, e, NULL)) { - return FALSE; - } - return TRUE; + return ! regcurly(s, e, NULL); } /* Here is '[': maybe we have a character class. Examine the guts */ From b302a1970435cfff84ab0b95dae512e8df0e3e6d Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Fri, 5 Sep 2025 16:01:32 -0600 Subject: [PATCH 02/12] intuit_more: Handle easy thing first This swaps the order of handling a conditional --- toke.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/toke.c b/toke.c index 70bc026cba4f..72db7fa55fbc 100644 --- a/toke.c +++ b/toke.c @@ -4811,8 +4811,11 @@ S_intuit_more(pTHX_ char *s, char *e, case '\\': - if (s[1]) { - if (memCHRs("wds]", s[1])) { + if (s[1] == '\0') { + /* \ followed by NUL strongly indicates character class */ + weight += 100; + } + else if (memCHRs("wds]", s[1])) { weight += 100; /* \w \d \s => strongly charclass */ /* khw: \] can't happen, as any ']' is beyond our search. * Why not \W \D \S \h \v, etc as well? Should they have @@ -4866,9 +4869,6 @@ S_intuit_more(pTHX_ char *s, char *e, * purposes of the 'seen' array. Whatever is matched by these * backslashed sequences should not be added to 'seen'. That * includes the backslash. 
*/ - } - else /* \ followed by NUL strongly indicates character class */ - weight += 100; break; case '-': From eb6d4674b453e234bf7b43a5ef4b2661719de940 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Fri, 5 Sep 2025 16:03:00 -0600 Subject: [PATCH 03/12] intuit_more: White space only The previous commit allows this one to outdent things. --- toke.c | 103 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 51 insertions(+), 52 deletions(-) diff --git a/toke.c b/toke.c index 72db7fa55fbc..b3764e74f99b 100644 --- a/toke.c +++ b/toke.c @@ -4816,59 +4816,58 @@ S_intuit_more(pTHX_ char *s, char *e, weight += 100; } else if (memCHRs("wds]", s[1])) { - weight += 100; /* \w \d \s => strongly charclass */ - /* khw: \] can't happen, as any ']' is beyond our search. - * Why not \W \D \S \h \v, etc as well? Should they have - * the same weights as \w \d \s or should all or some be - * in the 'abcfnrtvx' below? */ - } else if (seen[(U8)'\''] || seen[(U8)'"']) { - weight += 1; - /* khw: This is problematic. Enough so, that I misread - * it, and added a wrong comment about what it does in - * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it - * doesn't look at the current character. What it - * actually does is: if any quote has been seen in the - * parse, don't do the rest of the else's below, but for - * every subsequent backslashed character encountered - * (except \0 \w \s \d), increment the weight to lean a - * bit more towards being a charclass. That means that - * every backslash sequence following the first occurrence - * of a quote increments the weight regardless of what the - * sequence is. Again, \0 \w \d and \s are not controlled - * by this else, so they change the weight by a lot more. - * But what makes them so special that they aren't subject - * to this. Any why does having a quote change the - * behavior from then on. And why only backslashed - * sequences get this treatment? 
This code has been - * unchanged since this function was added in 1993. I - * don't get it. Instead, it does seem to me that it is - * especially unlikely to repeat a quote in a charclass, - * but that having just a single quote is indicative of a - * charclass, and having pairs of quotes is indicative of - * a subscript. Similarly for things that could indicate - * nesting of braces or parens. */ - } - else if (memCHRs("abcfnrtvx", s[1])) - weight += 40; /* \n, etc => charclass */ - /* khw: Why not \e etc as well? */ - else if (isDIGIT(s[1])) { - weight += 40; /* \123 => charclass */ - while (s[1] && isDIGIT(s[1])) - s++; - } + weight += 100; /* \w \d \s => strongly charclass */ + /* khw: \] can't happen, as any ']' is beyond our search. Why + * not \W \D \S \h \v, etc as well? Should they have the same + * weights as \w \d \s or should all or some be in the + * 'abcfnrtvx' below? */ + } + else if (seen[(U8)'\''] || seen[(U8)'"']) { + weight += 1; + /* khw: This is problematic. Enough so, that I misread it, + * and added a wrong comment about what it does in + * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it + * doesn't look at the current character. What it actually + * does is: if any quote has been seen in the parse, don't do + * the rest of the else's below, but for every subsequent + * backslashed character encountered (except \0 \w \s \d), + * increment the weight to lean a bit more towards being a + * charclass. That means that every backslash sequence + * following the first occurrence of a quote increments the + * weight regardless of what the sequence is. Again, \0 \w \d + * and \s are not controlled by this else, so they change the + * weight by a lot more. But what makes them so special that + * they aren't subject to this. Any why does having a quote + * change the behavior from then on. And why only backslashed + * sequences get this treatment? This code has been unchanged + * since this function was added in 1993. I don't get it. 
+ * Instead, it does seem to me that it is especially unlikely + * to repeat a quote in a charclass, but that having just a + * single quote is indicative of a charclass, and having pairs + * of quotes is indicative of a subscript. Similarly for + * things that could indicate nesting of braces or parens. */ + } + else if (memCHRs("abcfnrtvx", s[1])) + weight += 40; /* \n, etc => charclass */ + /* khw: Why not \e etc as well? */ + else if (isDIGIT(s[1])) { + weight += 40; /* \123 => charclass */ + while (s[1] && isDIGIT(s[1])) + s++; + } - /* khw: There are lots more possible escape sequences. Some, - * like \A,\z have no special meaning to charclasses, so might - * indicate a subscript, but I don't know what they would be - * doing there either. Some have been added to the language - * after this code was written, but no one thought to, or - * could wade through this function, to add them. Things like - * \p{} for properties, \N and \N{}, for example. - * - * It's problematic that \a is treated as plain 'a' for - * purposes of the 'seen' array. Whatever is matched by these - * backslashed sequences should not be added to 'seen'. That - * includes the backslash. */ + /* khw: There are lots more possible escape sequences. Some, like + * \A,\z have no special meaning to charclasses, so might indicate + * a subscript, but I don't know what they would be doing there + * either. Some have been added to the language after this code + * was written, but no one thought to, or could wade through this + * function, to add them. Things like \p{} for properties, \N and + * \N{}, for example. + * + * It's problematic that \a is treated as plain 'a' for purposes + * of the 'seen' array. Whatever is matched by these backslashed + * sequences should not be added to 'seen'. That includes the + * backslash. 
*/ break; case '-': From 5eeec5ecaac8a940f98bad379715093595686fe6 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 27 Oct 2025 04:19:29 -0600 Subject: [PATCH 04/12] intuit_more: Reserve first byte of buffer Future commits will want to use this space. --- toke.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/toke.c b/toke.c index b3764e74f99b..fbff10d5b467 100644 --- a/toke.c +++ b/toke.c @@ -4740,19 +4740,21 @@ S_intuit_more(pTHX_ char *s, char *e, * changed since the code was first added */ char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ]; - if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY)) + /* (Reserve tmpbuf[0] for future commits) */ + if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf), + CHECK_ONLY)) { /* An illegal identifier means this can't be a subscript; * it's an error or it could be a charclass */ return false; } - len = strlen(tmpbuf); + len = strlen(tmpbuf + 1); /* khw: This only looks at global variables; lexicals came * later, and this hasn't been updated. Ouch!! */ if ( len > 1 - && gv_fetchpvn_flags(tmpbuf, + && gv_fetchpvn_flags(tmpbuf + 1, len, UTF ? SVf_UTF8 : 0, SVt_PV)) From 2aeccb953b3e28495f53e605427520fbf40aaaf4 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 27 Oct 2025 10:08:35 -0600 Subject: [PATCH 05/12] intuit_more: Move declaration to first use --- toke.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toke.c b/toke.c index fbff10d5b467..41a6d61e90ef 100644 --- a/toke.c +++ b/toke.c @@ -4733,7 +4733,6 @@ S_intuit_more(pTHX_ char *s, char *e, * */ if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) { - Size_t len; /* khw: where did the magic number 4 come from?. 
This buffer * was 4 times as large as tokenbuf in 1997, and had not @@ -4749,6 +4748,7 @@ S_intuit_more(pTHX_ char *s, char *e, return false; } + Size_t len; /* (C++ forbids joining these 2 lines) */ len = strlen(tmpbuf + 1); /* khw: This only looks at global variables; lexicals came From 5d79145ed13618e67b642268fc60c85d5c750e9c Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 27 Oct 2025 10:44:02 -0600 Subject: [PATCH 06/12] intuit_more: Look for more variable types. Using \w here missed the possibility of lots of other syntaxes of variables, like $::foo or ${foo}, that scan_ident looks for. So call scan_ident without first filtering what it looks at. It is more convenient to then swap the code block that handles punctuation variables with the code block that handles other length 1 identifiers --- toke.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/toke.c b/toke.c index 41a6d61e90ef..15c8b3530684 100644 --- a/toke.c +++ b/toke.c @@ -4732,7 +4732,6 @@ S_intuit_more(pTHX_ char *s, char *e, * looks for. * */ - if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) { /* khw: where did the magic number 4 come from?. This buffer * was 4 times as large as tokenbuf in 1997, and had not @@ -4772,6 +4771,20 @@ S_intuit_more(pTHX_ char *s, char *e, * like $subscripts{$which}. We should advance past the * braces and key */ } + else if (len == 1) { + if ( s[0] == '$' + && s[1] + && memCHRs("[#!%*<>()-=", tmpbuf[1])) + { + /* Here we have what could be a punctuation variable. If the + * next character after it is a closing bracket, it makes it + * quite likely to be that, and hence a subscript. If it is + * something else, more mildly a subscript */ + if (/*{*/ memCHRs("])} =", tmpbuf[2])) + weight -= 10; + else + weight -= 1; + } else { /* Not a multi-char identifier already known in the * program; is somewhat likely to be a subscript. 
@@ -4787,19 +4800,6 @@ S_intuit_more(pTHX_ char *s, char *e, weight -= 10; } } - else if ( s[0] == '$' - && s[1] - && memCHRs("[#!%*<>()-=", s[1])) - { - /* Here we have what could be a punctuation variable. If the - * next character after it is a closing bracket, it makes it - * quite likely to be that, and hence a subscript. If it is - * something else, more mildly a subscript */ - if (/*{*/ memCHRs("])} =", s[2])) - weight -= 10; - else - weight -= 1; - } break; /* khw: [:blank:] strongly indicates a charclass */ From 26848f00acb130b61d3fdd9cc1f535179dd192e6 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 27 Oct 2025 10:58:17 -0600 Subject: [PATCH 07/12] intuit_more: White-space only Outdent, as the previous commit removed a block; and one block we indent for a future new block --- toke.c | 103 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 51 insertions(+), 52 deletions(-) diff --git a/toke.c b/toke.c index 15c8b3530684..b683573c3f38 100644 --- a/toke.c +++ b/toke.c @@ -4733,48 +4733,48 @@ S_intuit_more(pTHX_ char *s, char *e, * */ - /* khw: where did the magic number 4 come from?. This buffer - * was 4 times as large as tokenbuf in 1997, and had not - * changed since the code was first added */ - char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ]; - - /* (Reserve tmpbuf[0] for future commits) */ - if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf), - CHECK_ONLY)) - { - /* An illegal identifier means this can't be a subscript; - * it's an error or it could be a charclass */ - return false; - } + /* khw: where did the magic number 4 come from?. This buffer + * was 4 times as large as tokenbuf in 1997, and had not + * changed since the code was first added */ + char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ]; + + /* (Reserve tmpbuf[0] for future commits) */ + if (! 
scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf), + CHECK_ONLY)) + { + /* An illegal identifier means this can't be a subscript; + * it's an error or it could be a charclass */ + return false; + } - Size_t len; /* (C++ forbids joining these 2 lines) */ - len = strlen(tmpbuf + 1); + Size_t len; /* (C++ forbids joining these 2 lines) */ + len = strlen(tmpbuf + 1); - /* khw: This only looks at global variables; lexicals came - * later, and this hasn't been updated. Ouch!! */ - if ( len > 1 - && gv_fetchpvn_flags(tmpbuf + 1, - len, - UTF ? SVf_UTF8 : 0, - SVt_PV)) - { + /* khw: This only looks at global variables; lexicals came + * later, and this hasn't been updated. Ouch!! */ + if ( len > 1 + && gv_fetchpvn_flags(tmpbuf + 1, + len, + UTF ? SVf_UTF8 : 0, + SVt_PV)) + { weight -= 100; - /* khw: Below we keep track of repeated characters; People - * rarely say qr/[aba]/, as the second a is pointless. - * (Some do it though as a mnemonic that is meaningful to - * them.) But generally, repeated characters make things - * more likely to be a charclass. But here, this an - * identifier so likely a subscript. Its spelling should - * be irrelevant to the repeated characters test. So, we - * should advance past it. Suppose it is a hash element, - * like $subscripts{$which}. We should advance past the - * braces and key */ - } - else if (len == 1) { - if ( s[0] == '$' - && s[1] - && memCHRs("[#!%*<>()-=", tmpbuf[1])) + /* khw: Below we keep track of repeated characters; + * People rarely say qr/[aba]/, as the second a is + * pointless. (Some do it though as a mnemonic that is + * meaningful to them.) But generally, repeated characters + * make things more likely to be a charclass. But here, + * this an identifier so likely a subscript. Its spelling + * should be irrelevant to the repeated characters test. + * So, we should advance past it. Suppose it is a hash + * element, like $subscripts{$which}. 
We should advance + * past the braces and key */ + } + else /* len == 1 */ + if ( s[0] == '$' + && s[1] + && memCHRs("[#!%*<>()-=", tmpbuf[1])) { /* Here we have what could be a punctuation variable. If the * next character after it is a closing bracket, it makes it @@ -4785,20 +4785,19 @@ S_intuit_more(pTHX_ char *s, char *e, else weight -= 1; } - else { - /* Not a multi-char identifier already known in the - * program; is somewhat likely to be a subscript. - * - * khw: Our test suite contains several constructs like - * [$A-Z]. Excluding length 1 identifiers in the - * conditional above means such are much less likely to be - * mistaken for subscripts. I would argue that if the next - * character is a '-' followed by an alpha, that would make - * it much more likely to be a charclass. It would only - * make sense to be an expression if that alpha string is a - * bareword with meaning; something like [$A-ord] */ - weight -= 10; - } + else { /* len == 1 */ + /* Not a multi-char identifier already known in the program; + * is somewhat likely to be a subscript. + * + * khw: Our test suite contains several constructs like + * [$A-Z]. Excluding length 1 identifiers in the conditional + * above means such are much less likely to be mistaken for + * subscripts. I would argue that if the next character is a + * '-' followed by an alpha, that would make it much more + * likely to be a charclass. It would only make sense to be + * an expression if that alpha string is a bareword with + * meaning; something like [$A-ord] */ + weight -= 10; } break; From 7eb08b775b3db413007e0d1e2dd92a9327731230 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 27 Oct 2025 11:13:28 -0600 Subject: [PATCH 08/12] intuit_more: Remove over allocation of buffer I don't know why the code here uses a buffer 4 times bigger than what is the maximum an identifier can be. 
One might think this is because Unicode characters can be 4 bytes long, but this number has been in effect since before Perl knew about Unicode; or maybe it was to avoid a potential buffer overflow. But in any event, I'm pretty certain there is no need for that now. The buffer is now allocated to handle the maximum Unicode-needed size, and buffer overflow is checked for and avoided. --- toke.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/toke.c b/toke.c index b683573c3f38..ff2337031179 100644 --- a/toke.c +++ b/toke.c @@ -4733,15 +4733,13 @@ S_intuit_more(pTHX_ char *s, char *e, * */ - /* khw: where did the magic number 4 come from?. This buffer - * was 4 times as large as tokenbuf in 1997, and had not - * changed since the code was first added */ - char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ]; + /* (Reserve tmpbuf[0] for future commits, hence +1) */ + char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) + 1 ]; - /* (Reserve tmpbuf[0] for future commits) */ if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf), CHECK_ONLY)) { + /* An illegal identifier means this can't be a subscript; * it's an error or it could be a charclass */ return false; From b61e5b86a75065f7418a3d5e5344b2ac3b8e0b47 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 27 Oct 2025 11:19:11 -0600 Subject: [PATCH 09/12] intuit_more: Save scan_ident return It returns more than a boolean; save the actual return for use in future commits --- toke.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/toke.c b/toke.c index ff2337031179..ae1f25ca99fc 100644 --- a/toke.c +++ b/toke.c @@ -4735,10 +4735,13 @@ S_intuit_more(pTHX_ char *s, char *e, /* (Reserve tmpbuf[0] for future commits, hence +1) */ char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) + 1 ]; + char * s_after_ident; - if (! 
scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf), - CHECK_ONLY)) - { + /* scan_ident returns NULL if the input looks like an identifier + * that is illegal, e.g., it is too long or is like $001. */ + s_after_ident = scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf), + CHECK_ONLY); + if (s_after_ident == NULL) { /* An illegal identifier means this can't be a subscript; * it's an error or it could be a charclass */ From ac4d84e4a33c01ff45b7001dec6000eb55230366 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 27 Oct 2025 17:11:42 -0600 Subject: [PATCH 10/12] intuit_more: 0 length identifier means nothing more If scan_ident indicates that $ @ & are followed by nothing that looks like an identifier, then this isn't an expression. It has to be a character class or an error. Almost anything is an identifier when 'use utf8' isn't in effect; when it is, non ASCII has to be an Identifier Start character following these --- toke.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/toke.c b/toke.c index ae1f25ca99fc..2de59c87939a 100644 --- a/toke.c +++ b/toke.c @@ -4751,6 +4751,13 @@ S_intuit_more(pTHX_ char *s, char *e, Size_t len; /* (C++ forbids joining these 2 lines) */ len = strlen(tmpbuf + 1); + /* If it doesn't look like an identifier at all, scan_ident will + * set tmpbuf[1] to NUL. This is either an error or a character + * class. */ + if (len == 0) { + return false; + } + /* khw: This only looks at global variables; lexicals came * later, and this hasn't been updated. Ouch!! */ if ( len > 1 @@ -4772,10 +4779,9 @@ S_intuit_more(pTHX_ char *s, char *e, * element, like $subscripts{$which}. We should advance * past the braces and key */ } - else /* len == 1 */ - if ( s[0] == '$' - && s[1] - && memCHRs("[#!%*<>()-=", tmpbuf[1])) + else if ( len == 1 + && s[0] == '$' + && memCHRs("[#!%*<>()-=", tmpbuf[1])) { /* Here we have what could be a punctuation variable. 
If the * next character after it is a closing bracket, it makes it @@ -4786,7 +4792,7 @@ S_intuit_more(pTHX_ char *s, char *e, else weight -= 1; } - else { /* len == 1 */ + else { /* Not a multi-char identifier already known in the program; * is somewhat likely to be a subscript. * From 5e9f08230f8654620450927e070fc4c62ac1f0bd Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 27 Oct 2025 17:18:12 -0600 Subject: [PATCH 11/12] intuit_more: Handle numeric identifiers This function was totally unaware of the possibility of these. --- toke.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/toke.c b/toke.c index 2de59c87939a..1edde19afe6e 100644 --- a/toke.c +++ b/toke.c @@ -4758,13 +4758,70 @@ S_intuit_more(pTHX_ char *s, char *e, return false; } - /* khw: This only looks at global variables; lexicals came - * later, and this hasn't been updated. Ouch!! */ - if ( len > 1 - && gv_fetchpvn_flags(tmpbuf + 1, - len, - UTF ? SVf_UTF8 : 0, - SVt_PV)) + /* If there is extra stuff in the source, like braces, it means + * this is almost definitely intended to be an identifier */ + bool decorated; + decorated = (Size_t) (s_after_ident - s) > len; + + if (isDIGIT_A(tmpbuf[1])) { + + /* &41 and &4b are illegal subroutine names so is an error or + * a charclass */ + if (s[0] == '&') { + return false; + } + + /* Here, matches [$@]\d+. If the next input character is a + * \w, we would have something like $456x, which is an illegal + * identifier, so is an error or a charclass */ + if ( ! decorated + && isWORDCHAR_lazy_if_safe(s_after_ident, + PL_bufend, UTF)) + { + return false; + } + + /* We don't get here if this potential identifier starts with + * leading zeros, due to the logic in scan_ident. */ + assert(len == 1 || tmpbuf[0] != '0'); + + /* The chances are vanishingly small that someone is going to + * want [$0] to expand to the program's name in a character + * class.
But, what would the program's name be doing as part + * of a subscript either? The only likely scenario is that + * this is meant to be a charclass matching either '$' or '0'. + * */ + if (tmpbuf[1] == '0') { + return false; + } + + /* Here it is either something like $1 which is supposed to + * match either dollar or 1, or it is supposed to expand to + * what is in $1 left over from a capturing group from the + * previous pattern match. In the latter case, it could be + * either a part of wanting to calculate a subscript, or to + * use as part of the contents of the character class. + * Larger (undecorated) numbers are much less likely to have + * had capturing groups, so they lean more towards a + * charclass. 100 is what this function has traditionally + * used for len>1; khw thinks there is no bias one way or the + * other for length 1 ones. But has chosen 100 for decorated + * identifiers + * + * XXX long enough identifiers could probably return false + * immediately here, rather than using weights. */ + if (decorated || len > 1) { + weight -= 100; + } + } + else if ( len > 1 + /* khw: This only looks at global variables; lexicals + * came later, and this hasn't been updated. Ouch!! + * */ + && gv_fetchpvn_flags(tmpbuf + 1, + len, + UTF ? SVf_UTF8 : 0, + SVt_PV)) { weight -= 100; @@ -4817,7 +4874,6 @@ S_intuit_more(pTHX_ char *s, char *e, * \? must be subscript for things like \d, but not \a. */ - case '\\': if (s[1] == '\0') { /* \ followed by NUL strongly indicates character class */ From 9f313046b39df4c4f6b352637f40d05b1f42ca66 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 27 Oct 2025 17:51:23 -0600 Subject: [PATCH 12/12] intuit_more: Check if identifier exists The code attempted to do this, but was written before lexical variables existed, and had never been updated.
--- toke.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/toke.c b/toke.c index 1edde19afe6e..d5d06d3748cd 100644 --- a/toke.c +++ b/toke.c @@ -4814,15 +4814,15 @@ S_intuit_more(pTHX_ char *s, char *e, weight -= 100; } } - else if ( len > 1 - /* khw: This only looks at global variables; lexicals - * came later, and this hasn't been updated. Ouch!! - * */ - && gv_fetchpvn_flags(tmpbuf + 1, - len, - UTF ? SVf_UTF8 : 0, - SVt_PV)) - { + else if (len > 1) { + /* See if there is a known identifier of the given kind. For + * arrays, this might also be a reference to one of its + * elements. XXX Maybe the latter should require a following + * '[' */ + if ( is_existing_identifier(tmpbuf, len, s[0], UTF) + || ( s[0] == '$' + && is_existing_identifier(tmpbuf, len, '@', UTF))) + { weight -= 100; /* khw: Below we keep track of repeated characters; @@ -4835,6 +4835,16 @@ S_intuit_more(pTHX_ char *s, char *e, * So, we should advance past it. Suppose it is a hash * element, like $subscripts{$which}. We should advance * past the braces and key */ + } + else { /* Isn't a known identifier */ + /* Under strict, this means an error. */ + if (under_strict_vars) { + return false; + } + + /* Otherwise still somewhat likely to be a subscript */ + weight -= 10; + } } else if ( len == 1 && s[0] == '$'