From 3b86b9c35fe971c99d42c1e18691823f43bbab3f Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Sun, 17 Aug 2025 16:22:26 -0600
Subject: [PATCH 01/12] toke.c: S_intuit_more: Simplify a bit of code

---
 toke.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/toke.c b/toke.c
index aef854a9e641..70bc026cba4f 100644
--- a/toke.c
+++ b/toke.c
@@ -4590,10 +4590,7 @@ S_intuit_more(pTHX_ char *s, char *e,
      * written, and regcurly never required a comma, as in {0}.  Probably it is
      * ok as-is */
     if (s[0] == '{') {
-        if (regcurly(s, e, NULL)) {
-            return FALSE;
-        }
-        return TRUE;
+        return ! regcurly(s, e, NULL);
     }
 
     /* Here is '[': maybe we have a character class.  Examine the guts */

From b302a1970435cfff84ab0b95dae512e8df0e3e6d Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Fri, 5 Sep 2025 16:01:32 -0600
Subject: [PATCH 02/12] intuit_more: Handle easy thing first

This swaps the order of the two branches of a conditional, so that the
simple case (a backslash followed by NUL) is handled first.
---
 toke.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/toke.c b/toke.c
index 70bc026cba4f..72db7fa55fbc 100644
--- a/toke.c
+++ b/toke.c
@@ -4811,8 +4811,11 @@ S_intuit_more(pTHX_ char *s, char *e,
 
 
           case '\\':
-            if (s[1]) {
-                if (memCHRs("wds]", s[1])) {
+            if (s[1] == '\0') {
+                /* \ followed by NUL strongly indicates character class */
+                weight += 100;
+            }
+            else if (memCHRs("wds]", s[1])) {
                     weight += 100;  /* \w \d \s => strongly charclass */
                     /* khw: \] can't happen, as any ']' is beyond our search.
                      * Why not \W \D \S \h \v, etc as well?  Should they have
@@ -4866,9 +4869,6 @@ S_intuit_more(pTHX_ char *s, char *e,
                  * purposes of the 'seen' array.  Whatever is matched by these
                  * backslashed sequences should not be added to 'seen'.  That
                  * includes the backslash. */
-            }
-            else /* \ followed by NUL strongly indicates character class */
-                weight += 100;
             break;
 
           case '-':

From eb6d4674b453e234bf7b43a5ef4b2661719de940 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Fri, 5 Sep 2025 16:03:00 -0600
Subject: [PATCH 03/12] intuit_more: White space only

The previous commit allows this one to outdent things.
---
 toke.c | 103 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 51 insertions(+), 52 deletions(-)

diff --git a/toke.c b/toke.c
index 72db7fa55fbc..b3764e74f99b 100644
--- a/toke.c
+++ b/toke.c
@@ -4816,59 +4816,58 @@ S_intuit_more(pTHX_ char *s, char *e,
                 weight += 100;
             }
             else if (memCHRs("wds]", s[1])) {
-                    weight += 100;  /* \w \d \s => strongly charclass */
-                    /* khw: \] can't happen, as any ']' is beyond our search.
-                     * Why not \W \D \S \h \v, etc as well?  Should they have
-                     * the same weights as \w \d \s or should all or some be
-                     * in the 'abcfnrtvx' below? */
-                } else if (seen[(U8)'\''] || seen[(U8)'"']) {
-                    weight += 1;
-                    /* khw: This is problematic.  Enough so, that I misread
-                     * it, and added a wrong comment about what it does in
-                     * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee.  Note that it
-                     * doesn't look at the current character.  What it
-                     * actually does is: if any quote has been seen in the
-                     * parse, don't do the rest of the else's below, but for
-                     * every subsequent backslashed character encountered
-                     * (except \0 \w \s \d), increment the weight to lean a
-                     * bit more towards being a charclass.  That means that
-                     * every backslash sequence following the first occurrence
-                     * of a quote increments the weight regardless of what the
-                     * sequence is.  Again, \0 \w \d and \s are not controlled
-                     * by this else, so they change the weight by a lot more.
-                     * But what makes them so special that they aren't subject
-                     * to this.  Any why does having a quote change the
-                     * behavior from then on.  And why only backslashed
-                     * sequences get this treatment?  This code has been
-                     * unchanged since this function was added in 1993.  I
-                     * don't get it.  Instead, it does seem to me that it is
-                     * especially unlikely to repeat a quote in a charclass,
-                     * but that having just a single quote is indicative of a
-                     * charclass, and having pairs of quotes is indicative of
-                     * a subscript.  Similarly for things that could indicate
-                     * nesting of braces or parens. */
-                }
-                else if (memCHRs("abcfnrtvx", s[1]))
-                    weight += 40;   /* \n, etc => charclass */
-                    /* khw: Why not \e etc as well? */
-                else if (isDIGIT(s[1])) {
-                    weight += 40;   /* \123 => charclass */
-                    while (s[1] && isDIGIT(s[1]))
-                        s++;
-                }
+                weight += 100;  /* \w \d \s => strongly charclass */
+                /* khw: \] can't happen, as any ']' is beyond our search.  Why
+                 * not \W \D \S \h \v, etc as well?  Should they have the same
+                 * weights as \w \d \s or should all or some be in the
+                 * 'abcfnrtvx' below? */
+            }
+            else if (seen[(U8)'\''] || seen[(U8)'"']) {
+                weight += 1;
+                /* khw: This is problematic.  Enough so, that I misread it,
+                 * and added a wrong comment about what it does in
+                 * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee.  Note that it
+                 * doesn't look at the current character.  What it actually
+                 * does is: if any quote has been seen in the parse, don't do
+                 * the rest of the else's below, but for every subsequent
+                 * backslashed character encountered (except \0 \w \s \d),
+                 * increment the weight to lean a bit more towards being a
+                 * charclass.  That means that every backslash sequence
+                 * following the first occurrence of a quote increments the
+                 * weight regardless of what the sequence is.  Again, \0 \w \d
+                 * and \s are not controlled by this else, so they change the
+                 * weight by a lot more.  But what makes them so special that
+                 * they aren't subject to this.  Any why does having a quote
+                 * change the behavior from then on.  And why only backslashed
+                 * sequences get this treatment?  This code has been unchanged
+                 * since this function was added in 1993.  I don't get it.
+                 * Instead, it does seem to me that it is especially unlikely
+                 * to repeat a quote in a charclass, but that having just a
+                 * single quote is indicative of a charclass, and having pairs
+                 * of quotes is indicative of a subscript.  Similarly for
+                 * things that could indicate nesting of braces or parens. */
+            }
+            else if (memCHRs("abcfnrtvx", s[1]))
+                weight += 40;   /* \n, etc => charclass */
+                /* khw: Why not \e etc as well? */
+            else if (isDIGIT(s[1])) {
+                weight += 40;   /* \123 => charclass */
+                while (s[1] && isDIGIT(s[1]))
+                    s++;
+            }
 
-                /* khw: There are lots more possible escape sequences.  Some,
-                 * like \A,\z have no special meaning to charclasses, so might
-                 * indicate a subscript, but I don't know what they would be
-                 * doing there either.  Some have been added to the language
-                 * after this code was written, but no one thought to, or
-                 * could wade through this function, to add them.  Things like
-                 * \p{} for properties, \N and \N{}, for example.
-                 *
-                 * It's problematic that \a is treated as plain 'a' for
-                 * purposes of the 'seen' array.  Whatever is matched by these
-                 * backslashed sequences should not be added to 'seen'.  That
-                 * includes the backslash. */
+            /* khw: There are lots more possible escape sequences.  Some, like
+             * \A,\z have no special meaning to charclasses, so might indicate
+             * a subscript, but I don't know what they would be doing there
+             * either.  Some have been added to the language after this code
+             * was written, but no one thought to, or could wade through this
+             * function, to add them.  Things like \p{} for properties, \N and
+             * \N{}, for example.
+             *
+             * It's problematic that \a is treated as plain 'a' for purposes
+             * of the 'seen' array.  Whatever is matched by these backslashed
+             * sequences should not be added to 'seen'.  That includes the
+             * backslash. */
             break;
 
           case '-':

From 5eeec5ecaac8a940f98bad379715093595686fe6 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 27 Oct 2025 04:19:29 -0600
Subject: [PATCH 04/12] intuit_more: Reserve first byte of buffer

Future commits will want to use this space.
---
 toke.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/toke.c b/toke.c
index b3764e74f99b..fbff10d5b467 100644
--- a/toke.c
+++ b/toke.c
@@ -4740,19 +4740,21 @@ S_intuit_more(pTHX_ char *s, char *e,
                  * changed since the code was first added */
                 char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ];
 
-                if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY))
+                /* (Reserve tmpbuf[0] for future commits) */
+                if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf),
+                                 CHECK_ONLY))
                 {
                     /* An illegal identifier means this can't be a subscript;
                      * it's an error or it could be a charclass */
                     return false;
                 }
 
-                len = strlen(tmpbuf);
+                len = strlen(tmpbuf + 1);
 
                 /* khw: This only looks at global variables; lexicals came
                  * later, and this hasn't been updated.  Ouch!! */
                 if (   len > 1
-                    && gv_fetchpvn_flags(tmpbuf,
+                    && gv_fetchpvn_flags(tmpbuf + 1,
                                          len,
                                          UTF ? SVf_UTF8 : 0,
                                          SVt_PV))

From 2aeccb953b3e28495f53e605427520fbf40aaaf4 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 27 Oct 2025 10:08:35 -0600
Subject: [PATCH 05/12] intuit_more: Move declaration to first use

---
 toke.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/toke.c b/toke.c
index fbff10d5b467..41a6d61e90ef 100644
--- a/toke.c
+++ b/toke.c
@@ -4733,7 +4733,6 @@ S_intuit_more(pTHX_ char *s, char *e,
              *
              */
             if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
-                Size_t len;
 
                 /* khw: where did the magic number 4 come from?.  This buffer
                  * was 4 times as large as tokenbuf in 1997, and had not
@@ -4749,6 +4748,7 @@ S_intuit_more(pTHX_ char *s, char *e,
                     return false;
                 }
 
+                Size_t len; /* (C++ forbids joining these 2 lines) */
                 len = strlen(tmpbuf + 1);
 
                 /* khw: This only looks at global variables; lexicals came

From 5d79145ed13618e67b642268fc60c85d5c750e9c Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 27 Oct 2025 10:44:02 -0600
Subject: [PATCH 06/12] intuit_more: Look for more variable types.

Using \w here missed the possibility of lots of other syntaxes of
variables, like $::foo or ${foo}, that scan_ident looks for.  So call
scan_ident without first filtering what it looks at.

It is more convenient to then swap the code block that handles
punctuation variables with the code block that handles other length 1
identifiers
---
 toke.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/toke.c b/toke.c
index 41a6d61e90ef..15c8b3530684 100644
--- a/toke.c
+++ b/toke.c
@@ -4732,7 +4732,6 @@ S_intuit_more(pTHX_ char *s, char *e,
              * looks for.
              *
              */
-            if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
 
                 /* khw: where did the magic number 4 come from?.  This buffer
                  * was 4 times as large as tokenbuf in 1997, and had not
@@ -4772,6 +4771,20 @@ S_intuit_more(pTHX_ char *s, char *e,
                      * like $subscripts{$which}.  We should advance past the
                      * braces and key */
                 }
+                else if (len == 1) {
+                 if (   s[0] == '$'
+                     && s[1]
+                     && memCHRs("[#!%*<>()-=", tmpbuf[1]))
+            {
+                /* Here we have what could be a punctuation variable.  Look at
+                 * the source character following a plain $X (tmpbuf[2] is
+                 * always NUL when len is 1): a closing bracket, space or '='
+                 * makes a subscript quite likely; anything else mildly so */
+                if (/*{*/ memCHRs("])} =", s[2]))
+                    weight -= 10;
+                else
+                    weight -= 1;
+            }
                 else {
                     /* Not a multi-char identifier already known in the
                      * program; is somewhat likely to be a subscript.
@@ -4787,19 +4800,6 @@ S_intuit_more(pTHX_ char *s, char *e,
                     weight -= 10;
                 }
             }
-            else if (   s[0] == '$'
-                     && s[1]
-                     && memCHRs("[#!%*<>()-=", s[1]))
-            {
-                /* Here we have what could be a punctuation variable.  If the
-                 * next character after it is a closing bracket, it makes it
-                 * quite likely to be that, and hence a subscript.  If it is
-                 * something else, more mildly a subscript */
-                if (/*{*/ memCHRs("])} =", s[2]))
-                    weight -= 10;
-                else
-                    weight -= 1;
-            }
             break;
 
           /* khw:  [:blank:] strongly indicates a charclass */

From 26848f00acb130b61d3fdd9cc1f535179dd192e6 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 27 Oct 2025 10:58:17 -0600
Subject: [PATCH 07/12] intuit_more: White-space only

Outdent, as the previous commit removed a block.  One block is instead
indented, in preparation for a new block to be added in a future commit.
---
 toke.c | 103 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 51 insertions(+), 52 deletions(-)

diff --git a/toke.c b/toke.c
index 15c8b3530684..b683573c3f38 100644
--- a/toke.c
+++ b/toke.c
@@ -4733,48 +4733,48 @@ S_intuit_more(pTHX_ char *s, char *e,
              *
              */
 
-                /* khw: where did the magic number 4 come from?.  This buffer
-                 * was 4 times as large as tokenbuf in 1997, and had not
-                 * changed since the code was first added */
-                char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ];
-
-                /* (Reserve tmpbuf[0] for future commits) */
-                if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf),
-                                 CHECK_ONLY))
-                {
-                    /* An illegal identifier means this can't be a subscript;
-                     * it's an error or it could be a charclass */
-                    return false;
-                }
+            /* khw: where did the magic number 4 come from?.  This buffer
+             * was 4 times as large as tokenbuf in 1997, and had not
+             * changed since the code was first added */
+            char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ];
+
+            /* (Reserve tmpbuf[0] for future commits) */
+            if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf),
+                             CHECK_ONLY))
+            {
+                /* An illegal identifier means this can't be a subscript;
+                 * it's an error or it could be a charclass */
+                return false;
+            }
 
-                Size_t len; /* (C++ forbids joining these 2 lines) */
-                len = strlen(tmpbuf + 1);
+            Size_t len; /* (C++ forbids joining these 2 lines) */
+            len = strlen(tmpbuf + 1);
 
-                /* khw: This only looks at global variables; lexicals came
-                 * later, and this hasn't been updated.  Ouch!! */
-                if (   len > 1
-                    && gv_fetchpvn_flags(tmpbuf + 1,
-                                         len,
-                                         UTF ? SVf_UTF8 : 0,
-                                         SVt_PV))
-                {
+            /* khw: This only looks at global variables; lexicals came
+             * later, and this hasn't been updated.  Ouch!! */
+            if (   len > 1
+                && gv_fetchpvn_flags(tmpbuf + 1,
+                                     len,
+                                     UTF ? SVf_UTF8 : 0,
+                                     SVt_PV))
+            {
                     weight -= 100;
 
-                    /* khw: Below we keep track of repeated characters;  People
-                     * rarely say qr/[aba]/, as the second a is pointless.
-                     * (Some do it though as a mnemonic that is meaningful to
-                     * them.)  But generally, repeated characters make things
-                     * more likely to be a charclass.  But here, this an
-                     * identifier so likely a subscript.  Its spelling should
-                     * be irrelevant to the repeated characters test.  So, we
-                     * should advance past it.  Suppose it is a hash element,
-                     * like $subscripts{$which}.  We should advance past the
-                     * braces and key */
-                }
-                else if (len == 1) {
-                 if (   s[0] == '$'
-                     && s[1]
-                     && memCHRs("[#!%*<>()-=", tmpbuf[1]))
+                    /* khw: Below we keep track of repeated characters;
+                     * People rarely say qr/[aba]/, as the second a is
+                     * pointless.  (Some do it though as a mnemonic that is
+                     * meaningful to them.) But generally, repeated characters
+                     * make things more likely to be a charclass.  But here,
+                     * this an identifier so likely a subscript.  Its spelling
+                     * should be irrelevant to the repeated characters test.
+                     * So, we should advance past it.  Suppose it is a hash
+                     * element, like $subscripts{$which}.  We should advance
+                     * past the braces and key */
+            }
+            else /* len == 1 */
+               if (   s[0] == '$'
+                   && s[1]
+                   && memCHRs("[#!%*<>()-=", tmpbuf[1]))
             {
                 /* Here we have what could be a punctuation variable.  If the
                  * next character after it is a closing bracket, it makes it
@@ -4785,20 +4785,19 @@ S_intuit_more(pTHX_ char *s, char *e,
                 else
                     weight -= 1;
             }
-                else {
-                    /* Not a multi-char identifier already known in the
-                     * program; is somewhat likely to be a subscript.
-                     *
-                     * khw: Our test suite contains several constructs like
-                     * [$A-Z].  Excluding length 1 identifiers in the
-                     * conditional above means such are much less likely to be
-                     * mistaken for subscripts.  I would argue that if the next
-                     * character is a '-' followed by an alpha, that would make
-                     * it much more likely to be a charclass.  It would only
-                     * make sense to be an expression if that alpha string is a
-                     * bareword with meaning; something like [$A-ord] */
-                    weight -= 10;
-                }
+            else { /* len == 1 */
+                /* Not a multi-char identifier already known in the program;
+                 * is somewhat likely to be a subscript.
+                 *
+                 * khw: Our test suite contains several constructs like
+                 * [$A-Z].  Excluding length 1 identifiers in the conditional
+                 * above means such are much less likely to be mistaken for
+                 * subscripts.  I would argue that if the next character is a
+                 * '-' followed by an alpha, that would make it much more
+                 * likely to be a charclass.  It would only make sense to be
+                 * an expression if that alpha string is a bareword with
+                 * meaning; something like [$A-ord] */
+                weight -= 10;
             }
             break;
 

From 7eb08b775b3db413007e0d1e2dd92a9327731230 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 27 Oct 2025 11:13:28 -0600
Subject: [PATCH 08/12] intuit_more: Remove over allocation of buffer

I don't know why the code here uses a buffer 4 times bigger than what is
the maximum an identifier can be.  One might think this is because
Unicode characters can be 4 bytes long, but this number has been in
effect since before Perl knew about Unicode; or maybe it was to avoid
a potential buffer overflow.

But in any event, I'm pretty certain there is no need for that now.  The
buffer is now allocated to handle the maximum Unicode-needed size, and
buffer overflow is checked for and avoided.
---
 toke.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/toke.c b/toke.c
index b683573c3f38..ff2337031179 100644
--- a/toke.c
+++ b/toke.c
@@ -4733,15 +4733,13 @@ S_intuit_more(pTHX_ char *s, char *e,
              *
              */
 
-            /* khw: where did the magic number 4 come from?.  This buffer
-             * was 4 times as large as tokenbuf in 1997, and had not
-             * changed since the code was first added */
-            char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ];
+            /* (Reserve tmpbuf[0] for future commits, hence +1) */
+            char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) + 1 ];
 
-            /* (Reserve tmpbuf[0] for future commits) */
             if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf),
                              CHECK_ONLY))
             {
+
                 /* An illegal identifier means this can't be a subscript;
                  * it's an error or it could be a charclass */
                 return false;

From b61e5b86a75065f7418a3d5e5344b2ac3b8e0b47 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 27 Oct 2025 11:19:11 -0600
Subject: [PATCH 09/12] intuit_more: Save scan_ident return

It returns more than a boolean; save the actual return for use in future
commits
---
 toke.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/toke.c b/toke.c
index ff2337031179..ae1f25ca99fc 100644
--- a/toke.c
+++ b/toke.c
@@ -4735,10 +4735,13 @@ S_intuit_more(pTHX_ char *s, char *e,
 
             /* (Reserve tmpbuf[0] for future commits, hence +1) */
             char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) + 1 ];
+            char * s_after_ident;
 
-            if (! scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf),
-                             CHECK_ONLY))
-            {
+            /* scan_ident returns NULL if the input looks like an identifier
+             * that is illegal, e.g., it is too long or is like $001. */
+            s_after_ident = scan_ident(s, tmpbuf + 1, C_ARRAY_END(tmpbuf),
+                                       CHECK_ONLY);
+            if (s_after_ident == NULL) {
 
                 /* An illegal identifier means this can't be a subscript;
                  * it's an error or it could be a charclass */

From ac4d84e4a33c01ff45b7001dec6000eb55230366 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 27 Oct 2025 17:11:42 -0600
Subject: [PATCH 10/12] intuit_more: 0 length identifier means nothing more

If scan_ident indicates that $ @ & are followed by nothing that looks
like an identifier, then this isn't an expression.  It has to be a
character class or an error.

Almost anything is an identifier when 'use utf8' isn't in effect; when
it is, a non-ASCII character following these sigils has to be an
Identifier Start character.
---
 toke.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/toke.c b/toke.c
index ae1f25ca99fc..2de59c87939a 100644
--- a/toke.c
+++ b/toke.c
@@ -4751,6 +4751,13 @@ S_intuit_more(pTHX_ char *s, char *e,
             Size_t len; /* (C++ forbids joining these 2 lines) */
             len = strlen(tmpbuf + 1);
 
+            /* If it doesn't look like an identifier at all, scan_ident will
+             * set tmpbuf[1] to NUL.  This is either an error or a character
+             * class. */
+            if (len == 0) {
+                return false;
+            }
+
             /* khw: This only looks at global variables; lexicals came
              * later, and this hasn't been updated.  Ouch!! */
             if (   len > 1
@@ -4772,10 +4779,9 @@ S_intuit_more(pTHX_ char *s, char *e,
                      * element, like $subscripts{$which}.  We should advance
                      * past the braces and key */
             }
-            else /* len == 1 */
-               if (   s[0] == '$'
-                   && s[1]
-                   && memCHRs("[#!%*<>()-=", tmpbuf[1]))
+            else if (   len == 1
+                     && s[0] == '$'
+                     && memCHRs("[#!%*<>()-=", tmpbuf[1]))
             {
                 /* Here we have what could be a punctuation variable.  If the
                  * next character after it is a closing bracket, it makes it
@@ -4786,7 +4792,7 @@ S_intuit_more(pTHX_ char *s, char *e,
                 else
                     weight -= 1;
             }
-            else { /* len == 1 */
+            else {
                 /* Not a multi-char identifier already known in the program;
                  * is somewhat likely to be a subscript.
                  *

From 5e9f08230f8654620450927e070fc4c62ac1f0bd Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 27 Oct 2025 17:18:12 -0600
Subject: [PATCH 11/12] intuit_more: Handle numeric identifiers

This function was totally unaware of the possibility of these.
---
 toke.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/toke.c b/toke.c
index 2de59c87939a..1edde19afe6e 100644
--- a/toke.c
+++ b/toke.c
@@ -4758,13 +4758,70 @@ S_intuit_more(pTHX_ char *s, char *e,
                 return false;
             }
 
-            /* khw: This only looks at global variables; lexicals came
-             * later, and this hasn't been updated.  Ouch!! */
-            if (   len > 1
-                && gv_fetchpvn_flags(tmpbuf + 1,
-                                     len,
-                                     UTF ? SVf_UTF8 : 0,
-                                     SVt_PV))
+            /* If the source has extra stuff beyond the sigil and the name,
+             * like braces, this is almost definitely meant as an identifier */
+            bool decorated;
+            decorated = (Size_t) (s_after_ident - s) > len + 1;
+
+            if (isDIGIT_A(tmpbuf[1])) {
+
+                /* &41 and &4b are illegal subroutine names, so this is an
+                 * error or a charclass */
+                if (s[0] == '&') {
+                    return false;
+                }
+
+                /* Here, matches [$@]\d+.  If the next input character is a
+                 * \w, we would have something like $456x, which is an illegal
+                 * identifier, so this is an error or a charclass */
+                if ( ! decorated
+                    && isWORDCHAR_lazy_if_safe(s_after_ident,
+                                               PL_bufend, UTF))
+                {
+                    return false;
+                }
+
+                /* We don't get here if this potential identifier (which starts
+                 * at tmpbuf[1]) has leading zeros, due to logic in scan_ident */
+                assert(len == 1 || tmpbuf[1] != '0');
+
+                /* The chances are vanishingly small that someone is going to
+                 * want [$0] to expand to the program's name in a character
+                 * class.  But, what would the program's name be doing as part
+                 * of a subscript either?  The only likely scenario is that
+                 * this is meant to be a charclass matching either '$' or '0'.
+                 * */
+                if (tmpbuf[1] == '0') {
+                    return false;
+                }
+
+                /* Here it is either something like $1 which is supposed to
+                 * match either dollar or 1, or it is supposed to expand to
+                 * what is in $1 left over from a capturing group from the
+                 * previous pattern match.  In the latter case, it could be
+                 * either part of calculating a subscript, or meant to be
+                 * used as part of the contents of the character class.
+                 * Larger (undecorated) numbers are much less likely to have
+                 * had capturing groups, so they lean more towards a
+                 * charclass.  100 is what this function has traditionally
+                 * used for len>1; khw thinks there is no bias one way or the
+                 * other for length 1 ones, but has chosen 100 for decorated
+                 * identifiers.
+                 *
+                 * XXX long enough identifiers could probably return false
+                 * immediately here, rather than using weights. */
+                if (decorated || len > 1) {
+                        weight -= 100;
+                }
+            }
+            else if (   len > 1
+                         /* khw: This only looks at global variables; lexicals
+                          * came later, and this hasn't been updated.  Ouch!!
+                          * */
+                      && gv_fetchpvn_flags(tmpbuf + 1,
+                                           len,
+                                           UTF ? SVf_UTF8 : 0,
+                                           SVt_PV))
             {
                     weight -= 100;
 
@@ -4817,7 +4874,6 @@ S_intuit_more(pTHX_ char *s, char *e,
            *      \? must be subscript for things like \d, but not \a.
            */
 
-
           case '\\':
             if (s[1] == '\0') {
                 /* \ followed by NUL strongly indicates character class */

From 9f313046b39df4c4f6b352637f40d05b1f42ca66 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 27 Oct 2025 17:51:23 -0600
Subject: [PATCH 12/12] intuit_more: Check if identifier exists

The code attempted to do this, but was written before lexical variables
existed, and had never been updated.
---
 toke.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/toke.c b/toke.c
index 1edde19afe6e..d5d06d3748cd 100644
--- a/toke.c
+++ b/toke.c
@@ -4814,15 +4814,15 @@ S_intuit_more(pTHX_ char *s, char *e,
                         weight -= 100;
                 }
             }
-            else if (   len > 1
-                         /* khw: This only looks at global variables; lexicals
-                          * came later, and this hasn't been updated.  Ouch!!
-                          * */
-                      && gv_fetchpvn_flags(tmpbuf + 1,
-                                           len,
-                                           UTF ? SVf_UTF8 : 0,
-                                           SVt_PV))
-            {
+            else if (len > 1) {
+                /* See if there is a known identifier of the given kind.  For
+                 * arrays, this might also be a reference to one of its
+                 * elements.   XXX Maybe the latter should require a following
+                 * '[' */
+                if (   is_existing_identifier(tmpbuf, len, s[0], UTF)
+                    || (   s[0] == '$'
+                        && is_existing_identifier(tmpbuf, len, '@', UTF)))
+                {
                     weight -= 100;
 
                     /* khw: Below we keep track of repeated characters;
@@ -4835,6 +4835,16 @@ S_intuit_more(pTHX_ char *s, char *e,
                      * So, we should advance past it.  Suppose it is a hash
                      * element, like $subscripts{$which}.  We should advance
                      * past the braces and key */
+                }
+                else {  /* Isn't a known identifier */
+                    /* Under strict, this means an error. */
+                    if (under_strict_vars) {
+                        return false;
+                    }
+
+                    /* Otherwise still somewhat likely to be a subscript */
+                    weight -= 10;
+                }
             }
             else if (   len == 1
                      && s[0] == '$'