|  | 
| 22 | 22 | import com.google.javascript.jscomp.parsing.parser.util.ErrorReporter; | 
| 23 | 23 | import com.google.javascript.jscomp.parsing.parser.util.SourcePosition; | 
| 24 | 24 | import com.google.javascript.jscomp.parsing.parser.util.SourceRange; | 
|  | 25 | +import com.google.javascript.jscomp.parsing.parser.util.UnicodeMatch; | 
| 25 | 26 | import java.util.ArrayList; | 
| 26 | 27 | import javax.annotation.Nullable; | 
| 27 | 28 | 
 | 
| @@ -869,111 +870,18 @@ private static String processUnicodeEscapes(String value) { | 
| 869 | 870 |     return value; | 
| 870 | 871 |   } | 
| 871 | 872 | 
 | 
| 872 |  | -  @SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code | 
|  | 873 | +  /** | 
|  | 874 | +   * Delegates to UnicodeMatch.isJavascriptIdentifierStart, which is expected to retain the old ASCII fast path (TODO confirm). | 
|  | 875 | +   */ | 
| 873 | 876 |   private static boolean isIdentifierStart(char ch) { | 
| 874 |  | -    // Most code is written in pure ASCII, so create a fast path here. | 
| 875 |  | -    if (ch <= 127) { | 
| 876 |  | -      // Intentionally avoiding short circuiting behavior of "||" and "&&". | 
| 877 |  | -      // This minimizes branches in this code which minimizes branch prediction misses. | 
| 878 |  | -      return ((ch >= 'A' & ch <= 'Z') | (ch >= 'a' & ch <= 'z') | (ch == '_' | ch == '$')); | 
| 879 |  | -    } | 
| 880 |  | - | 
| 881 |  | -    // Handle non-ASCII characters. | 
| 882 |  | -    // TODO(tjgq): This should include all characters with the ID_Start property. | 
| 883 |  | -    if (Character.isLetter(ch)) { | 
| 884 |  | -      return true; | 
| 885 |  | -    } | 
| 886 |  | - | 
| 887 |  | -    // Workaround for b/36459436. | 
| 888 |  | -    // When running under GWT/J2CL, Character.isLetter only handles ASCII. | 
| 889 |  | -    // Angular relies heavily on Latin Small Letter Barred O and Greek Capital Letter Delta. | 
| 890 |  | -    // Greek letters are occasionally found in math code. | 
| 891 |  | -    // Latin letters are found in our own tests. | 
| 892 |  | -    return (ch >= 0x00C0 & ch <= 0x00D6) // Latin letters | 
| 893 |  | -        // 0x00D7 = multiplication sign, not a letter | 
| 894 |  | -        | (ch >= 0x00D8 & ch <= 0x00F6) // Latin letters | 
| 895 |  | -        // 0x00F7 = division sign, not a letter | 
| 896 |  | -        | (ch >= 0x00F8 & ch <= 0x00FF) // Latin letters | 
| 897 |  | -        | ch == 0x0275 // Latin Barred O | 
| 898 |  | -        | (ch >= 0x0391 & ch <= 0x03A1) // Greek uppercase letters | 
| 899 |  | -        // 0x03A2 = unassigned | 
| 900 |  | -        | (ch >= 0x03A3 & ch <= 0x03A9) // Remaining Greek uppercase letters | 
| 901 |  | -        | (ch >= 0x03B1 & ch <= 0x03C9); // Greek lowercase letters | 
| 902 |  | -  } | 
| 903 |  | - | 
| 904 |  | -  // Check if char is Unicode Category "Combining spacing mark (Mc)" | 
| 905 |  | -  // This list is not exhaustive! | 
| 906 |  | -  @SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code | 
| 907 |  | -  private static boolean isCombiningMark(char ch) { | 
| 908 |  | -    return ( | 
| 909 |  | -      // 0300-036F | 
| 910 |  | -      (0x0300 <= ch & ch <= 0x036F) | | 
| 911 |  | -      // 1AB0–1AFF | 
| 912 |  | -      (0x1AB0 <= ch & ch <= 0x1AFF) | | 
| 913 |  | -      // 1DC0–1DFF | 
| 914 |  | -      (0x1DC0 <= ch & ch <= 0x1DFF) | | 
| 915 |  | -      // 20D0–20FF | 
| 916 |  | -      (0x20D0 <= ch & ch <= 0x20FF) | | 
| 917 |  | -      // FE20–FE2F | 
| 918 |  | -      (0xFE20 <= ch & ch <= 0xFE2F) | 
| 919 |  | -    ); | 
| 920 |  | -    // TODO (ctjl): Implement in a more reliable and future-proofed way, i.e.: | 
| 921 |  | -    // return Character.getType(ch) == Character.NON_SPACING_MARK; | 
| 922 |  | -  } | 
| 923 |  | - | 
| 924 |  | -  // TODO (ctjl): Implement | 
| 925 |  | -  private static boolean isConnectorPunctuation() { | 
| 926 |  | -    return true; | 
|  | 877 | +    return UnicodeMatch.isJavascriptIdentifierStart(ch); | 
| 927 | 878 |   } | 
| 928 |  | - | 
| 929 |  | -  // TODO (ctjl): Implement | 
| 930 |  | -  private static boolean isZeroWidthJoiner() { | 
| 931 |  | -    return true; | 
| 932 |  | -  } | 
| 933 |  | - | 
| 934 |  | -  // TODO (ctjl): Implement | 
| 935 |  | -  private static boolean isZeroWidthNonJoiner() { | 
| 936 |  | -    return true; | 
| 937 |  | -  } | 
| 938 |  | - | 
| 939 |  | -  @SuppressWarnings("ShortCircuitBoolean") // Intentional to minimize branches in this code | 
|  | 879 | +   | 
|  | 880 | +  /** | 
|  | 881 | +   * Delegates to UnicodeMatch.isJavascriptIdentifierPart, which is expected to retain the old ASCII fast path (TODO confirm). | 
|  | 882 | +   */ | 
| 940 | 883 |   private static boolean isIdentifierPart(char ch) { | 
| 941 |  | -    /** | 
| 942 |  | -      https://www.ecma-international.org/ecma-262/5.1/#sec-7.6 | 
| 943 |  | -      IdentifierPart :: | 
| 944 |  | -        IdentifierStart | 
| 945 |  | -        ✓ isIdentifierPart() | 
| 946 |  | -
 | 
| 947 |  | -        UnicodeCombiningMark | 
| 948 |  | -        ✓ isCombiningMark() | 
| 949 |  | -
 | 
| 950 |  | -        UnicodeDigit | 
| 951 |  | -        ✓ Character.isDigit() | 
| 952 |  | -
 | 
| 953 |  | -        UnicodeConnectorPunctuation | 
| 954 |  | -        ✓ isConnectorPunctuation() | 
| 955 |  | -
 | 
| 956 |  | -        <ZWNJ> | 
| 957 |  | -        ✓ isZeroWidthNonJoiner() | 
| 958 |  | -           | 
| 959 |  | -        <ZWJ> | 
| 960 |  | -        ✓ isZeroWidthJoiner() | 
| 961 |  | -     */ | 
| 962 |  | - | 
| 963 |  | -    // Most code is written in pure ASCII, so create a fast path here. | 
| 964 |  | -    if (ch <= 127) { | 
| 965 |  | -      return ((ch >= 'A' & ch <= 'Z') | 
| 966 |  | -          | (ch >= 'a' & ch <= 'z') | 
| 967 |  | -          | (ch >= '0' & ch <= '9') | 
| 968 |  | -          | (ch == '_' | ch == '$')); | 
| 969 |  | -    } | 
| 970 |  | - | 
| 971 |  | -    // Handle non-ASCII characters. | 
| 972 |  | -    // TODO(tjgq): This should include all characters with the ID_Continue property, plus | 
| 973 |  | -    // TODO(ctjl): Implement remaining grammar (zero-width joiners, etc.) | 
| 974 |  | -    return isIdentifierStart(ch) | 
| 975 |  | -        || isCombiningMark(ch) | 
| 976 |  | -        || Character.isDigit(ch); | 
|  | 884 | +    return UnicodeMatch.isJavascriptIdentifierPart(ch); | 
| 977 | 885 |   } | 
| 978 | 886 | 
 | 
| 979 | 887 |   private Token scanStringLiteral(int beginIndex, char terminator) { | 
|  | 
0 commit comments