jquast · jquast · Feb 14, 2024 · Feb 14, 2024 · Feb 14, 2024 · Sep 17, 2025
diff --git a/bin/update-tables.py b/bin/update-tables.py
@@ -68,6 +68,8 @@
     *range(0xD7B0, 0xD800),  # Hangul Jungseong O-Yeo  .. Undefined Character of Hangul Jamo Extended-B
 )
 
+HEX_STR_VS15 = 'FE0E'  # variation selector-15 (U+FE0E), as a hex string
+HEX_STR_VS16 = 'FE0F'  # variation selector-16 (U+FE0F), as a hex string
 
 def _bisearch(ucs, table):
     """A copy of wcwwidth._bisearch, to prevent having issues when depending on code that imports
@@ -439,13 +441,15 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
 
     # parse table formatted by the latest emoji release (developed with
     # 15.1.0) and parse a single file for all individual releases
-    table[unicode_version] = parse_vs16_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
-                                             ubound_unicode_version=unicode_version)
+    table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
+                                           ubound_unicode_version=unicode_version,
+                                           hex_str_vs=HEX_STR_VS16)
 
     # parse and join the final emoji release 12.0 of the earlier "type"
     table[unicode_version].values.update(
-        parse_vs16_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
-                        ubound_unicode_version=unicode_version).values)
+        parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
+                      ubound_unicode_version=unicode_version,
+                      hex_str_vs=HEX_STR_VS16).values)
 
     # perform culling on any values that are already understood as 'wide'
     # without the variation-16 selector
@@ -458,16 +462,60 @@ def fetch_table_vs16_data() -> UnicodeTableRenderCtx:
     return UnicodeTableRenderCtx('VS16_NARROW_TO_WIDE', table)
 
 
-def parse_vs16_data(fname: str, ubound_unicode_version: UnicodeVersion):
+def parse_vs_data(fname: str, ubound_unicode_version: UnicodeVersion, hex_str_vs: str):
     with open(fname, encoding='utf-8') as fin:
-        table_iter = parse_vs16_table(fin)
+        table_iter = parse_vs_table(fin, hex_str_vs)
         # pull "date string"
         date = next(table_iter).comment.split(':', 1)[1].strip()
         # pull values only matching this unicode version and lower
         values = {entry.code_range[0] for entry in table_iter}
     return TableDef(ubound_unicode_version, date, values)
 
 
+def fetch_table_vs15_data() -> UnicodeTableRenderCtx:
+    """
+    Fetch and create a "wide to narrow variation-15" lookup table.
+
+    Characters in this table are wide, but when combined with a variation selector-15 (U+FE0E), they
+    become narrow, for the given versions of unicode.
+
+    UNICODE_VERSION=9.0.0 or greater is required to enable detection of the effect of *any*
+    'variation selector-15' wide emoji becoming narrow.
+
+    Some terminals display U+231A, U+FE0E as a narrow font, but consuming a wide cell (iTerm2),
+    while most others display it as a wide cell, only.
+
+    It is fair to call these ambiguous, see related 'ucs-detect' project.
+    """
+    table: dict[UnicodeVersion, TableDef] = {}
+    unicode_latest = fetch_unicode_versions()[-1]
+
+    wide_tables = fetch_table_wide_data().table
+    unicode_version = UnicodeVersion.parse('9.0.0')
+
+    # parse table formatted by the latest emoji release (developed with
+    # 15.1.0) and parse a single file for all individual releases
+    table[unicode_version] = parse_vs_data(fname=UnicodeDataFile.EmojiVariationSequences(unicode_latest),
+                                           ubound_unicode_version=unicode_version,
+                                           hex_str_vs=HEX_STR_VS15)
+
+    # parse and join the final emoji release 12.0 of the earlier "type"
+    table[unicode_version].values.update(
+        parse_vs_data(fname=UnicodeDataFile.LegacyEmojiVariationSequences(),
+                      ubound_unicode_version=unicode_version,
+                      hex_str_vs=HEX_STR_VS15).values)
+
+    # keep only the values found in this version's wide table; any value already
+    # understood as 'narrow' without the variation-15 selector is culled
+    wide_table = wide_tables[unicode_version].as_value_ranges()
+    table[unicode_version].values = {
+        ucs for ucs in table[unicode_version].values
+        if _bisearch(ucs, wide_table)
+    }
+
+    return UnicodeTableRenderCtx('VS15_WIDE_TO_NARROW', table)
+
+
 def cite_source_description(filename: str) -> tuple[str, str]:
     """Return unicode.org source data file's own description as citation."""
     with open(filename, encoding='utf-8') as f:
@@ -512,9 +560,8 @@ def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
         yield TableEntry(code_range, tuple(properties), comment)
 
 
-def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
-    """Parse emoji-variation-sequences.txt for codepoints that precede 0xFE0F."""
-    hex_str_vs16 = 'FE0F'
+def parse_vs_table(fp: Iterable[str], hex_str_vs: str = 'FE0F') -> Iterator[TableEntry]:
+    """Parse emoji-variation-sequences.txt for codepoints that precede `hex_str_vs`."""
     for line in fp:
         data, _, comment = line.partition('#')
         data_fields: Iterator[str] = (field.strip() for field in data.split(';'))
@@ -526,8 +573,8 @@ def parse_vs16_table(fp: Iterable[str]) -> Iterator[TableEntry]:
                 yield TableEntry(None, tuple(properties), comment)
             continue
         code_points = code_points_str.split()
-        if len(code_points) == 2 and code_points[1] == hex_str_vs16:
-            # yield a single "code range" entry for a single value that precedes FE0F
+        if len(code_points) == 2 and code_points[1] == hex_str_vs:
+            # yield a single "code range" entry for a single value that precedes hex_str_vs
             yield TableEntry((int(code_points[0], 16), int(code_points[0], 16)), tuple(properties), comment)
 
 
@@ -690,7 +737,7 @@ def replace_if_modified(new_filename: str, original_filename: str) -> None:
         significant_changes = False
         for line in diff_lines:
             if (line.startswith(('@@', '---', '+++')) or
-                (line.startswith(('-','+')) and 'This code generated' in line)):
+                    (line.startswith(('-', '+')) and 'This code generated' in line)):
                 continue
             else:
                 significant_changes = line.startswith(('-', '+'))
@@ -717,6 +764,7 @@ def get_codegen_definitions() -> Iterator[RenderDefinition]:
             UnicodeVersionPyRenderCtx(fetch_unicode_versions())
         )
         yield UnicodeTableRenderDef.new('table_vs16.py', fetch_table_vs16_data())
+        yield UnicodeTableRenderDef.new('table_vs15.py', fetch_table_vs15_data())
         yield UnicodeTableRenderDef.new('table_wide.py', fetch_table_wide_data())
         yield UnicodeTableRenderDef.new('table_zero.py', fetch_table_zero_data())
         yield UnicodeVersionRstRenderDef.new(fetch_source_headers())

diff --git a/bin/verify-table-integrity.py b/bin/verify-table-integrity.py
@@ -64,9 +64,7 @@
 
 
 def bisearch_pair(ucs, table):
-    """
-    A copy of wcwidth._bisearch() but also returns the range of matched values.
-    """
+    """A copy of wcwidth._bisearch() but also returns the range of matched values."""
     lbound = 0
     ubound = len(table) - 1