From 375e9cf476c64008f795a2544e2ed248eaea160c Mon Sep 17 00:00:00 2001 From: kiddos Date: Wed, 28 May 2025 01:17:37 +0800 Subject: [PATCH 1/4] Add StringUtils.truncateToByteLength --- .../org/apache/commons/lang3/StringUtils.java | 25 +++++++++++++++++++ .../apache/commons/lang3/StringUtilsTest.java | 12 +++++++++ 2 files changed, 37 insertions(+) diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index ab35a44c7f8..e99371688cd 100644 --- a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -8872,6 +8872,31 @@ public static String truncate(final String str, final int maxWidth) { return truncate(str, 0, maxWidth); } + public static String truncateToByteLength(String str, int maxBytes, Charset charset) { + if (str == null) { + return null; + } + + byte[] bytes = StringUtils.getBytes(str, charset); + if (bytes.length <= maxBytes) { + return str; + } + + // Binary search or iterative approach to find the right character length + int low = 0; + int high = str.length(); + while (low < high) { + int mid = (low + high + 1) / 2; + if (str.substring(0, mid).getBytes(charset).length <= maxBytes) { + low = mid; + } else { + high = mid - 1; + } + } + + return str.substring(0, low); + } + /** * Truncates a String. This will turn * "Now is the time for all good men" into "is the time for all". 
diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index 6482182bd7b..9c9ec685d4c 100644 --- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -3089,6 +3089,18 @@ public void testTruncate_StringIntInt() { assertEquals("", StringUtils.truncate("abcdefghijklmno", Integer.MAX_VALUE, Integer.MAX_VALUE)); } + @Test + public void testTruncateToByteLength() { + assertNull(StringUtils.truncateToByteLength(null, 0, Charset.defaultCharset())); + assertEquals("abcdefghij", StringUtils.truncateToByteLength("abcdefghijklmno", 10, Charset.defaultCharset())); + assertEquals("abcdefghijklmno", StringUtils.truncateToByteLength("abcdefghijklmno", 15, Charset.defaultCharset())); + assertEquals("abcdefghijklmno", StringUtils.truncateToByteLength("abcdefghijklmno", 20, Charset.defaultCharset())); + assertEquals("\u4F60\u597D\u55CE", StringUtils.truncateToByteLength("\u4F60\u597D\u55CE", 10, Charset.defaultCharset())); + assertEquals("\u4F60", StringUtils.truncateToByteLength("\u4F60\u597D\u55CE", 5, Charset.defaultCharset())); + assertEquals("\u2713\u2714", StringUtils.truncateToByteLength("\u2713\u2714", 6, Charset.defaultCharset())); + assertEquals("", StringUtils.truncateToByteLength("\u2713\u2714", 2, Charset.defaultCharset())); + } + @Test public void testUnCapitalize() { assertNull(StringUtils.uncapitalize(null)); From b12a9ee875dc25c63671525d50eb1bfec49fdc5c Mon Sep 17 00:00:00 2001 From: kiddos Date: Thu, 29 May 2025 01:36:57 +0800 Subject: [PATCH 2/4] fix case for emojis --- .../org/apache/commons/lang3/StringUtils.java | 17 +++++++++++------ .../apache/commons/lang3/StringUtilsTest.java | 3 +++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index e99371688cd..86d59c45ff7 100644 --- 
a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -8884,17 +8884,22 @@ public static String truncateToByteLength(String str, int maxBytes, Charset char // Binary search or iterative approach to find the right character length int low = 0; - int high = str.length(); - while (low < high) { - int mid = (low + high + 1) / 2; - if (str.substring(0, mid).getBytes(charset).length <= maxBytes) { - low = mid; + int high = str.codePointCount(0, str.length()); + int count = 0; + while (low <= high) { + int mid = low + (high - low) / 2; + int charIndex = str.offsetByCodePoints(0, mid); + byte[] currentBytes = StringUtils.getBytes(str.substring(0, charIndex), charset); + if (currentBytes.length <= maxBytes) { + low = mid + 1; + count = mid; } else { high = mid - 1; } } - return str.substring(0, low); + int idx = str.offsetByCodePoints(0, count); + return str.substring(0, idx); } /** diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index 9c9ec685d4c..18969a42fac 100644 --- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -3099,6 +3099,9 @@ public void testTruncateToByteLength() { assertEquals("\u4F60", StringUtils.truncateToByteLength("\u4F60\u597D\u55CE", 5, Charset.defaultCharset())); assertEquals("\u2713\u2714", StringUtils.truncateToByteLength("\u2713\u2714", 6, Charset.defaultCharset())); assertEquals("", StringUtils.truncateToByteLength("\u2713\u2714", 2, Charset.defaultCharset())); + assertEquals("\uD83D\uDE80", StringUtils.truncateToByteLength("\uD83D\uDE80\u2728\uD83C\uDF89", 6, Charset.defaultCharset())); + assertEquals("", StringUtils.truncateToByteLength("\uD83D\uDE80\u2728\uD83C\uDF89", 3, Charset.defaultCharset())); + assertEquals("", StringUtils.truncateToByteLength("\uD83D\uDE03", 3, Charset.defaultCharset())); } @Test From 
4ba9a41b722f81c286bcd1ffbbdb794b31bc82e0 Mon Sep 17 00:00:00 2001 From: kiddos Date: Sun, 8 Jun 2025 22:34:10 +0800 Subject: [PATCH 3/4] add test cases --- .../java/org/apache/commons/lang3/StringUtilsTest.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index 18969a42fac..dbdf9540a79 100644 --- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -3102,6 +3102,15 @@ public void testTruncateToByteLength() { assertEquals("\uD83D\uDE80", StringUtils.truncateToByteLength("\uD83D\uDE80\u2728\uD83C\uDF89", 6, Charset.defaultCharset())); assertEquals("", StringUtils.truncateToByteLength("\uD83D\uDE80\u2728\uD83C\uDF89", 3, Charset.defaultCharset())); assertEquals("", StringUtils.truncateToByteLength("\uD83D\uDE03", 3, Charset.defaultCharset())); + assertEquals("\uD83D\uDE03", StringUtils.truncateToByteLength("\uD83D\uDE03", 4, Charset.defaultCharset())); + assertEquals("\uD83D\uDE03\uD83D\uDE03", StringUtils.truncateToByteLength( + "\uD83D\uDE03\uD83D\uDE03\uD83D\uDE03\uD83D\uDE03\uD83D\uDE03", 9, Charset.defaultCharset())); + + for (int i = 0; i < 100; ++i) { + String s = StringUtils.truncateToByteLength("🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊", i, Charset.defaultCharset()); + byte[] data = s.getBytes(); + assertTrue(data.length <= i); + } } @Test From e9a1a31120ae2fe0a43b9f0d4b6eccff900db183 Mon Sep 17 00:00:00 2001 From: kiddos Date: Sun, 29 Jun 2025 23:23:30 +0800 Subject: [PATCH 4/4] case with grapheme cluster only check if output bytes is actually smaller than or equal to expected and not null --- src/test/java/org/apache/commons/lang3/StringUtilsTest.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index a8d4ff26f33..08ac89d2058 100644 --- 
a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -3108,6 +3108,7 @@ void testTruncateToByteLength() { for (int i = 0; i < 100; ++i) { String s = StringUtils.truncateToByteLength("🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊", i, Charset.defaultCharset()); + assertNotNull(s); byte[] data = s.getBytes(); assertTrue(data.length <= i); }