Skip to content

Commit 3bf3a70

Browse files
committed
String/Encoding: toUtf8
1 parent c8e0644 commit 3bf3a70

File tree

5 files changed

+231
-2
lines changed

5 files changed

+231
-2
lines changed

modules/String/Encoding.mpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
export module CppUtils.String.Encoding;
2+
3+
import std;
4+
5+
export namespace CppUtils::String
6+
{
7+
/// @brief Appends the UTF-8 encoding of one code point to a string.
///
/// Values that are not Unicode scalar values (the surrogate range
/// U+D800..U+DFFF and anything above U+10FFFF) have no well-formed UTF-8
/// encoding. They are replaced by U+FFFD REPLACEMENT CHARACTER so the output
/// is always valid UTF-8 and invalid input stays visible instead of being
/// dropped or emitted as an ill-formed sequence.
///
/// @param codePoint The code point to encode.
/// @param string    Buffer to append to; defaults to an empty string.
/// @return The buffer followed by the 1 to 4 bytes encoding `codePoint`.
inline auto toUtf8(char32_t codePoint, std::string&& string = {}) -> std::string
{
	const auto isSurrogate = (codePoint >= 0xD8'00 && codePoint <= 0xDF'FF);
	const auto isOutOfRange = (codePoint > 0x10'FF'FF);
	if (isSurrogate || isOutOfRange)
	{
		codePoint = 0xFF'FD; // U+FFFD, encoded below as EF BF BD
	}

	if (codePoint < 0x80)
	{
		// 0xxxxxxx
		string += static_cast<char>(codePoint);
	}
	else if (codePoint < 0x8'00)
	{
		// 110xxxxx 10xxxxxx
		string += static_cast<char>(0xC0 | (codePoint >> 6));
		string += static_cast<char>(0x80 | (codePoint & 0x3F));
	}
	else if (codePoint < 0x1'00'00)
	{
		// 1110xxxx 10xxxxxx 10xxxxxx
		string += static_cast<char>(0xE0 | (codePoint >> 12));
		string += static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
		string += static_cast<char>(0x80 | (codePoint & 0x3F));
	}
	else
	{
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		// Reaching here implies codePoint <= 0x10FFFF thanks to the check above.
		string += static_cast<char>(0xF0 | (codePoint >> 18));
		string += static_cast<char>(0x80 | ((codePoint >> 12) & 0x3F));
		string += static_cast<char>(0x80 | ((codePoint >> 6) & 0x3F));
		string += static_cast<char>(0x80 | (codePoint & 0x3F));
	}
	return string;
}
33+
34+
inline auto toUtf8(std::u32string_view unicodeString, std::string&& string = {}) -> std::string
35+
{
36+
for (const auto codePoint : unicodeString)
37+
string = toUtf8(codePoint, std::move(string));
38+
return string;
39+
}
40+
}

modules/String/String.mpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ export module CppUtils.String;
22

33
export import CppUtils.String.Concept;
44
export import CppUtils.String.Cursor;
5+
export import CppUtils.String.Encoding;
56
export import CppUtils.String.Hash;
67
export import CppUtils.String.HashTable;
78
export import CppUtils.String.Utility;

modules/Terminal/DynamicAreaBuffer.mpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export namespace CppUtils::Terminal
1414
using Buffer = std::vector<Line>;
1515

1616
inline DynamicAreaBuffer(const Container::Size2& size, CharAttributes defaultCharAttributes = {}):
17-
DynamicAreaBuffer{size, Buffer{size.height(), Line{size.width(), defaultCharAttributes}}}
17+
DynamicAreaBuffer{size, Buffer(size.height(), Line(size.width(), defaultCharAttributes))}
1818
{}
1919

2020
inline DynamicAreaBuffer(const Container::Size2& size, char defaultChar):
@@ -30,7 +30,7 @@ export namespace CppUtils::Terminal
3030
{
3131
auto lock = std::unique_lock{m_mutex};
3232
m_size = size;
33-
m_buffer = Buffer{m_size.height(), Line{m_size.width()}};
33+
m_buffer = Buffer(m_size.height(), Line(m_size.width()));
3434
}
3535

3636
[[nodiscard]] inline auto getSize() const noexcept -> Container::Size2 override

tests/String/Encoding.mpp

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
export module CppUtils.UnitTests.String.Encoding;
2+
3+
import std;
4+
import CppUtils;
5+
6+
export namespace CppUtils::UnitTest::String::Encoding
7+
{
8+
// Test suite "String/Encoding": exercises both toUtf8 overloads with
// characters whose UTF-8 encodings are 1, 2, 3 and 4 bytes long.
// NOTE(review): the {"Logger"} argument is presumably the list of suites this one depends on; confirm against TestSuite.
inline auto _ = TestSuite{"String/Encoding", {"Logger"}, [](auto& suite) {
	using namespace std::literals;
	using Logger = CppUtils::Logger<"CppUtils">;

	// Each case encodes the same character twice: once from its numeric value
	// and once from its character literal. Both must give the same bytes.
	suite.addTest("toUtf8(char32_t)", [&] {
		using CppUtils::String::toUtf8;

		// One-byte sequences (U+0000 to U+007F)
		{
			static_assert(char32_t{0x24} == U'$');

			const auto fromNumber = toUtf8(char32_t{0x24});
			Logger::print("toUtf8(0x24) -> {}\n", fromNumber);
			suite.expectEqual(fromNumber, "\x24");

			const auto fromLiteral = toUtf8(U'$');
			Logger::print("toUtf8(U'$') -> {}\n", fromLiteral);
			suite.expectEqual(fromLiteral, "\x24");
		}

		{
			static_assert(char32_t{0x41} == U'A');

			const auto fromNumber = toUtf8(char32_t{0x41});
			Logger::print("toUtf8(0x41) -> {}\n", fromNumber);
			suite.expectEqual(fromNumber, "A");

			const auto fromLiteral = toUtf8(U'A');
			Logger::print("toUtf8(U'A') -> {}\n", fromLiteral);
			suite.expectEqual(fromLiteral, "A");
		}

		// Two-byte sequences (U+0080 to U+07FF)
		{
			static_assert(char32_t{0xA2} == U'¢');

			const auto fromNumber = toUtf8(char32_t{0xA2});
			Logger::print("toUtf8(0xA2) -> {}\n", fromNumber);
			suite.expectEqual(fromNumber, "\xC2\xA2");

			const auto fromLiteral = toUtf8(U'¢');
			Logger::print("toUtf8(U'¢') -> {}\n", fromLiteral);
			suite.expectEqual(fromLiteral, "\xC2\xA2");
		}

		{
			static_assert(char32_t{0x3'B1} == U'α');

			const auto fromNumber = toUtf8(char32_t{0x3'B1});
			Logger::print("toUtf8(0x3B1) -> {}\n", fromNumber);
			suite.expectEqual(fromNumber, "\xCE\xB1");

			const auto fromLiteral = toUtf8(U'α');
			Logger::print("toUtf8(U'α') -> {}\n", fromLiteral);
			suite.expectEqual(fromLiteral, "\xCE\xB1");
		}

		// Three-byte sequences (U+0800 to U+FFFF)
		{
			static_assert(char32_t{0x20'AC} == U'€');

			const auto fromNumber = toUtf8(char32_t{0x20'AC});
			Logger::print("toUtf8(0x20AC) -> {}\n", fromNumber);
			suite.expectEqual(fromNumber, "\xE2\x82\xAC");

			const auto fromLiteral = toUtf8(U'€');
			Logger::print("toUtf8(U'€') -> {}\n", fromLiteral);
			suite.expectEqual(fromLiteral, "\xE2\x82\xAC");
		}

		{
			static_assert(char32_t{0x30'93} == U'ん');

			const auto fromNumber = toUtf8(char32_t{0x30'93});
			Logger::print("toUtf8(0x3093) -> {}\n", fromNumber);
			suite.expectEqual(fromNumber, "\xE3\x82\x93");

			const auto fromLiteral = toUtf8(U'ん');
			Logger::print("toUtf8(U'ん') -> {}\n", fromLiteral);
			suite.expectEqual(fromLiteral, "\xE3\x82\x93");
		}

		// Four-byte sequences (U+10000 to U+10FFFF)
		{
			static_assert(char32_t{0x1'03'48} == U'𐍈');

			const auto fromNumber = toUtf8(char32_t{0x1'03'48});
			Logger::print("toUtf8(0x10348) -> {}\n", fromNumber);
			suite.expectEqual(fromNumber, "\xF0\x90\x8D\x88");

			const auto fromLiteral = toUtf8(U'𐍈');
			Logger::print("toUtf8(U'𐍈') -> {}\n", fromLiteral);
			suite.expectEqual(fromLiteral, "\xF0\x90\x8D\x88");
		}

		{
			static_assert(char32_t{0x1'F6'00} == U'😀');

			const auto fromNumber = toUtf8(char32_t{0x1'F6'00});
			Logger::print("toUtf8(0x1F600) -> {}\n", fromNumber);
			suite.expectEqual(fromNumber, "\xF0\x9F\x98\x80");

			const auto fromLiteral = toUtf8(U'😀');
			Logger::print("toUtf8(U'😀') -> {}\n", fromLiteral);
			suite.expectEqual(fromLiteral, "\xF0\x9F\x98\x80");
		}
	});

	// Same characters through the string-view overload, first one at a time,
	// then combined into strings that mix encoded lengths.
	suite.addTest("toUtf8(std::u32string_view)", [&] {
		using CppUtils::String::toUtf8;

		// One-byte sequences
		{
			const auto encoded = toUtf8(U"$"sv);
			Logger::print("toUtf8(U\"$\") -> {}\n", encoded);
			suite.expectEqual(encoded, "\x24");
		}

		{
			const auto encoded = toUtf8(U"A"sv);
			Logger::print("toUtf8(U\"A\") -> {}\n", encoded);
			suite.expectEqual(encoded, "A");
		}

		// Two-byte sequences
		{
			const auto encoded = toUtf8(U"¢"sv);
			Logger::print("toUtf8(U\"¢\") -> {}\n", encoded);
			suite.expectEqual(encoded, "\xC2\xA2");
		}

		{
			const auto encoded = toUtf8(U"α"sv);
			Logger::print("toUtf8(U\"α\") -> {}\n", encoded);
			suite.expectEqual(encoded, "\xCE\xB1");
		}

		// Three-byte sequences
		{
			const auto encoded = toUtf8(U"€"sv);
			Logger::print("toUtf8(U\"€\") -> {}\n", encoded);
			suite.expectEqual(encoded, "\xE2\x82\xAC");
		}

		{
			const auto encoded = toUtf8(U"ん"sv);
			Logger::print("toUtf8(U\"ん\") -> {}\n", encoded);
			suite.expectEqual(encoded, "\xE3\x82\x93");
		}

		// Four-byte sequences
		{
			const auto encoded = toUtf8(U"𐍈"sv);
			Logger::print("toUtf8(U\"𐍈\") -> {}\n", encoded);
			suite.expectEqual(encoded, "\xF0\x90\x8D\x88");
		}

		{
			const auto encoded = toUtf8(U"😀"sv);
			Logger::print("toUtf8(U\"😀\") -> {}\n", encoded);
			suite.expectEqual(encoded, "\xF0\x9F\x98\x80");
		}

		// Multi-character strings
		{
			const auto encoded = toUtf8(U"Hello World!"sv);
			Logger::print("toUtf8(U\"Hello World!\") -> {}\n", encoded);
			suite.expectEqual(encoded, "Hello World!");
		}

		{
			const auto encoded = toUtf8(U"$A¢α€ん𐍈😀"sv);
			Logger::print("toUtf8(U\"$A¢α€ん𐍈😀\") -> {}\n", encoded);
			suite.expectEqual(encoded, "$A\xC2\xA2\xCE\xB1\xE2\x82\xAC\xE3\x82\x93\xF0\x90\x8D\x88\xF0\x9F\x98\x80");
		}

		{
			const auto encoded = toUtf8(U"A€😀"sv);
			Logger::print("toUtf8(U\"A€😀\") -> {}\n", encoded);
			suite.expectEqual(encoded, "A\xE2\x82\xAC\xF0\x9F\x98\x80");
		}
	});
}};
187+
}

tests/UnitTests.mpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ export import CppUtils.UnitTests.Math.Random;
3232
export import CppUtils.UnitTests.Memory;
3333
export import CppUtils.UnitTests.Network;
3434
export import CppUtils.UnitTests.Stl.Format;
35+
export import CppUtils.UnitTests.String.Encoding;
3536
export import CppUtils.UnitTests.String.Utility;
3637
export import CppUtils.UnitTests.System.Error;
3738
export import CppUtils.UnitTests.Terminal.Canvas;

0 commit comments

Comments
 (0)