Skip to content

Commit 5914df6

Browse files
Merge pull request #97 from contour-terminal/feature/better-unicode-query
[unicode-query] add `gc` and `runs` sub command to introspect grapheme clusters and runs
2 parents 8103f23 + a6cbd7b commit 5914df6

File tree

2 files changed

+151
-6
lines changed

2 files changed

+151
-6
lines changed

src/tools/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
if(LIBUNICODE_TOOLS)
22
add_executable(unicode-query unicode-query.cpp)
3-
target_link_libraries(unicode-query unicode)
3+
target_link_libraries(unicode-query unicode fmt::fmt-header-only)
44
if(LIBUNICODE_BUILD_STATIC)
55
target_link_libraries(unicode-query "-static")
66
endif()

src/tools/unicode-query.cpp

Lines changed: 150 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@
1515
#include <libunicode/codepoint_properties_loader.h>
1616
#include <libunicode/convert.h>
1717
#include <libunicode/grapheme_segmenter.h>
18+
#include <libunicode/run_segmenter.h>
1819
#include <libunicode/ucd.h>
1920
#include <libunicode/ucd_enums.h>
21+
#include <libunicode/ucd_fmt.h>
2022
#include <libunicode/ucd_ostream.h>
2123
#include <libunicode/utf8_grapheme_segmenter.h>
2224

@@ -28,32 +30,71 @@
2830
#include <sstream>
2931
#include <string>
3032

33+
#if !defined(_WIN32)
34+
#include <unistd.h>
35+
#endif
36+
3137
using namespace std;
3238

3339
namespace
3440
{
3541

36-
std::string quotedAndEscaped(std::string const& text)
42+
// Returns a copy of `text` in which every byte below 0x20 (the C0 control
// codes) is replaced by a `\xNN` escape with two lowercase hex digits.
//
// All other bytes are copied through unchanged, including bytes >= 0x80, so
// multi-byte UTF-8 sequences stay intact.
std::string escapeControlCodes(std::string const& text)
{
    auto result = std::stringstream {};
    // Both manipulators are sticky; the fill only matters where setw() is applied below.
    result << std::hex << std::setfill('0');
    for (char const ch: text)
    {
        // Compare as unsigned char: where plain char is signed, bytes >= 0x80
        // are negative and would otherwise be treated as control codes.
        auto const byte = static_cast<unsigned char>(ch);
        if (byte < 0x20)
            result << "\\x" << std::setw(2) << static_cast<unsigned>(byte);
        else
            result << ch;
    }
    return result.str();
}
54+
55+
// Returns a copy of `text` in which every byte that is not printable
// according to std::isprint, as well as the double quote character, is
// replaced by a `\xNN` escape with two lowercase hex digits.
//
// Printable bytes other than '"' are copied through unchanged. No
// surrounding quotes are added.
std::string escaped(std::string const& text)
{
    auto result = std::stringstream {};
    // Both manipulators are sticky; the fill only matters where setw() is applied below.
    result << std::hex << std::setfill('0');
    for (char const ch: text)
    {
        // std::isprint requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behaviour.
        auto const byte = static_cast<unsigned char>(ch);
        if (std::isprint(byte) && ch != '"')
            result << ch;
        else
            result << "\\x" << std::setw(2) << static_cast<unsigned>(byte);
    }
    return result.str();
}
5067

68+
std::string quotedAndEscaped(std::string const& text)
69+
{
70+
return '"' + escaped(text) + '"';
71+
}
72+
5173
// Writes the command line synopsis to standard output and hands back
// `exitCode` unchanged, so callers can write `return printUsage(code);`.
int printUsage(int exitCode)
{
    // Adjacent literals are concatenated by the compiler into one string.
    std::cout << "unicode-query [properties] U+XXXX [...]\n"
                 " gc [-e] [--] \"Text string\"\n"
                 " runs [-e] [--] \"Text string\"\n";
    return exitCode;
}
5680

81+
// Returns `text` when standard output is a terminal according to isatty(),
// and an empty view otherwise. On Windows builds the result is always empty.
//
// The terminal check runs once, on the first call, and is cached afterwards.
std::string_view seq(std::string_view const& text)
{
#if !defined(_WIN32)
    static const bool stdoutIsTerminal = isatty(STDOUT_FILENO);
#else
    static const bool stdoutIsTerminal = false;
#endif
    return stdoutIsTerminal ? text : std::string_view {};
}
96+
97+
// {{{ properties
5798
optional<char32_t> parseChar(std::string_view text)
5899
{
59100
if (text.size() >= 3 && text[0] == 'U' && text[1] == '+')
@@ -116,7 +157,7 @@ void showCodepointProperties(char32_t codepoint)
116157
cout << "Emoji Segmentation Category : " << properties.emoji_segmentation_category << '\n';
117158
cout << "Grapheme Cluster Break : " << properties.grapheme_cluster_break << '\n';
118159
cout << "\n";
119-
// clang-format off
160+
// clang-format on
120161
}
121162

122163
int showCodepointProperties(int argc, char const* argv[])
@@ -134,7 +175,99 @@ int showCodepointProperties(int argc, char const* argv[])
134175
}
135176
return EXIT_SUCCESS;
136177
}
178+
// }}}
179+
180+
// {{{ grapheme clusters
181+
int showGraphemeClusters(int argc, char const* argv[])
182+
{
183+
int i = 0;
184+
bool escapeText = false;
185+
for (; i < argc; ++i)
186+
{
187+
auto const arg = string_view(argv[i]);
188+
if (arg == "-e")
189+
escapeText = true;
190+
else if (arg == "--")
191+
{
192+
++i;
193+
break;
194+
}
195+
else if (arg.starts_with('-'))
196+
return printUsage(EXIT_FAILURE);
197+
else
198+
break;
199+
}
200+
for (; i < argc; ++i)
201+
{
202+
auto const text = string_view(argv[i]);
203+
auto const gcs = unicode::utf8_grapheme_segmenter(text);
204+
for (auto const& gc: gcs)
205+
{
206+
auto const text32 = std::u32string_view(gc);
207+
auto const text8 = unicode::convert_to<char>(text32);
208+
std::cout << (escapeText ? escaped(text8) : escapeControlCodes(text8)) << "\n";
209+
}
210+
}
211+
return EXIT_SUCCESS;
212+
}
213+
// }}}
214+
215+
// {{{ runs
216+
int showRuns(istream& in, bool escapeRunText)
217+
{
218+
string bytes((istreambuf_iterator<char>(in)), istreambuf_iterator<char>());
219+
u32string const codepoints = unicode::convert_to<char32_t>(string_view(bytes));
220+
221+
unicode::run_segmenter rs(codepoints);
222+
unicode::run_segmenter::range run;
223+
224+
while (rs.consume(unicode::out(run)))
225+
{
226+
auto const script = get<unicode::Script>(run.properties);
227+
auto const presentationStyle = get<unicode::PresentationStyle>(run.properties);
228+
229+
auto const text32 = u32string_view(codepoints.data() + run.start, run.end - run.start);
230+
auto const text8 = unicode::convert_to<char>(text32);
231+
auto const textEscaped = escapeRunText ? escaped(text8) : escapeControlCodes(text8);
232+
233+
cout << run.start << "-" << run.end - 1 << " (" << run.end - run.start << "): " << script << " " << presentationStyle
234+
<< "\n"
235+
<< '"' << seq("\033[32m") << textEscaped << seq("\033[m") << "\"\n\n";
236+
}
237+
238+
return EXIT_SUCCESS;
239+
}
240+
241+
int showRuns(int argc, char const* argv[])
242+
{
243+
// [-e]
244+
int i = 0;
245+
bool escaped = false;
246+
for (; i < argc; ++i)
247+
{
248+
auto const arg = string_view(argv[i]);
249+
if (arg == "-e")
250+
escaped = true;
251+
else if (arg == "--")
252+
{
253+
++i;
254+
break;
255+
}
256+
else if (arg.starts_with('-'))
257+
return printUsage(EXIT_FAILURE);
258+
else
259+
break;
260+
}
261+
262+
for (; i < argc; ++i)
263+
{
264+
auto in = std::istringstream(argv[i]);
265+
showRuns(in, escaped);
266+
}
137267

268+
return EXIT_SUCCESS;
269+
}
270+
// }}}
138271
} // namespace
139272

140273
// Example usage:
@@ -154,6 +287,18 @@ int main(int argc, char const* argv[])
154287
if (string_view(argv[argIndex]) == "help")
155288
return printUsage(EXIT_SUCCESS);
156289

290+
if (string_view(argv[argIndex]) == "runs")
291+
{
292+
++argIndex;
293+
return showRuns(argc - argIndex, argv + argIndex);
294+
}
295+
296+
if (string_view(argv[argIndex]) == "gc")
297+
{
298+
++argIndex;
299+
return showGraphemeClusters(argc - argIndex, argv + argIndex);
300+
}
301+
157302
if (string_view(argv[argIndex]) == "properties")
158303
++argIndex;
159304

0 commit comments

Comments
 (0)