1515#include < libunicode/codepoint_properties_loader.h>
1616#include < libunicode/convert.h>
1717#include < libunicode/grapheme_segmenter.h>
18+ #include < libunicode/run_segmenter.h>
1819#include < libunicode/ucd.h>
1920#include < libunicode/ucd_enums.h>
21+ #include < libunicode/ucd_fmt.h>
2022#include < libunicode/ucd_ostream.h>
2123#include < libunicode/utf8_grapheme_segmenter.h>
2224
2830#include < sstream>
2931#include < string>
3032
33+ #if !defined(_WIN32)
34+ #include < unistd.h>
35+ #endif
36+
3137using namespace std ;
3238
3339namespace
3440{
3541
36- std::string quotedAndEscaped (std::string const & text)
/// Replaces ASCII control characters (bytes below 0x20) with a `\xNN` hex escape and
/// copies every other byte through unchanged, so multi-byte UTF-8 sequences stay intact.
///
/// @param text  the byte string to sanitize.
/// @return a copy of @p text in which each control byte is spelled as backslash, 'x'
///         and two lowercase hex digits.
std::string escapeControlCodes(std::string const& text)
{
    auto result = std::stringstream {};
    // Both manipulators are sticky; the fill character only takes effect where a
    // width is set, i.e. on the two-digit hex value below.
    result << std::hex << std::setfill('0');
    for (char const ch: text)
    {
        // Compare as unsigned: where `char` is signed, bytes >= 0x80 are negative and
        // would otherwise be misclassified as control codes.
        auto const byte = static_cast<unsigned char>(ch);
        if (byte < 0x20)
            result << "\\x" << std::setw(2) << static_cast<unsigned>(byte);
        else
            result << ch;
    }
    return result.str();
}
54+
/// Escapes everything that is not a printable character, as well as the double quote,
/// using a `\xNN` hex escape.
///
/// @param text  the byte string to escape.
/// @return a copy of @p text in which every byte rejected by std::isprint, and every
///         `"`, is spelled as backslash, 'x' and two lowercase hex digits.
std::string escaped(std::string const& text)
{
    auto result = std::stringstream {};
    // Both manipulators are sticky; the fill character only takes effect where a
    // width is set, i.e. on the two-digit hex value below.
    result << std::hex << std::setfill('0');
    for (char const ch: text)
    {
        // std::isprint requires a value representable as unsigned char; handing it a
        // negative plain `char` (any byte >= 0x80 on signed-char targets) is undefined.
        auto const byte = static_cast<unsigned char>(ch);
        if (std::isprint(byte) && ch != '"')
            result << ch;
        else
            result << "\\x" << std::setw(2) << static_cast<unsigned>(byte);
    }
    return result.str();
}
5067
68+ std::string quotedAndEscaped (std::string const & text)
69+ {
70+ return ' "' + escaped (text) + ' "' ;
71+ }
72+
/// Writes the command line synopsis to standard output.
///
/// @param exitCode  value handed straight back to the caller, so call sites can
///                  write `return printUsage(code);`.
/// @return @p exitCode, unchanged.
int printUsage(int exitCode)
{
    // Adjacent string literals are merged by the compiler into a single synopsis.
    constexpr auto synopsis = "unicode-query [properties] U+XXXX [...]\n"
                              "              gc [-e] [--] \"Text string\"\n"
                              "              runs [-e] [--] \"Text string\"\n";
    std::cout << synopsis;
    return exitCode;
}
5680
/// Passes @p text through only when standard output is a terminal.
///
/// The isatty() check runs once, on the first call, and its result is reused afterwards.
/// On Windows builds the result is always "not a terminal".
///
/// @param text  the character sequence to emit conditionally.
/// @return @p text if stdout is a terminal, otherwise an empty view.
std::string_view seq(std::string_view const& text)
{
#if !defined(_WIN32)
    static bool const stdoutIsTerminal = isatty(STDOUT_FILENO) != 0;
#else
    static bool const stdoutIsTerminal = false;
#endif
    return stdoutIsTerminal ? text : std::string_view {};
}
96+
97+ // {{{ properties
5798optional<char32_t > parseChar (std::string_view text)
5899{
59100 if (text.size () >= 3 && text[0 ] == ' U' && text[1 ] == ' +' )
@@ -116,7 +157,7 @@ void showCodepointProperties(char32_t codepoint)
116157 cout << " Emoji Segmentation Category : " << properties.emoji_segmentation_category << ' \n ' ;
117158 cout << " Grapheme Cluster Break : " << properties.grapheme_cluster_break << ' \n ' ;
118159 cout << " \n " ;
119- // clang-format off
160+ // clang-format on
120161}
121162
122163int showCodepointProperties (int argc, char const * argv[])
@@ -134,7 +175,99 @@ int showCodepointProperties(int argc, char const* argv[])
134175 }
135176 return EXIT_SUCCESS;
136177}
178+ // }}}
179+
180+ // {{{ grapheme clusters
181+ int showGraphemeClusters (int argc, char const * argv[])
182+ {
183+ int i = 0 ;
184+ bool escapeText = false ;
185+ for (; i < argc; ++i)
186+ {
187+ auto const arg = string_view (argv[i]);
188+ if (arg == " -e" )
189+ escapeText = true ;
190+ else if (arg == " --" )
191+ {
192+ ++i;
193+ break ;
194+ }
195+ else if (arg.starts_with (' -' ))
196+ return printUsage (EXIT_FAILURE);
197+ else
198+ break ;
199+ }
200+ for (; i < argc; ++i)
201+ {
202+ auto const text = string_view (argv[i]);
203+ auto const gcs = unicode::utf8_grapheme_segmenter (text);
204+ for (auto const & gc: gcs)
205+ {
206+ auto const text32 = std::u32string_view (gc);
207+ auto const text8 = unicode::convert_to<char >(text32);
208+ std::cout << (escapeText ? escaped (text8) : escapeControlCodes (text8)) << " \n " ;
209+ }
210+ }
211+ return EXIT_SUCCESS;
212+ }
213+ // }}}
214+
215+ // {{{ runs
216+ int showRuns (istream& in, bool escapeRunText)
217+ {
218+ string bytes ((istreambuf_iterator<char >(in)), istreambuf_iterator<char >());
219+ u32string const codepoints = unicode::convert_to<char32_t >(string_view (bytes));
220+
221+ unicode::run_segmenter rs (codepoints);
222+ unicode::run_segmenter::range run;
223+
224+ while (rs.consume (unicode::out (run)))
225+ {
226+ auto const script = get<unicode::Script>(run.properties );
227+ auto const presentationStyle = get<unicode::PresentationStyle>(run.properties );
228+
229+ auto const text32 = u32string_view (codepoints.data () + run.start , run.end - run.start );
230+ auto const text8 = unicode::convert_to<char >(text32);
231+ auto const textEscaped = escapeRunText ? escaped (text8) : escapeControlCodes (text8);
232+
233+ cout << run.start << " -" << run.end - 1 << " (" << run.end - run.start << " ): " << script << " " << presentationStyle
234+ << " \n "
235+ << ' "' << seq (" \033 [32m" ) << textEscaped << seq (" \033 [m" ) << " \"\n\n " ;
236+ }
237+
238+ return EXIT_SUCCESS;
239+ }
240+
241+ int showRuns (int argc, char const * argv[])
242+ {
243+ // [-e]
244+ int i = 0 ;
245+ bool escaped = false ;
246+ for (; i < argc; ++i)
247+ {
248+ auto const arg = string_view (argv[i]);
249+ if (arg == " -e" )
250+ escaped = true ;
251+ else if (arg == " --" )
252+ {
253+ ++i;
254+ break ;
255+ }
256+ else if (arg.starts_with (' -' ))
257+ return printUsage (EXIT_FAILURE);
258+ else
259+ break ;
260+ }
261+
262+ for (; i < argc; ++i)
263+ {
264+ auto in = std::istringstream (argv[i]);
265+ showRuns (in, escaped);
266+ }
137267
268+ return EXIT_SUCCESS;
269+ }
270+ // }}}
138271} // namespace
139272
140273// Example usage:
@@ -154,6 +287,18 @@ int main(int argc, char const* argv[])
154287 if (string_view (argv[argIndex]) == " help" )
155288 return printUsage (EXIT_SUCCESS);
156289
290+ if (string_view (argv[argIndex]) == " runs" )
291+ {
292+ ++argIndex;
293+ return showRuns (argc - argIndex, argv + argIndex);
294+ }
295+
296+ if (string_view (argv[argIndex]) == " gc" )
297+ {
298+ ++argIndex;
299+ return showGraphemeClusters (argc - argIndex, argv + argIndex);
300+ }
301+
157302 if (string_view (argv[argIndex]) == " properties" )
158303 ++argIndex;
159304
0 commit comments