@@ -52,35 +52,55 @@ defmodule String.Unicode do
5252 end
5353 end
5454
# Handle Regional
# One clause is generated per codepoint of the "Regional_Indicator" cluster.
# It matches that codepoint at the head of the binary and forwards the
# remainder, together with the codepoint's byte size, to next_regional_size/2.
for indicator <- cluster["Regional_Indicator"] do
  width = byte_size(indicator)

  def next_grapheme_size(<<unquote(indicator), rest::binary>>) do
    next_regional_size(rest, unquote(width))
  end
end
61+
# Handle Hangul L
# One clause is generated per codepoint of the "L" cluster. The matched
# codepoint's byte size seeds the running size passed to next_hangul_l_size/2.
for jamo <- cluster["L"] do
  width = byte_size(jamo)

  def next_grapheme_size(<<unquote(jamo), rest::binary>>) do
    next_hangul_l_size(rest, unquote(width))
  end
end
6168
# Handle Hangul V
# Codepoints from the "LV" cluster and from the "V" cluster share the same
# continuation: the remainder is scanned by next_hangul_v_size/2, starting
# with the byte size of the codepoint that was just matched.
for syllable <- cluster["LV"] ++ cluster["V"] do
  width = byte_size(syllable)

  def next_grapheme_size(<<unquote(syllable), rest::binary>>) do
    next_hangul_v_size(rest, unquote(width))
  end
end
75+
# Handle Hangul T
# Codepoints from the "LVT" cluster and from the "T" cluster start a grapheme
# whose remainder is scanned by next_hangul_t_size/2.
#
# The second lookup key must be exactly "T", the same key the other lookups
# of that cluster use. A key padded with whitespace would not be found, and
# `++` would then be handed a non-list right-hand side.
for codepoint <- cluster["LVT"] ++ cluster["T"] do
  def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
    next_hangul_t_size(rest, unquote(byte_size(codepoint)))
  end
end
6882
# Handle E_Base
# Codepoints from the "E_Base" and "E_Base_GAZ" clusters start a grapheme and
# continue in next_extend_size/3 carrying the :e_base marker, which is the
# marker the "E_Modifier" clauses of next_extend_size/3 require.
#
# The second lookup key must be exactly "E_Base_GAZ" (no surrounding
# whitespace), matching the key used when the same cluster is looked up for
# next_extend_size/3. A padded key would not be found, and `++` would then be
# handed a non-list right-hand side.
for codepoint <- cluster["E_Base"] ++ cluster["E_Base_GAZ"] do
  def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
    next_extend_size(rest, unquote(byte_size(codepoint)), :e_base)
  end
end
7589
# Handle ZWJ
# A codepoint of the "ZWJ" cluster at the head of the binary starts a
# grapheme; scanning continues in next_extend_size/3 with the :zwj marker.
for joiner <- cluster["ZWJ"] do
  width = byte_size(joiner)

  def next_grapheme_size(<<unquote(joiner), rest::binary>>) do
    next_extend_size(rest, unquote(width), :zwj)
  end
end
7796
# Handle extended entries
# Generic clause for any UTF-8 codepoint. The number of bytes the codepoint
# occupies is derived from its value, then scanning continues in
# next_extend_size/3 with the :other marker.
def next_grapheme_size(<<cp::utf8, rest::binary>>) do
  width =
    cond do
      cp <= 0x007F -> 1
      cp <= 0x07FF -> 2
      cp <= 0xFFFF -> 3
      true -> 4
    end

  next_extend_size(rest, width, :other)
end
86106
@@ -92,82 +112,139 @@ defmodule String.Unicode do
92112 nil
93113 end
94114
# Handle hanguls
# Continues a grapheme whose last unit was classified as :l. next_hangul/2
# classifies the next codepoint: :l stays in this state, :v and :lv move to
# next_hangul_v_size/2, and :lvt moves to next_hangul_t_size/2. Any other
# result leaves the hangul states; the original `rest` and `size` are then
# handed to next_extend_size/3 with the :other marker.
defp next_hangul_l_size(rest, size) do
  case next_hangul(rest, size) do
    {:l, tail, total} -> next_hangul_l_size(tail, total)
    {kind, tail, total} when kind in [:v, :lv] -> next_hangul_v_size(tail, total)
    {:lvt, tail, total} -> next_hangul_t_size(tail, total)
    _ -> next_extend_size(rest, size, :other)
  end
end
101125
# Continues a grapheme in the "V" state. Only a :v result (stay here) or a :t
# result (move to next_hangul_t_size/2) from next_hangul/2 extends the hangul
# sequence. Every other result hands the untouched `rest` and `size` to
# next_extend_size/3 with the :other marker.
defp next_hangul_v_size(rest, size) do
  case next_hangul(rest, size) do
    {:v, tail, total} -> next_hangul_v_size(tail, total)
    {:t, tail, total} -> next_hangul_t_size(tail, total)
    _ -> next_extend_size(rest, size, :other)
  end
end
107133
# Continues a grapheme in the "T" state. Only a :t result from next_hangul/2
# keeps the hangul sequence going. Every other result hands the untouched
# `rest` and `size` to next_extend_size/3 with the :other marker.
defp next_hangul_t_size(rest, size) do
  case next_hangul(rest, size) do
    {:t, tail, total} -> next_hangul_t_size(tail, total)
    _ -> next_extend_size(rest, size, :other)
  end
end
113140
# Classifies the codepoint at the head of the binary against the hangul
# clusters. On a match it returns `{tag, rest, size + byte_size(codepoint)}`,
# where `tag` names the cluster the codepoint was found in. Clauses are
# generated in the order L, V, T, LV, LVT.
for {tag, class} <- [l: "L", v: "V", t: "T", lv: "LV", lvt: "LVT"],
    codepoint <- cluster[class] do
  width = byte_size(codepoint)

  defp next_hangul(<<unquote(codepoint), rest::binary>>, size) do
    {unquote(tag), rest, size + unquote(width)}
  end
end

# Nothing from the hangul clusters is at the head of the binary.
defp next_hangul(_, _), do: false
139174
# Handle regional
# If another "Regional_Indicator" codepoint follows, it is added to the
# running size exactly once; scanning then moves on to next_extend_size/3
# (this function is not re-entered).
for indicator <- cluster["Regional_Indicator"] do
  width = byte_size(indicator)

  defp next_regional_size(<<unquote(indicator), rest::binary>>, size) do
    next_extend_size(rest, size + unquote(width), :other)
  end
end

# No regional indicator follows: continue with the size unchanged.
defp next_regional_size(rest, size), do: next_extend_size(rest, size, :other)
184+
# Handle Extend+SpacingMark+ZWJ
#
# next_extend_size(binary, size, marker) keeps adding codepoints to `size`
# while they match one of the clauses below, and finally returns
# `{size, rest}`. The marker values used here are :e_base, :zwj and :other;
# the "E_Modifier" clauses require :e_base, and the "Glue_After_Zwj" and
# "E_Base_GAZ" clauses require :zwj.

# "Extend" codepoints: the marker passes through keep_ebase/1.
for extend <- cluster["Extend"] do
  width = byte_size(extend)

  defp next_extend_size(<<unquote(extend), rest::binary>>, size, marker) do
    next_extend_size(rest, size + unquote(width), keep_ebase(marker))
  end
end

# "SpacingMark" codepoints: accepted under any marker, which becomes :other.
for mark <- cluster["SpacingMark"] do
  width = byte_size(mark)

  defp next_extend_size(<<unquote(mark), rest::binary>>, size, _marker) do
    next_extend_size(rest, size + unquote(width), :other)
  end
end

# "ZWJ" codepoints: accepted under any marker, which becomes :zwj.
for joiner <- cluster["ZWJ"] do
  width = byte_size(joiner)

  defp next_extend_size(<<unquote(joiner), rest::binary>>, size, _marker) do
    next_extend_size(rest, size + unquote(width), :zwj)
  end
end

# "E_Modifier" codepoints: only accepted while the marker is :e_base.
for modifier <- cluster["E_Modifier"] do
  width = byte_size(modifier)

  defp next_extend_size(<<unquote(modifier), rest::binary>>, size, :e_base) do
    next_extend_size(rest, size + unquote(width), :other)
  end
end

# "Glue_After_Zwj" codepoints: only accepted while the marker is :zwj.
for glue <- cluster["Glue_After_Zwj"] do
  width = byte_size(glue)

  defp next_extend_size(<<unquote(glue), rest::binary>>, size, :zwj) do
    next_extend_size(rest, size + unquote(width), :other)
  end
end

# "E_Base_GAZ" codepoints: only accepted while the marker is :zwj, and they
# set the marker to :e_base.
for base <- cluster["E_Base_GAZ"] do
  width = byte_size(base)

  defp next_extend_size(<<unquote(base), rest::binary>>, size, :zwj) do
    next_extend_size(rest, size + unquote(width), :e_base)
  end
end

# Nothing else extends the grapheme: report its size and the remaining binary.
defp next_extend_size(rest, size, _marker), do: {size, rest}
161225
# Returns :e_base when the marker is :e_base; every other marker becomes :other.
defp keep_ebase(marker) do
  if marker == :e_base, do: :e_base, else: :other
end
228+
# Handle Prepend
# Each "Prepend" codepoint at the head of the binary is added to the running
# size, and scanning continues in this same function.
for prefix <- cluster["Prepend"] do
  width = byte_size(prefix)

  defp next_prepend_size(<<unquote(prefix), rest::binary>>, size) do
    next_prepend_size(rest, size + unquote(width))
  end
end

# However, if we see a control character, we have to break it:
# a codepoint from the "CR", "LF" or "Control" clusters is left unconsumed and
# the size accumulated so far is returned together with the whole remainder.
for control <- cluster["CR"] ++ cluster["LF"] ++ cluster["Control"] do
  defp next_prepend_size(<<unquote(control), _::binary>> = rest, size) do
    {size, rest}
  end
end

# Anything else: the size reported by next_grapheme_size/1 for the remainder
# is added to the accumulated size. When next_grapheme_size/1 returns nil, the
# accumulated size is returned with the remainder unchanged.
defp next_prepend_size(rest, size) do
  case next_grapheme_size(rest) do
    nil -> {size, rest}
    {more, tail} -> {more + size, tail}
  end
end
172249
173250 # Graphemes
0 commit comments