From 0899a8ba18679f3c3f35a70f6b21c1b962c21c63 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 11 Jan 2026 08:18:32 +0900 Subject: [PATCH 1/2] Remove some now-unused code from mbfl_strcut The legacy mbfl_strcut function is only used to implement mb_strcut for legacy text encodings which 1) do not use a fixed number of bytes per codepoint, 2) do not have an 'mblen_table' which can be used to quickly determine the codepoint length of a byte sequence, and 3) do not have a specialized 'mb_cut' function which implements mb_strcut for that text encoding. Remove unused code from mbfl_strcut, and leave only what is currently needed for the implementation of mb_strcut. --- ext/mbstring/libmbfl/mbfl/mbfilter.c | 349 +++++++++++---------------- 1 file changed, 136 insertions(+), 213 deletions(-) diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index 1c30c9f417755..d2d68795fa0d4 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -103,7 +103,6 @@ mbfl_strcut( size_t from, size_t length) { - const mbfl_encoding *encoding = string->encoding; mbfl_memory_device device; if (from >= string->len) { @@ -113,145 +112,97 @@ mbfl_strcut( mbfl_string_init(result); result->encoding = string->encoding; - if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4)) || encoding->mblen_table != NULL) { - const unsigned char *start = NULL; - const unsigned char *end = NULL; - unsigned char *w; - size_t sz; - - if (encoding->flag & MBFL_ENCTYPE_WCS2) { - from &= -2; - - if (length >= string->len - from) { - length = string->len - from; - } - - start = string->val + from; - end = start + (length & -2); - } else if (encoding->flag & MBFL_ENCTYPE_WCS4) { - from &= -4; - - if (length >= string->len - from) { - length = string->len - from; - } - - start = string->val + from; - end = start + (length & -4); - } else if ((encoding->flag & MBFL_ENCTYPE_SBCS)) { - if (length >= string->len - from) { - length = string->len - from; - } - - start = string->val + from; - end = start + length; - } else if (encoding->mblen_table != NULL) { - const unsigned char *mbtab = encoding->mblen_table; - const unsigned char *p, *q; - int m; - - /* search start position */ - for (m = 0, p = string->val, q = p + from; - p < q; p += (m = mbtab[*p])); - - if (p > q) { - p -= m; - } - - start = p; + mbfl_convert_filter *encoder = NULL; + mbfl_convert_filter *decoder = NULL; + const unsigned char *p, *q, *r; + struct { + mbfl_convert_filter encoder; + mbfl_convert_filter decoder; + const unsigned char *p; + size_t pos; + } bk, _bk; + + /* output code filter */ + if (!(decoder = mbfl_convert_filter_new( + &mbfl_encoding_wchar, + string->encoding, + mbfl_memory_device_output, 0, &device))) { + return NULL; + } - /* search end position */ - if (length >= string->len - (start - string->val)) { - end = string->val + string->len; - } else { - for (q = p + length; p < q; p += (m = mbtab[*p])); + /* wchar filter */ + if (!(encoder = mbfl_convert_filter_new( + string->encoding, + &mbfl_encoding_wchar, + mbfl_filter_output_null, + NULL, NULL))) { + mbfl_convert_filter_delete(decoder); + return NULL; + } - if (p > q) { - p -= m; - } - end = p; - } - } else { - /* never reached */ - return NULL; - } + mbfl_memory_device_init(&device, length + 8, 0); - /* allocate memory and copy string */ - sz = end - start; - w = ecalloc(sz + 8, sizeof(unsigned char)); + p = string->val; - memcpy(w, start, sz); - w[sz] = '\0'; - w[sz + 1] = '\0'; - w[sz + 2] = '\0'; - w[sz + 3] = '\0'; + /* search start position */ + for (q = string->val + from; p < q; p++) { + (*encoder->filter_function)(*p, encoder); + } - result->val = w; - result->len = sz; - } else { - mbfl_convert_filter *encoder = NULL; - mbfl_convert_filter *decoder = NULL; - const unsigned char *p, *q, *r; - struct { - mbfl_convert_filter encoder; - mbfl_convert_filter decoder; - const unsigned char *p; - size_t pos; - } bk, _bk; - - /* output code filter */ - if (!(decoder = mbfl_convert_filter_new( - &mbfl_encoding_wchar, - string->encoding, - mbfl_memory_device_output, 0, &device))) { - return NULL; - } + /* switch the drain direction */ + encoder->output_function = (output_function_t)decoder->filter_function; + encoder->flush_function = (flush_function_t)decoder->filter_flush; + encoder->data = decoder; - /* wchar filter */ - if (!(encoder = mbfl_convert_filter_new( - string->encoding, - &mbfl_encoding_wchar, - mbfl_filter_output_null, - NULL, NULL))) { - mbfl_convert_filter_delete(decoder); - return NULL; - } + q = string->val + string->len; - mbfl_memory_device_init(&device, length + 8, 0); + /* save the encoder, decoder state and the pointer */ + mbfl_convert_filter_copy(decoder, &_bk.decoder); + mbfl_convert_filter_copy(encoder, &_bk.encoder); + _bk.p = p; + _bk.pos = device.pos; - p = string->val; + if (length > q - p) { + length = q - p; + } - /* search start position */ - for (q = string->val + from; p < q; p++) { + if (length >= 20) { + /* output a little shorter than "length" */ + /* XXX: the constant "20" was determined purely on the heuristics. */ + for (r = p + length - 20; p < r; p++) { (*encoder->filter_function)(*p, encoder); } - /* switch the drain direction */ - encoder->output_function = (output_function_t)decoder->filter_function; - encoder->flush_function = (flush_function_t)decoder->filter_flush; - encoder->data = decoder; - - q = string->val + string->len; - - /* save the encoder, decoder state and the pointer */ - mbfl_convert_filter_copy(decoder, &_bk.decoder); - mbfl_convert_filter_copy(encoder, &_bk.encoder); - _bk.p = p; - _bk.pos = device.pos; - - if (length > q - p) { - length = q - p; - } + /* if the offset of the resulting string exceeds the length, + * then restore the state */ + if (device.pos > length) { + p = _bk.p; + device.pos = _bk.pos; + if (decoder->filter_dtor) + decoder->filter_dtor(decoder); + if (encoder->filter_dtor) + encoder->filter_dtor(encoder); + mbfl_convert_filter_copy(&_bk.decoder, decoder); + mbfl_convert_filter_copy(&_bk.encoder, encoder); + bk = _bk; + } else { + /* save the encoder, decoder state and the pointer */ + mbfl_convert_filter_copy(decoder, &bk.decoder); + mbfl_convert_filter_copy(encoder, &bk.encoder); + bk.p = p; + bk.pos = device.pos; - if (length >= 20) { - /* output a little shorter than "length" */ - /* XXX: the constant "20" was determined purely on the heuristics. */ - for (r = p + length - 20; p < r; p++) { - (*encoder->filter_function)(*p, encoder); - } + /* flush the stream */ + (*encoder->filter_flush)(encoder); /* if the offset of the resulting string exceeds the length, * then restore the state */ if (device.pos > length) { + if (bk.decoder.filter_dtor) + bk.decoder.filter_dtor(&bk.decoder); + if (bk.encoder.filter_dtor) + bk.encoder.filter_dtor(&bk.encoder); + p = _bk.p; device.pos = _bk.pos; if (decoder->filter_dtor) @@ -262,86 +213,11 @@ mbfl_strcut( mbfl_convert_filter_copy(&_bk.encoder, encoder); bk = _bk; } else { - /* save the encoder, decoder state and the pointer */ - mbfl_convert_filter_copy(decoder, &bk.decoder); - mbfl_convert_filter_copy(encoder, &bk.encoder); - bk.p = p; - bk.pos = device.pos; - - /* flush the stream */ - (*encoder->filter_flush)(encoder); - - /* if the offset of the resulting string exceeds the length, - * then restore the state */ - if (device.pos > length) { - if (bk.decoder.filter_dtor) - bk.decoder.filter_dtor(&bk.decoder); - if (bk.encoder.filter_dtor) - bk.encoder.filter_dtor(&bk.encoder); - - p = _bk.p; - device.pos = _bk.pos; - if (decoder->filter_dtor) - decoder->filter_dtor(decoder); - if (encoder->filter_dtor) - encoder->filter_dtor(encoder); - mbfl_convert_filter_copy(&_bk.decoder, decoder); - mbfl_convert_filter_copy(&_bk.encoder, encoder); - bk = _bk; - } else { - if (_bk.decoder.filter_dtor) - _bk.decoder.filter_dtor(&_bk.decoder); - if (_bk.encoder.filter_dtor) - _bk.encoder.filter_dtor(&_bk.encoder); - - p = bk.p; - device.pos = bk.pos; - if (decoder->filter_dtor) - decoder->filter_dtor(decoder); - if (encoder->filter_dtor) - encoder->filter_dtor(encoder); - mbfl_convert_filter_copy(&bk.decoder, decoder); - mbfl_convert_filter_copy(&bk.encoder, encoder); - } - } - } else { - bk = _bk; - } - - /* detect end position */ - while (p < q) { - (*encoder->filter_function)(*p, encoder); - - if (device.pos > length) { - /* restore filter */ - p = bk.p; - device.pos = bk.pos; - if (decoder->filter_dtor) - decoder->filter_dtor(decoder); - if (encoder->filter_dtor) - encoder->filter_dtor(encoder); - mbfl_convert_filter_copy(&bk.decoder, decoder); - mbfl_convert_filter_copy(&bk.encoder, encoder); - break; - } - - p++; - - /* backup current state */ - mbfl_convert_filter_copy(decoder, &_bk.decoder); - mbfl_convert_filter_copy(encoder, &_bk.encoder); - _bk.pos = device.pos; - _bk.p = p; - - (*encoder->filter_flush)(encoder); - - if (device.pos > length) { if (_bk.decoder.filter_dtor) _bk.decoder.filter_dtor(&_bk.decoder); if (_bk.encoder.filter_dtor) _bk.encoder.filter_dtor(&_bk.encoder); - /* restore filter */ p = bk.p; device.pos = bk.pos; if (decoder->filter_dtor) @@ -350,39 +226,86 @@ mbfl_strcut( encoder->filter_dtor(encoder); mbfl_convert_filter_copy(&bk.decoder, decoder); mbfl_convert_filter_copy(&bk.encoder, encoder); - break; } + } + } else { + bk = _bk; + } - if (bk.decoder.filter_dtor) - bk.decoder.filter_dtor(&bk.decoder); - if (bk.encoder.filter_dtor) - bk.encoder.filter_dtor(&bk.encoder); + /* detect end position */ + while (p < q) { + (*encoder->filter_function)(*p, encoder); - p = _bk.p; - device.pos = _bk.pos; + if (device.pos > length) { + /* restore filter */ + p = bk.p; + device.pos = bk.pos; if (decoder->filter_dtor) decoder->filter_dtor(decoder); if (encoder->filter_dtor) encoder->filter_dtor(encoder); - mbfl_convert_filter_copy(&_bk.decoder, decoder); - mbfl_convert_filter_copy(&_bk.encoder, encoder); - - bk = _bk; + mbfl_convert_filter_copy(&bk.decoder, decoder); + mbfl_convert_filter_copy(&bk.encoder, encoder); + break; } - decoder->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE; + p++; + + /* backup current state */ + mbfl_convert_filter_copy(decoder, &_bk.decoder); + mbfl_convert_filter_copy(encoder, &_bk.encoder); + _bk.pos = device.pos; + _bk.p = p; + (*encoder->filter_flush)(encoder); + if (device.pos > length) { + if (_bk.decoder.filter_dtor) + _bk.decoder.filter_dtor(&_bk.decoder); + if (_bk.encoder.filter_dtor) + _bk.encoder.filter_dtor(&_bk.encoder); + + /* restore filter */ + p = bk.p; + device.pos = bk.pos; + if (decoder->filter_dtor) + decoder->filter_dtor(decoder); + if (encoder->filter_dtor) + encoder->filter_dtor(encoder); + mbfl_convert_filter_copy(&bk.decoder, decoder); + mbfl_convert_filter_copy(&bk.encoder, encoder); + break; + } + if (bk.decoder.filter_dtor) bk.decoder.filter_dtor(&bk.decoder); if (bk.encoder.filter_dtor) bk.encoder.filter_dtor(&bk.encoder); - result = mbfl_memory_device_result(&device, result); + p = _bk.p; + device.pos = _bk.pos; + if (decoder->filter_dtor) + decoder->filter_dtor(decoder); + if (encoder->filter_dtor) + encoder->filter_dtor(encoder); + mbfl_convert_filter_copy(&_bk.decoder, decoder); + mbfl_convert_filter_copy(&_bk.encoder, encoder); - mbfl_convert_filter_delete(encoder); - mbfl_convert_filter_delete(decoder); + bk = _bk; } + decoder->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE; + (*encoder->filter_flush)(encoder); + + if (bk.decoder.filter_dtor) + bk.decoder.filter_dtor(&bk.decoder); + if (bk.encoder.filter_dtor) + bk.encoder.filter_dtor(&bk.encoder); + + result = mbfl_memory_device_result(&device, result); + + mbfl_convert_filter_delete(encoder); + mbfl_convert_filter_delete(decoder); + return result; } From 8e135389700ef1632b5e6dc027ed7a6b7797df5e Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sun, 11 Jan 2026 08:23:59 +0900 Subject: [PATCH 2/2] Remove unused conversion code from mbstring Over the last few years, I refactored mbstring to perform encoding conversion a buffer at a time, rather than a single byte at a time. This resulted in a huge performance increase. After the refactoring, the old "byte-at-a-time" code was retained for two reasons: 1) It was used by the mailparse PECL extension. 2) It was used to implement mb_strcut for some text encodings. However, after reviewing mailparse's use of mbstring, it is clear that mailparse only relies on mbstring for decoding of QPrint, and possibly Base64. It does not use the byte-at-a-time conversion code for any other encoding. Further, mb_strcut only relies on the byte-at-a-time conversion code for a limited number of legacy text encodings, such as ISO-2022-JP, HZ, UTF-7, etc. Hence, we can remove over 5000 lines of unused code without breaking anything. This will help to reduce binary size, and make the mbstring codebase easier to navigate for new contributors. --- ext/mbstring/libmbfl/filters/mbfilter_cjk.c | 4603 ++--------------- ext/mbstring/libmbfl/filters/mbfilter_cjk.h | 4 - .../libmbfl/filters/mbfilter_cp51932.h | 5 - .../libmbfl/filters/mbfilter_singlebyte.c | 150 +- ext/mbstring/libmbfl/filters/mbfilter_ucs2.c | 157 +- ext/mbstring/libmbfl/filters/mbfilter_ucs2.h | 12 - ext/mbstring/libmbfl/filters/mbfilter_ucs4.c | 239 +- ext/mbstring/libmbfl/filters/mbfilter_ucs4.h | 12 - ext/mbstring/libmbfl/filters/mbfilter_utf16.c | 269 +- ext/mbstring/libmbfl/filters/mbfilter_utf16.h | 13 - ext/mbstring/libmbfl/filters/mbfilter_utf32.c | 177 +- ext/mbstring/libmbfl/filters/mbfilter_utf32.h | 13 - ext/mbstring/libmbfl/filters/mbfilter_utf8.c | 393 +- ext/mbstring/libmbfl/filters/mbfilter_utf8.h | 12 - 14 files changed, 608 insertions(+), 5451 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cjk.c b/ext/mbstring/libmbfl/filters/mbfilter_cjk.c index 716fec0c054d9..6a9c3803c4703 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cjk.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cjk.c @@ -4720,116 +4720,6 @@ const mbfl_encoding mbfl_encoding_2022kr = { * SJIS variants */ -static int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter) -{ - int s1, s2, w; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* ASCII */ - CK((*filter->output_function)(c, filter->data)); - } else if (c > 0xA0 && c < 0xE0) { /* Kana */ - CK((*filter->output_function)(0xFEC0 + c, filter->data)); - } else if (c > 0x80 && c < 0xF0 && c != 0xA0) { /* Kanji, first byte */ - filter->status = 1; - filter->cache = c; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* Kanji, second byte */ - filter->status = 0; - int c1 = filter->cache; - if (c >= 0x40 && c <= 0xFC && c != 0x7F) { - SJIS_DECODE(c1, c, s1, s2); - w = (s1 - 0x21)*94 + s2 - 0x21; - if (w >= 0 && w < jisx0208_ucs_table_size) { - w = jisx0208_ucs_table[w]; - if (!w) - w = MBFL_BAD_INPUT; - } else { - w = MBFL_BAD_INPUT; - } - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - } - - return 0; -} - -static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status && filter->status != 4) { - (*filter->output_function)(MBFL_BAD_INPUT, filter->data); - } - filter->status = 0; - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter) -{ - int c1, c2, s1 = 0, s2; - - if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { - s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; - } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { - s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min]; - } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) { - s1 = ucs_i_jis_table[c - ucs_i_jis_table_min]; - } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) { - s1 = ucs_r_jis_table[c - ucs_r_jis_table_min]; - } - if (s1 <= 0) { - if (c == 0xA5) { /* YEN SIGN */ - s1 = 0x216F; /* FULLWIDTH YEN SIGN */ - } else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */ - s1 = 0x2131; /* FULLWIDTH MACRON */ - } else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ - s1 = 0x2140; - } else if (c == 0x2225) { /* PARALLEL TO */ - s1 = 0x2142; - } else if (c == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */ - s1 = 0x215D; - } else if (c == 0xFFE0) { /* FULLWIDTH CENT SIGN */ - s1 = 0x2171; - } else if (c == 0xFFE1) { /* FULLWIDTH POUND SIGN */ - s1 = 0x2172; - } else if (c == 0xFFE2) { /* FULLWIDTH NOT SIGN */ - s1 = 0x224C; - } else if (c == 0) { - s1 = 0; - } else { - s1 = -1; - } - } else if (s1 >= 0x8080) { /* JIS X 0212; not supported */ - s1 = -1; - } - - if (s1 >= 0) { - if (s1 < 0x100) { /* Latin/Kana */ - CK((*filter->output_function)(s1, filter->data)); - } else { /* Kanji */ - c1 = (s1 >> 8) & 0xFF; - c2 = s1 & 0xFF; - SJIS_ENCODE(c1, c2, s1, s2); - CK((*filter->output_function)(s1, filter->data)); - CK((*filter->output_function)(s2, filter->data)); - } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - static const unsigned short sjis_decode_tbl1[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFFFF, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 0xFFFF, -6016, -5828, -5640, -5452, -5264, -5076, -4888, -4700, -4512, -4324, -4136, -3948, -3760, -3572, -3384, -3196, -3008, -2820, -2632, -2444, -2256, -2068, -1880, -1692, -1504, -1316, -1128, -940, -752, -564, -376, -188, 0, 188, 376, 564, 752, 940, 1128, 1316, 1504, 1692, 1880, 2068, 2256, 2444, 2632, 2820, 3008, 3196, 3384, 3572, 3760, 3948, 4136, 4324, 4512, 4700, 4888, 5076, 5264, 5452, 5640, 5828, 6016, 6204, 6392, 6580, 6768, 6956, 7144, 7332, 7520, 7708, 7896, 8084, 8272, 8460, 8648, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }; @@ -4955,1452 +4845,449 @@ static void mb_wchar_to_sjis(uint32_t *in, size_t len, mb_convert_buf *buf, bool MB_CONVERT_BUF_STORE(buf, out, limit); } -static int mbfl_filt_conv_sjis_mac_wchar(int c, mbfl_convert_filter *filter) +static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { - int i, j, n; - int c1, s, s1, s2, w; + /* A single SJIS-Mac kuten code can convert to up to 5 Unicode codepoints, oh my! */ + ZEND_ASSERT(bufsize >= 5); - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80 && c != 0x5c) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (c > 0xa0 && c < 0xe0) { /* kana */ - CK((*filter->output_function)(0xfec0 + c, filter->data)); - } else if (c > 0x80 && c <= 0xed && c != 0xa0) { /* kanji first char */ - filter->status = 1; - filter->cache = c; - } else if (c == 0x5c) { - CK((*filter->output_function)(0x00a5, filter->data)); - } else if (c == 0x80) { - CK((*filter->output_function)(0x005c, filter->data)); - } else if (c == 0xa0) { - CK((*filter->output_function)(0x00a0, filter->data)); - } else if (c == 0xfd) { - CK((*filter->output_function)(0x00a9, filter->data)); - } else if (c == 0xfe) { - CK((*filter->output_function)(0x2122, filter->data)); - } else if (c == 0xff) { - CK((*filter->output_function)(0x2026, filter->data)); - CK((*filter->output_function)(0xf87f, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; + unsigned char *p = *in, *e = p + *in_len; + uint32_t *out = buf, *limit = buf + bufsize; - case 1: /* kanji second char */ - filter->status = 0; - c1 = filter->cache; - if (c >= 0x40 && c <= 0xfc && c != 0x7f) { - w = 0; - SJIS_DECODE(c1, c, s1, s2); - s = (s1 - 0x21)*94 + s2 - 0x21; - if (s <= 0x89) { - if (s == 0x1c) { - w = 0x2014; /* EM DASH */ - } else if (s == 0x1f) { - w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */ - } else if (s == 0x20) { - w = 0x301c; /* FULLWIDTH TILDE */ - } else if (s == 0x21) { - w = 0x2016; /* PARALLEL TO */ - } else if (s == 0x3c) { - w = 0x2212; /* FULLWIDTH HYPHEN-MINUS */ - } else if (s == 0x50) { - w = 0x00a2; /* FULLWIDTH CENT SIGN */ - } else if (s == 0x51) { - w = 0x00a3; /* FULLWIDTH POUND SIGN */ - } else if (s == 0x89) { - w = 0x00ac; /* FULLWIDTH NOT SIGN */ - } - } - - /* apple gaiji area 0x8540 - 0x886d */ - if (w == 0) { - for (i=0; i<7; i++) { - if (s >= code_tbl[i][0] && s <= code_tbl[i][1]) { - w = s - code_tbl[i][0] + code_tbl[i][2]; - break; - } - } - } + while (p < e && out < limit) { + unsigned char c = *p++; - if (w == 0) { + if (c <= 0x80 || c == 0xA0) { + if (c == 0x5C) { + *out++ = 0xA5; + } else if (c == 0x80) { + *out++ = 0x5C; + } else { + *out++ = c; + } + } else if (c >= 0xA1 && c <= 0xDF) { + *out++ = 0xFEC0 + c; + } else if (c <= 0xED) { + if (p == e) { + *out++ = MBFL_BAD_INPUT; + break; + } + unsigned char c2 = *p++; + uint32_t w = sjis_decode_tbl1[c] + sjis_decode_tbl2[c2]; - for (i=0; ioutput_function)(code_tbl_m[i][j], filter->data)); + if (w <= 0x89) { + if (w == 0x1C) { + *out++ = 0x2014; /* EM DASH */ + continue; + } else if (w == 0x1F) { + *out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */ + continue; + } else if (w == 0x20) { + *out++ = 0x301C; /* FULLWIDTH TILDE */ + continue; + } else if (w == 0x21) { + *out++ = 0x2016; /* PARALLEL TO */ + continue; + } else if (w == 0x3C) { + *out++ = 0x2212; /* FULLWIDTH HYPHEN-MINUS */ + continue; + } else if (w == 0x50) { + *out++ = 0xA2; /* FULLWIDTH CENT SIGN */ + continue; + } else if (w == 0x51) { + *out++ = 0xA3; /* FULLWIDTH POUND SIGN */ + continue; + } else if (w == 0x89) { + *out++ = 0xAC; /* FULLWIDTH NOT SIGN */ + continue; + } + } else { + if (w >= 0x2F0 && w <= 0x3A3) { + for (int i = 0; i < 7; i++) { + if (w >= code_tbl[i][0] && w <= code_tbl[i][1]) { + *out++ = w - code_tbl[i][0] + code_tbl[i][2]; + goto next_iteration; } - w = code_tbl_m[i][n-1]; - break; } } - } - if (w == 0) { - for (i=0; i<8; i++) { - if (s >= code_ofst_tbl[i][0] && s <= code_ofst_tbl[i][1]) { - w = code_map[i][s - code_ofst_tbl[i][0]]; - if (w == 0) { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - return 0; - } - s2 = 0; - if (s >= 0x043e && s <= 0x0441) { - s2 = 0xf87a; - } else if (s == 0x03b1 || s == 0x03b7) { - s2 = 0xf87f; - } else if (s == 0x04b8 || s == 0x04b9 || s == 0x04c4) { - s2 = 0x20dd; - } else if (s == 0x1ed9 || s == 0x1eda || s == 0x1ee8 || s == 0x1ef3 || - (s >= 0x1ef5 && s <= 0x1efb) || s == 0x1f05 || s == 0x1f06 || - s == 0x1f18 || (s >= 0x1ff2 && s <= 0x20a5)) { - s2 = 0xf87e; + if (w >= 0x340 && w <= 0x523) { + for (int i = 0; i < code_tbl_m_len; i++) { + if (w == code_tbl_m[i][0]) { + int n = 5; + if (code_tbl_m[i][1] == 0xF860) { + n = 3; + } else if (code_tbl_m[i][1] == 0xF861) { + n = 4; + } + if ((limit - out) < n) { + p -= 2; + goto finished; + } + for (int j = 1; j <= n; j++) { + *out++ = code_tbl_m[i][j]; + } + goto next_iteration; } - if (s2 > 0) { - CK((*filter->output_function)(w, filter->data)); - w = s2; + } + } + + if (w >= 0x3AC && w <= 0x20A5) { + for (int i = 0; i < 8; i++) { + if (w >= code_ofst_tbl[i][0] && w <= code_ofst_tbl[i][1]) { + uint32_t w2 = code_map[i][w - code_ofst_tbl[i][0]]; + if (!w2) { + *out++ = MBFL_BAD_INPUT; + goto next_iteration; + } + if ((limit - out) < 2) { + p -= 2; + goto finished; + } + *out++ = w2; + if (w >= 0x43E && w <= 0x441) { + *out++ = 0xF87A; + } else if (w == 0x3B1 || w == 0x3B7) { + *out++ = 0xF87F; + } else if (w == 0x4B8 || w == 0x4B9 || w == 0x4C4) { + *out++ = 0x20DD; + } else if (w == 0x1ED9 || w == 0x1EDA || w == 0x1EE8 || w == 0x1EF3 || (w >= 0x1EF5 && w <= 0x1EFB) || w == 0x1F05 || w == 0x1F06 || w == 0x1F18 || (w >= 0x1FF2 && w <= 0x20A5)) { + *out++ = 0xF87E; + } + goto next_iteration; } - break; } } } - if (w == 0 && s >= 0 && s < jisx0208_ucs_table_size) { /* X 0208 */ - w = jisx0208_ucs_table[s]; + if (w < jisx0208_ucs_table_size) { + w = jisx0208_ucs_table[w]; + if (!w) + w = MBFL_BAD_INPUT; + *out++ = w; + } else { + *out++ = MBFL_BAD_INPUT; } - - if (w <= 0) { - w = MBFL_BAD_INPUT; + } else if (c == 0xFD) { + *out++ = 0xA9; + } else if (c == 0xFE) { + *out++ = 0x2122; + } else if (c == 0xFF) { + if ((limit - out) < 2) { + p--; + break; } - CK((*filter->output_function)(w, filter->data)); + *out++ = 0x2026; + *out++ = 0xF87F; } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); + *out++ = MBFL_BAD_INPUT; } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); +next_iteration: ; } - return 0; +finished: + *in_len = e - p; + *in = p; + return out - buf; } -static int mbfl_filt_conv_wchar_sjis_mac(int c, mbfl_convert_filter *filter) +static bool process_s_form(uint32_t w, uint32_t w2, unsigned int *s) { - int i, c1, c2, s1 = 0, s2 = 0, mode; - - // a1: U+0000 -> U+046F - // a2: U+2000 -> U+30FF - // i: U+4E00 -> U+9FFF - // r: U+FF00 -> U+FFFF - - switch (filter->status) { - case 1: - c1 = filter->cache; - filter->cache = filter->status = 0; - - if (c == 0xf87a) { - for (i = 0; i < 4; i++) { - if (c1 == s_form_tbl[i+34+3+3]) { - s1 = s_form_sjis_tbl[i+34+3+3]; - break; - } - } - if (s1 <= 0) { - s2 = c1; - } - } else if (c == 0x20dd) { - for (i = 0; i < 3; i++) { - if (c1 == s_form_tbl[i+34+3]) { - s1 = s_form_sjis_tbl[i+34+3]; - break; - } - } - if (s1 <= 0) { - s2 = c1; - } - } else if (c == 0xf87f) { - for (i = 0; i < 3; i++) { - if (c1 == s_form_tbl[i+34]) { - s1 = s_form_sjis_tbl[i+34]; - break; - } - } - if (s1 <= 0) { - s2 = c1; - s1 = -1; - } - } else if (c == 0xf87e) { - for (i = 0; i < 34; i++) { - if (c1 == s_form_tbl[i]) { - s1 = s_form_sjis_tbl[i]; - break; - } - } - if (s1 <= 0) { - s2 = c1; - s1 = -1; - } - } else { - s2 = c1; - s1 = c; - } - - if (s2 > 0) { - for (i = 0; i < s_form_tbl_len; i++) { - if (c1 == s_form_tbl[i]) { - s1 = s_form_sjis_fallback_tbl[i]; - break; - } + if (w2 == 0xF87A) { + for (int i = 0; i < 4; i++) { + if (w == s_form_tbl[i+34+3+3]) { + *s = s_form_sjis_tbl[i+34+3+3]; + return true; } } - - if (s1 >= 0) { - if (s1 < 0x100) { - CK((*filter->output_function)(s1, filter->data)); - } else { - CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(s1 & 0xff, filter->data)); + } else if (w2 == 0x20DD) { + for (int i = 0; i < 3; i++) { + if (w == s_form_tbl[i+34+3]) { + *s = s_form_sjis_tbl[i+34+3]; + return true; } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - if (s2 <= 0 || s1 == -1) { - break; } - s1 = s2 = 0; - ZEND_FALLTHROUGH; - - case 0: - if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { - s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; - if (c == 0x5c) { - s1 = 0x80; - } else if (c == 0xa9) { - s1 = 0xfd; - } - } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { - s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min]; - if (c == 0x2122) { - s1 = 0xfe; - } else if (c == 0x2014) { - s1 = 0x213d; - } else if (c == 0x2116) { - s1 = 0x2c1d; - } - } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) { - s1 = ucs_i_jis_table[c - ucs_i_jis_table_min]; - } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) { - s1 = ucs_r_jis_table[c - ucs_r_jis_table_min]; - } - - if (c >= 0x2000) { - for (i = 0; i < s_form_tbl_len; i++) { - if (c == s_form_tbl[i]) { - filter->status = 1; - filter->cache = c; - return 0; - } - } - - if (c == 0xf860 || c == 0xf861 || c == 0xf862) { - /* Apple 'transcoding hint' codepoints (from private use area) */ - filter->status = 2; - filter->cache = c; - return 0; + } else if (w2 == 0xF87F) { + for (int i = 0; i < 3; i++) { + if (w == s_form_tbl[i+34]) { + *s = s_form_sjis_tbl[i+34]; + return true; } } - - if (s1 <= 0) { - if (c == 0xa0) { - s1 = 0x00a0; - } else if (c == 0xa5) { /* YEN SIGN */ - /* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign; - * convert codepoint 0xA5 to halfwidth Yen sign */ - s1 = 0x5c; /* HALFWIDTH YEN SIGN */ - } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ - s1 = 0x2140; + } else if (w2 == 0xF87E) { + for (int i = 0; i < 34; i++) { + if (w == s_form_tbl[i]) { + *s = s_form_sjis_tbl[i]; + return true; } } + } - if (s1 <= 0) { - for (i=0; i= wchar2sjis_mac_r_tbl[i][0] && c <= wchar2sjis_mac_r_tbl[i][1]) { - s1 = c - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2]; - break; - } - } - - if (s1 <= 0) { - for (i=0; i= wchar2sjis_mac_r_map[i][0] && c <= wchar2sjis_mac_r_map[i][1]) { - s1 = wchar2sjis_mac_code_map[i][c-wchar2sjis_mac_r_map[i][0]]; - break; - } - } - } - - if (s1 <= 0) { - for (i=0; i 0) { - c1 = s1/94+0x21; - c2 = s1-94*(c1-0x21)+0x21; - s1 = (c1 << 8) | c2; - s2 = 1; - } - } +/* For codepoints F860-F862, which are treated specially in MacJapanese */ +static int transcoding_hint_cp_width[3] = { 3, 4, 5 }; - if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) { /* not found or X 0212 */ - s1 = -1; - c1 = 0; +static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) +{ + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); - if (c == 0) { - s1 = 0; - } else if (s1 <= 0) { - s1 = -1; - } - } + uint32_t w; - if (s1 >= 0) { - if (s1 < 0x100) { /* latin or kana */ - CK((*filter->output_function)(s1, filter->data)); - } else { /* kanji */ - c1 = (s1 >> 8) & 0xff; - c2 = s1 & 0xff; - SJIS_ENCODE(c1, c2, s1, s2); - CK((*filter->output_function)(s1, filter->data)); - CK((*filter->output_function)(s2, filter->data)); - } + if (buf->state) { + w = buf->state & 0xFFFF; + if (buf->state & 0xFF000000L) { + goto resume_transcoding_hint; } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - break; - - case 2: - c1 = filter->cache; - filter->cache = 0; - filter->status = 0; - if (c1 == 0xf860) { - for (i = 0; i < 5; i++) { - if (c == code_tbl_m[i][2]) { - filter->cache = c | 0x10000; - filter->status = 3; - break; - } - } - } else if (c1 == 0xf861) { - for (i = 0; i < 3; i++) { - if (c == code_tbl_m[i+5][2]) { - filter->cache = c | 0x20000; - filter->status = 3; - break; - } - } - } else if (c1 == 0xf862) { - for (i = 0; i < 4; i++) { - if (c == code_tbl_m[i+5+3][2]) { - filter->cache = c | 0x40000; - filter->status = 3; - break; - } - } - } - - if (filter->status == 0) { - /* Didn't find any of expected codepoints after Apple transcoding hint */ - CK(mbfl_filt_conv_illegal_output(c1, filter)); - return mbfl_filt_conv_wchar_sjis_mac(c, filter); + buf->state = 0; + goto process_codepoint; } - break; - - case 3: - s1 = 0; - c1 = filter->cache & 0xffff; - mode = (filter->cache & 0xf0000) >> 16; - - filter->cache = filter->status = 0; + } - if (mode == 0x1) { - for (i = 0; i < 5; i++) { - if (c1 == code_tbl_m[i][2] && c == code_tbl_m[i][3]) { - s1 = code_tbl_m[i][0]; - break; - } - } + while (len--) { + w = *in++; +process_codepoint: ; + unsigned int s = 0; - if (s1 > 0) { - c1 = s1/94+0x21; - c2 = s1-94*(c1-0x21)+0x21; - SJIS_ENCODE(c1, c2, s1, s2); - CK((*filter->output_function)(s1, filter->data)); - CK((*filter->output_function)(s2, filter->data)); + if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) { + if (w == 0x5C) { + s = 0x80; + } else if (w == 0xA9) { + s = 0xFD; } else { - CK(mbfl_filt_conv_illegal_output(0xf860, filter)); - CK(mbfl_filt_conv_illegal_output(c1, filter)); - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - } else if (mode == 0x2) { - for (i = 0; i < 3; i++) { - if (c1 == code_tbl_m[i+5][2] && c == code_tbl_m[i+5][3]) { - filter->cache = c | 0x20000; - filter->status = 4; - break; - } + s = ucs_a1_jis_table[w - ucs_a1_jis_table_min]; } - } else if (mode == 0x4) { - for (i = 0; i < 4; i++) { - if (c1 == code_tbl_m[i+8][2] && c == code_tbl_m[i+8][3]) { - filter->cache = c | 0x40000; - filter->status = 4; - break; - } + } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) { + if (w == 0x2122) { + s = 0xFE; + } else if (w == 0x2014) { + s = 0x213D; + } else if (w == 0x2116) { + s = 0x2C1D; + } else { + s = ucs_a2_jis_table[w - ucs_a2_jis_table_min]; } + } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) { + s = ucs_i_jis_table[w - ucs_i_jis_table_min]; + } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) { + s = ucs_r_jis_table[w - ucs_r_jis_table_min]; } - break; - - case 4: - s1 = 0; - c1 = filter->cache & 0xffff; - mode = (filter->cache & 0xf0000) >> 16; - filter->cache = 0; - filter->status = 0; + if (w >= 0x2000) { + for (int i = 0; i < s_form_tbl_len; i++) { + if (w == s_form_tbl[i]) { + if (!len) { + if (end) { + s = s_form_sjis_fallback_tbl[i]; + if (s) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, 2); + out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); + } + } else { + buf->state = w; + } + MB_CONVERT_BUF_STORE(buf, out, limit); + return; + } + uint32_t w2 = *in++; + len--; - if (mode == 0x2) { - for (i = 0; i < 3; i++) { - if (c1 == code_tbl_m[i+5][3] && c == code_tbl_m[i+5][4]) { - s1 = code_tbl_m[i+5][0]; - break; - } - } + if (!process_s_form(w, w2, &s)) { + in--; len++; - if (s1 > 0) { - c1 = s1/94+0x21; - c2 = s1-94*(c1-0x21)+0x21; - SJIS_ENCODE(c1, c2, s1, s2); - CK((*filter->output_function)(s1, filter->data)); - CK((*filter->output_function)(s2, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(0xf861, filter)); - for (i = 0; i < 3; i++) { - if (c1 == code_tbl_m[i+5][3]) { - CK(mbfl_filt_conv_illegal_output(code_tbl_m[i+5][2], filter)); - break; + for (int i = 0; i < s_form_tbl_len; i++) { + if (w == s_form_tbl[i]) { + s = s_form_sjis_fallback_tbl[i]; + break; + } + } } - } - CK(mbfl_filt_conv_illegal_output(c1, filter)); - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - } else if (mode == 0x4) { - for (i = 0; i < 4; i++) { - if (c1 == code_tbl_m[i+8][3] && c == code_tbl_m[i+8][4]) { - filter->cache = c | 0x40000; - filter->status = 5; - break; - } - } - } - break; - case 5: - s1 = 0; - c1 = filter->cache & 0xffff; - mode = (filter->cache & 0xf0000) >> 16; - - filter->cache = filter->status = 0; + if (s <= 0xFF) { + out = mb_convert_buf_add(out, s); + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); + } - if (mode == 0x4) { - for (i = 0; i < 4; i++) { - if (c1 == code_tbl_m[i+8][4] && c == code_tbl_m[i+8][5]) { - s1 = code_tbl_m[i+8][0]; - break; + goto next_iteration; } } - if (s1 > 0) { - c1 = s1/94+0x21; - c2 = s1-94*(c1-0x21)+0x21; - SJIS_ENCODE(c1, c2, s1, s2); - CK((*filter->output_function)(s1, filter->data)); - CK((*filter->output_function)(s2, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(0xf862, filter)); - for (i = 0; i < 4; i++) { - if (c1 == code_tbl_m[i+8][4]) { - CK(mbfl_filt_conv_illegal_output( code_tbl_m[i+8][2], filter)); - CK(mbfl_filt_conv_illegal_output( code_tbl_m[i+8][3], filter)); - break; + if (w == 0xF860 || w == 0xF861 || w == 0xF862) { + /* Apple 'transcoding hint' codepoints (from private use area) */ + if (!len) { + if (end) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); + } else { + buf->state = w; } + MB_CONVERT_BUF_STORE(buf, out, limit); + return; } - CK(mbfl_filt_conv_illegal_output(c1, filter)); - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - return 0; -} + uint32_t w2 = *in++; + len--; -static int mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter) -{ - int i, c1, s1 = 0; - if (filter->status == 1 && filter->cache > 0) { - c1 = filter->cache; - for (i=0;i 0) { - CK((*filter->output_function)((s1 >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(s1 & 0xff, filter->data)); - } - } - filter->cache = 0; - filter->status = 0; + for (int i = 0; i < code_tbl_m_len; i++) { + if (w == code_tbl_m[i][1] && w2 == code_tbl_m[i][2]) { + /* This might be a valid transcoding hint sequence */ + int index = 3; - if (filter->flush_function != NULL) { - return (*filter->flush_function)(filter->data); - } - - return 0; -} - -static size_t mb_sjismac_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) -{ - /* A single SJIS-Mac kuten code can convert to up to 5 Unicode codepoints, oh my! */ - ZEND_ASSERT(bufsize >= 5); - - unsigned char *p = *in, *e = p + *in_len; - uint32_t *out = buf, *limit = buf + bufsize; - - while (p < e && out < limit) { - unsigned char c = *p++; - - if (c <= 0x80 || c == 0xA0) { - if (c == 0x5C) { - *out++ = 0xA5; - } else if (c == 0x80) { - *out++ = 0x5C; - } else { - *out++ = c; - } - } else if (c >= 0xA1 && c <= 0xDF) { - *out++ = 0xFEC0 + c; - } else if (c <= 0xED) { - if (p == e) { - *out++ = MBFL_BAD_INPUT; - break; - } - unsigned char c2 = *p++; - uint32_t w = sjis_decode_tbl1[c] + sjis_decode_tbl2[c2]; - - if (w <= 0x89) { - if (w == 0x1C) { - *out++ = 0x2014; /* EM DASH */ - continue; - } else if (w == 0x1F) { - *out++ = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */ - continue; - } else if (w == 0x20) { - *out++ = 0x301C; /* FULLWIDTH TILDE */ - continue; - } else if (w == 0x21) { - *out++ = 0x2016; /* PARALLEL TO */ - continue; - } else if (w == 0x3C) { - *out++ = 0x2212; /* FULLWIDTH HYPHEN-MINUS */ - continue; - } else if (w == 0x50) { - *out++ = 0xA2; /* FULLWIDTH CENT SIGN */ - continue; - } else if (w == 0x51) { - *out++ = 0xA3; /* FULLWIDTH POUND SIGN */ - continue; - } else if (w == 0x89) { - *out++ = 0xAC; /* FULLWIDTH NOT SIGN */ - continue; - } - } else { - if (w >= 0x2F0 && w <= 0x3A3) { - for (int i = 0; i < 7; i++) { - if (w >= code_tbl[i][0] && w <= code_tbl[i][1]) { - *out++ = w - code_tbl[i][0] + code_tbl[i][2]; - goto next_iteration; + if (buf->state) { +resume_transcoding_hint: + i = buf->state >> 24; + index = (buf->state >> 16) & 0xFF; + buf->state = 0; } - } - } - if (w >= 0x340 && w <= 0x523) { - for (int i = 0; i < code_tbl_m_len; i++) { - if (w == code_tbl_m[i][0]) { - int n = 5; - if (code_tbl_m[i][1] == 0xF860) { - n = 3; - } else if (code_tbl_m[i][1] == 0xF861) { - n = 4; - } - if ((limit - out) < n) { - p -= 2; - goto finished; - } - for (int j = 1; j <= n; j++) { - *out++ = code_tbl_m[i][j]; - } - goto next_iteration; - } - } - } + int expected = transcoding_hint_cp_width[w - 0xF860]; - if (w >= 0x3AC && w <= 0x20A5) { - for (int i = 0; i < 8; i++) { - if (w >= code_ofst_tbl[i][0] && w <= code_ofst_tbl[i][1]) { - uint32_t w2 = code_map[i][w - code_ofst_tbl[i][0]]; - if (!w2) { - *out++ = MBFL_BAD_INPUT; - goto next_iteration; - } - if ((limit - out) < 2) { - p -= 2; - goto finished; - } - *out++ = w2; - if (w >= 0x43E && w <= 0x441) { - *out++ = 0xF87A; - } else if (w == 0x3B1 || w == 0x3B7) { - *out++ = 0xF87F; - } else if (w == 0x4B8 || w == 0x4B9 || w == 0x4C4) { - *out++ = 0x20DD; - } else if (w == 0x1ED9 || w == 0x1EDA || w == 0x1EE8 || w == 0x1EF3 || (w >= 0x1EF5 && w <= 0x1EFB) || w == 0x1F05 || w == 0x1F06 || w == 0x1F18 || (w >= 0x1FF2 && w <= 0x20A5)) { - *out++ = 0xF87E; + while (index <= expected) { + if (!len) { + if (end) { + for (int j = 1; j < index; j++) { + MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac); + } + } else { + buf->state = (i << 24) | (index << 16) | (w & 0xFFFF); + } + MB_CONVERT_BUF_STORE(buf, out, limit); + return; } - goto next_iteration; - } - } - } - } - - if (w < jisx0208_ucs_table_size) { - w = jisx0208_ucs_table[w]; - if (!w) - w = MBFL_BAD_INPUT; - *out++ = w; - } else { - *out++ = MBFL_BAD_INPUT; - } - } else if (c == 0xFD) { - *out++ = 0xA9; - } else if (c == 0xFE) { - *out++ = 0x2122; - } else if (c == 0xFF) { - if ((limit - out) < 2) { - p--; - break; - } - *out++ = 0x2026; - *out++ = 0xF87F; - } else { - *out++ = MBFL_BAD_INPUT; - } -next_iteration: ; - } - -finished: - *in_len = e - p; - *in = p; - return out - buf; -} - -static bool process_s_form(uint32_t w, uint32_t w2, unsigned int *s) -{ - if (w2 == 0xF87A) { - for (int i = 0; i < 4; i++) { - if (w == s_form_tbl[i+34+3+3]) { - *s = s_form_sjis_tbl[i+34+3+3]; - return true; - } - } - } else if (w2 == 0x20DD) { - for (int i = 0; i < 3; i++) { - if (w == s_form_tbl[i+34+3]) { - *s = s_form_sjis_tbl[i+34+3]; - return true; - } - } - } else if (w2 == 0xF87F) { - for (int i = 0; i < 3; i++) { - if (w == s_form_tbl[i+34]) { - *s = s_form_sjis_tbl[i+34]; - return true; - } - } - } else if (w2 == 0xF87E) { - for (int i = 0; i < 34; i++) { - if (w == s_form_tbl[i]) { - *s = s_form_sjis_tbl[i]; - return true; - } - } - } - - return false; -} - -/* For codepoints F860-F862, which are treated specially in MacJapanese */ -static int transcoding_hint_cp_width[3] = { 3, 4, 5 }; - -static void mb_wchar_to_sjismac(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) -{ - unsigned char *out, *limit; - MB_CONVERT_BUF_LOAD(buf, out, limit); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len); - - uint32_t w; - - if (buf->state) { - w = buf->state & 0xFFFF; - if (buf->state & 0xFF000000L) { - goto resume_transcoding_hint; - } else { - buf->state = 0; - goto process_codepoint; - } - } - - while (len--) { - w = *in++; -process_codepoint: ; - unsigned int s = 0; - if (w >= ucs_a1_jis_table_min && w < ucs_a1_jis_table_max) { - if (w == 0x5C) { - s = 0x80; - } else if (w == 0xA9) { - s = 0xFD; - } else { - s = ucs_a1_jis_table[w - ucs_a1_jis_table_min]; - } - } else if (w >= ucs_a2_jis_table_min && w < ucs_a2_jis_table_max) { - if (w == 0x2122) { - s = 0xFE; - } else if (w == 0x2014) { - s = 0x213D; - } else if (w == 0x2116) { - s = 0x2C1D; - } else { - s = ucs_a2_jis_table[w - ucs_a2_jis_table_min]; - } - } else if (w >= ucs_i_jis_table_min && w < ucs_i_jis_table_max) { - s = ucs_i_jis_table[w - ucs_i_jis_table_min]; - } else if (w >= ucs_r_jis_table_min && w < ucs_r_jis_table_max) { - s = ucs_r_jis_table[w - ucs_r_jis_table_min]; - } - - if (w >= 0x2000) { - for (int i = 0; i < s_form_tbl_len; i++) { - if (w == s_form_tbl[i]) { - if (!len) { - if (end) { - s = s_form_sjis_fallback_tbl[i]; - if (s) { - MB_CONVERT_BUF_ENSURE(buf, out, limit, 2); - out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); - } else { - MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); - } - } else { - buf->state = w; - } - MB_CONVERT_BUF_STORE(buf, out, limit); - return; - } - uint32_t w2 = *in++; - len--; - - if (!process_s_form(w, w2, &s)) { - in--; len++; - - for (int i = 0; i < s_form_tbl_len; i++) { - if (w == s_form_tbl[i]) { - s = s_form_sjis_fallback_tbl[i]; - break; - } - } - } - - if (s <= 0xFF) { - out = mb_convert_buf_add(out, s); - } else { - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); - out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); - } - - goto next_iteration; - } - } - - if (w == 0xF860 || w == 0xF861 || w == 0xF862) { - /* Apple 'transcoding hint' codepoints (from private use area) */ - if (!len) { - if (end) { - MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); - } else { - buf->state = w; - } - MB_CONVERT_BUF_STORE(buf, out, limit); - return; - } - - uint32_t w2 = *in++; - len--; - - for (int i = 0; i < code_tbl_m_len; i++) { - if (w == code_tbl_m[i][1] && w2 == code_tbl_m[i][2]) { - /* This might be a valid transcoding hint sequence */ - int index = 3; - - if (buf->state) { -resume_transcoding_hint: - i = buf->state >> 24; - index = (buf->state >> 16) & 0xFF; - buf->state = 0; - } - - int expected = transcoding_hint_cp_width[w - 0xF860]; - - while (index <= expected) { - if (!len) { - if (end) { - for (int j = 1; j < index; j++) { - MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac); - } - } else { - buf->state = (i << 24) | (index << 16) | (w & 0xFFFF); - } - MB_CONVERT_BUF_STORE(buf, out, limit); - return; - } - - w2 = *in++; - len--; - - if (w2 != code_tbl_m[i][index]) { - /* Didn't match */ - for (int j = 1; j < index; j++) { - MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac); - } - MB_CONVERT_ERROR(buf, out, limit, w2, mb_wchar_to_sjismac); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len); - goto next_iteration; - } - - index++; - } - - /* Successful match, emit SJIS-mac bytes */ - s = code_tbl_m[i][0]; - unsigned int c1 = (s / 94) + 0x21, c2 = (s % 94) + 0x21, s1, s2; - SJIS_ENCODE(c1, c2, s1, s2); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); - out = mb_convert_buf_add2(out, s1, s2); - goto next_iteration; - } - } - - /* No valid transcoding hint sequence found */ - in--; len++; - MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len); - continue; - } - } - - if (!s) { - if (w == 0xA0) { - s = 0xA0; - } else if (w == 0xA5) { /* YEN SIGN */ - /* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign; - * convert codepoint 0xA5 to halfwidth Yen sign */ - s = 0x5C; /* HALFWIDTH YEN SIGN */ - } else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ - s = 0x2140; - } else { - for (int i = 0; i < wchar2sjis_mac_r_tbl_len; i++) { - if (w >= wchar2sjis_mac_r_tbl[i][0] && w <= wchar2sjis_mac_r_tbl[i][1]) { - s = w - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2]; - s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); - goto found_kuten_code; - } - } - - for (int i = 0; i < wchar2sjis_mac_r_map_len; i++) { - if (w >= wchar2sjis_mac_r_map[i][0] && w <= wchar2sjis_mac_r_map[i][1]) { - s = wchar2sjis_mac_code_map[i][w - wchar2sjis_mac_r_map[i][0]]; - if (s) { - s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); - goto found_kuten_code; - } - } - } - - for (int i = 0; i < wchar2sjis_mac_wchar_tbl_len; i++) { - if (w == wchar2sjis_mac_wchar_tbl[i][0]) { - s = wchar2sjis_mac_wchar_tbl[i][1]; - s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); - goto found_kuten_code; - } - } - } - } - -found_kuten_code: - if ((!s && w) || s >= 0x8080) { - MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len); - } else if (s <= 0xFF) { - out = mb_convert_buf_add(out, s); - } else { - unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2; - SJIS_ENCODE(c1, c2, s1, s2); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); - out = mb_convert_buf_add2(out, s1, s2); - } - -next_iteration: ; - } - - MB_CONVERT_BUF_STORE(buf, out, limit); -} - -int mbfilter_sjis_emoji_docomo2unicode(int s, int *snd) -{ - /* All three mobile vendors had emoji for numbers on a telephone keypad - * Unicode doesn't have those, but it has a combining character which puts - * a 'keypad button' around the following character, making it look like - * a key on a telephone or keyboard. That combining char is codepoint 0x20E3. */ - if (s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) { - if ((s >= DOCOMO_KEYPAD(1) && s <= DOCOMO_KEYPAD(9)) || s == DOCOMO_KEYPAD(0) || s == DOCOMO_KEYPAD_HASH) { - EMIT_KEYPAD_EMOJI(convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min])); - } else { - *snd = 0; - return convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min]); - } - } - return 0; -} - -int mbfilter_sjis_emoji_sb2unicode(int s, int *snd) -{ - if (s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb1_max) { - if (s == 0x2817 || (s >= 0x2823 && s <= 0x282C)) { - EMIT_KEYPAD_EMOJI(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]); - } else { - *snd = 0; - return convert_emoji_cp(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]); - } - } else if (s >= mb_tbl_code2uni_sb2_min && s <= mb_tbl_code2uni_sb2_max) { - *snd = 0; - return convert_emoji_cp(mb_tbl_code2uni_sb2[s - mb_tbl_code2uni_sb2_min]); - } else if (s >= mb_tbl_code2uni_sb3_min && s <= mb_tbl_code2uni_sb3_max) { - if (s >= 0x2B02 && s <= 0x2B0B) { - EMIT_FLAG_EMOJI(nflags_sb[s - 0x2B02]); - } else { - *snd = 0; - return convert_emoji_cp(mb_tbl_code2uni_sb3[s - mb_tbl_code2uni_sb3_min]); - } - } - return 0; -} - -int mbfilter_unicode2sjis_emoji_docomo(int c, int *s1, mbfl_convert_filter *filter) -{ - /* When converting SJIS-Mobile to Unicode, we convert keypad symbol emoji - * to a sequence of 2 codepoints, one of which is a combining character which - * adds the 'key' image around the other - * - * In the other direction, look for such sequences and convert them to a - * single emoji */ - if (filter->status == 1) { - int c1 = filter->cache; - filter->cache = filter->status = 0; - if (c == 0x20E3) { - if (c1 == '#') { - *s1 = 0x2964; - } else if (c1 == '0') { - *s1 = 0x296F; - } else { /* Previous character was '1'-'9' */ - *s1 = 0x2966 + (c1 - '1'); - } - return 1; - } else { - /* This character wasn't combining character to make keypad symbol, - * so pass the previous character through... and proceed to process the - * current character as usual - * (Single-byte ASCII characters are valid in Shift-JIS...) */ - CK((*filter->output_function)(c1, filter->data)); - } - } - - if (c == '#' || (c >= '0' && c <= '9')) { - filter->status = 1; - filter->cache = c; - return 0; - } - - if (c == 0xA9) { /* Copyright sign */ - *s1 = 0x29B5; - return 1; - } else if (c == 0x00AE) { /* Registered sign */ - *s1 = 0x29BA; - return 1; - } else if (c >= mb_tbl_uni_docomo2code2_min && c <= mb_tbl_uni_docomo2code2_max) { - int i = mbfl_bisec_srch2(c, mb_tbl_uni_docomo2code2_key, mb_tbl_uni_docomo2code2_len); - if (i >= 0) { - *s1 = mb_tbl_uni_docomo2code2_value[i]; - return 1; - } - } else if (c >= mb_tbl_uni_docomo2code3_min && c <= mb_tbl_uni_docomo2code3_max) { - int i = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_docomo2code3_key, mb_tbl_uni_docomo2code3_len); - if (i >= 0) { - *s1 = mb_tbl_uni_docomo2code3_value[i]; - return 1; - } - } else if (c >= mb_tbl_uni_docomo2code5_min && c <= mb_tbl_uni_docomo2code5_max) { - int i = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_docomo2code5_key, mb_tbl_uni_docomo2code5_len); - if (i >= 0) { - *s1 = mb_tbl_uni_docomo2code5_val[i]; - return 1; - } - } - return 0; -} - -int mbfilter_unicode2sjis_emoji_kddi_sjis(int c, int *s1, mbfl_convert_filter *filter) -{ - if (filter->status == 1) { - int c1 = filter->cache; - filter->cache = filter->status = 0; - if (c == 0x20E3) { - if (c1 == '#') { - *s1 = 0x25BC; - } else if (c1 == '0') { - *s1 = 0x2830; - } else { /* Previous character was '1'-'9' */ - *s1 = 0x27a6 + (c1 - '1'); - } - return 1; - } else { - CK((*filter->output_function)(c1, filter->data)); - } - } else if (filter->status == 2) { - int c1 = filter->cache; - filter->cache = filter->status = 0; - if (c >= NFLAGS('B') && c <= NFLAGS('U')) { /* B for GB, U for RU */ - for (int i = 0; i < 10; i++) { - if (c1 == NFLAGS(nflags_s[i][0]) && c == NFLAGS(nflags_s[i][1])) { - *s1 = nflags_code_kddi[i]; - return 1; - } - } - } - - /* If none of the KDDI national flag emoji matched, then we have no way - * to convert the previous codepoint... */ - mbfl_filt_conv_illegal_output(c1, filter); - } - - if (c == '#' || (c >= '0' && c <= '9')) { - filter->status = 1; - filter->cache = c; - return 0; - } else if (c >= NFLAGS('C') && c <= NFLAGS('U')) { /* C for CN, U for US */ - filter->status = 2; - filter->cache = c; - return 0; - } - - if (c == 0xA9) { /* Copyright sign */ - *s1 = 0x27DC; - return 1; - } else if (c == 0xAE) { /* Registered sign */ - *s1 = 0x27DD; - return 1; - } else if (c >= mb_tbl_uni_kddi2code2_min && c <= mb_tbl_uni_kddi2code2_max) { - int i = mbfl_bisec_srch2(c, mb_tbl_uni_kddi2code2_key, mb_tbl_uni_kddi2code2_len); - if (i >= 0) { - *s1 = mb_tbl_uni_kddi2code2_value[i]; - return 1; - } - } else if (c >= mb_tbl_uni_kddi2code3_min && c <= mb_tbl_uni_kddi2code3_max) { - int i = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_kddi2code3_key, mb_tbl_uni_kddi2code3_len); - if (i >= 0) { - *s1 = mb_tbl_uni_kddi2code3_value[i]; - return 1; - } - } else if (c >= mb_tbl_uni_kddi2code5_min && c <= mb_tbl_uni_kddi2code5_max) { - int i = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_kddi2code5_key, mb_tbl_uni_kddi2code5_len); - if (i >= 0) { - *s1 = mb_tbl_uni_kddi2code5_val[i]; - return 1; - } - } - return 0; -} - -int mbfilter_unicode2sjis_emoji_sb(int c, int *s1, mbfl_convert_filter *filter) -{ - if (filter->status == 1) { - int c1 = filter->cache; - filter->cache = filter->status = 0; - if (c == 0x20E3) { - if (c1 == '#') { - *s1 = 0x2817; - } else if (c1 == '0') { - *s1 = 0x282c; - } else { /* Previous character was '1'-'9' */ - *s1 = 0x2823 + (c1 - '1'); - } - return 1; - } else { - (*filter->output_function)(c1, filter->data); - } - } else if (filter->status == 2) { - int c1 = filter->cache; - filter->cache = filter->status = 0; - if (c >= NFLAGS('B') && c <= NFLAGS('U')) { /* B for GB, U for RU */ - for (int i = 0; i < 10; i++) { - if (c1 == NFLAGS(nflags_s[i][0]) && c == NFLAGS(nflags_s[i][1])) { - *s1 = nflags_code_sb[i]; - return 1; - } - } - } - - /* If none of the SoftBank national flag emoji matched, then we have no way - * to convert the previous codepoint... */ - mbfl_filt_conv_illegal_output(c1, filter); - } - - if (c == '#' || (c >= '0' && c <= '9')) { - filter->status = 1; - filter->cache = c; - return 0; - } else if (c >= NFLAGS('C') && c <= NFLAGS('U')) { /* C for CN, U for US */ - filter->status = 2; - filter->cache = c; - return 0; - } - - if (c == 0xA9) { /* Copyright sign */ - *s1 = 0x2855; - return 1; - } else if (c == 0xAE) { /* Registered sign */ - *s1 = 0x2856; - return 1; - } else if (c >= mb_tbl_uni_sb2code2_min && c <= mb_tbl_uni_sb2code2_max) { - int i = mbfl_bisec_srch2(c, mb_tbl_uni_sb2code2_key, mb_tbl_uni_sb2code2_len); - if (i >= 0) { - *s1 = mb_tbl_uni_sb2code2_value[i]; - return 1; - } - } else if (c >= mb_tbl_uni_sb2code3_min && c <= mb_tbl_uni_sb2code3_max) { - int i = mbfl_bisec_srch2(c - 0x10000, mb_tbl_uni_sb2code3_key, mb_tbl_uni_sb2code3_len); - if (i >= 0) { - *s1 = mb_tbl_uni_sb2code3_value[i]; - return 1; - } - } else if (c >= mb_tbl_uni_sb2code5_min && c <= mb_tbl_uni_sb2code5_max) { - int i = mbfl_bisec_srch2(c - 0xF0000, mb_tbl_uni_sb2code5_key, mb_tbl_uni_sb2code5_len); - if (i >= 0) { - *s1 = mb_tbl_uni_sb2code5_val[i]; - return 1; - } - } - return 0; -} - -static int mbfl_filt_conv_sjis_mobile_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, s, s1, s2, w, snd = 0; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* ASCII */ - if (filter->from == &mbfl_encoding_sjis_sb && c == 0x1B) { - /* ESC; escape sequences were used on older SoftBank phones for emoji */ - filter->cache = c; - filter->status = 2; - } else { - CK((*filter->output_function)(c, filter->data)); - } - } else if (c > 0xA0 && c < 0xE0) { /* Kana */ - CK((*filter->output_function)(0xFEC0 + c, filter->data)); - } else if (c > 0x80 && c < 0xFD && c != 0xA0) { /* Kanji, first byte */ - filter->status = 1; - filter->cache = c; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* Kanji, second byte */ - filter->status = 0; - c1 = filter->cache; - if (c >= 0x40 && c <= 0xFC && c != 0x7F) { - w = 0; - SJIS_DECODE(c1, c, s1, s2); - s = ((s1 - 0x21) * 94) + s2 - 0x21; - if (s <= 137) { - if (s == 31) { - w = 0xFF3C; /* FULLWIDTH REVERSE SOLIDUS */ - } else if (s == 32) { - w = 0xFF5E; /* FULLWIDTH TILDE */ - } else if (s == 33) { - w = 0x2225; /* PARALLEL TO */ - } else if (s == 60) { - w = 0xFF0D; /* FULLWIDTH HYPHEN-MINUS */ - } else if (s == 80) { - w = 0xFFE0; /* FULLWIDTH CENT SIGN */ - } else if (s == 81) { - w = 0xFFE1; /* FULLWIDTH POUND SIGN */ - } else if (s == 137) { - w = 0xFFE2; /* FULLWIDTH NOT SIGN */ - } - } - if (w == 0) { - if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */ - w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min]; - } else if (s >= 0 && s < jisx0208_ucs_table_size) { /* X 0208 */ - w = jisx0208_ucs_table[s]; - } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { /* vendor ext2 (89ku - 92ku) */ - w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min]; - } - - /* Emoji */ - if (filter->from == &mbfl_encoding_sjis_docomo && s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) { - w = mbfilter_sjis_emoji_docomo2unicode(s, &snd); - if (snd > 0) { - CK((*filter->output_function)(snd, filter->data)); - } - } else if (filter->from == &mbfl_encoding_sjis_kddi && s >= mb_tbl_code2uni_kddi1_min && s <= mb_tbl_code2uni_kddi2_max) { - w = mbfilter_sjis_emoji_kddi2unicode(s, &snd); - if (snd > 0) { - CK((*filter->output_function)(snd, filter->data)); - } - } else if (filter->from == &mbfl_encoding_sjis_sb && s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb3_max) { - w = mbfilter_sjis_emoji_sb2unicode(s, &snd); - if (snd > 0) { - CK((*filter->output_function)(snd, filter->data)); - } - } - - if (w == 0) { - if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) { /* vendor ext3 (115ku - 119ku) */ - w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min]; - } else if (s >= (94*94) && s < (114*94)) { /* user (95ku - 114ku) */ - w = s - (94*94) + 0xe000; - } - } - } - if (w <= 0) { - w = MBFL_BAD_INPUT; - } - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - /* ESC: Softbank Emoji */ - case 2: - if (c == '$') { - filter->cache = c; - filter->status++; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - filter->status = filter->cache = 0; - } - break; - - /* ESC $: Softbank Emoji */ - case 3: - if ((c >= 'E' && c <= 'G') || (c >= 'O' && c <= 'Q')) { - filter->cache = c; - filter->status++; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - filter->status = filter->cache = 0; - } - break; - - /* ESC $ [GEFOPQ]: Softbank Emoji */ - case 4: - c1 = filter->cache; - if (c == 0xF) { /* Terminate sequence of emoji */ - filter->status = filter->cache = 0; - return 0; - } else { - if (c1 == 'G' && c >= 0x21 && c <= 0x7a) { - s1 = (0x91 - 0x21) * 94; - } else if (c1 == 'E' && c >= 0x21 && c <= 0x7A) { - s1 = (0x8D - 0x21) * 94; - } else if (c1 == 'F' && c >= 0x21 && c <= 0x7A) { - s1 = (0x8E - 0x21) * 94; - } else if (c1 == 'O' && c >= 0x21 && c <= 0x6D) { - s1 = (0x92 - 0x21) * 94; - } else if (c1 == 'P' && c >= 0x21 && c <= 0x6C) { - s1 = (0x95 - 0x21) * 94; - } else if (c1 == 'Q' && c >= 0x21 && c <= 0x5E) { - s1 = (0x96 - 0x21) * 94; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - filter->status = filter->cache = 0; - return 0; - } - - w = mbfilter_sjis_emoji_sb2unicode(s1 + c - 0x21, &snd); - if (w > 0) { - if (snd > 0) { - CK((*filter->output_function)(snd, filter->data)); - } - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - filter->status = filter->cache = 0; - } - } - } - - return 0; -} - -static int mbfl_filt_conv_wchar_sjis_mobile(int c, mbfl_convert_filter *filter) -{ - int c1, c2, s1 = 0, s2 = 0; - - if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { - s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; - } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { - s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min]; - } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) { - s1 = ucs_i_jis_table[c - ucs_i_jis_table_min]; - } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) { - s1 = ucs_r_jis_table[c - ucs_r_jis_table_min]; - } else if (c >= 0xE000 && c < (0xE000 + 20*94)) { - /* Private User Area (95ku - 114ku) */ - s1 = c - 0xE000; - c1 = (s1 / 94) + 0x7F; - c2 = (s1 % 94) + 0x21; - s1 = (c1 << 8) | c2; - s2 = 1; - } - - if (s1 <= 0) { - if (c == 0xA5) { /* YEN SIGN */ - s1 = 0x216F; /* FULLWIDTH YEN SIGN */ - } else if (c == 0xFF3c) { /* FULLWIDTH REVERSE SOLIDUS */ - s1 = 0x2140; - } else if (c == 0x2225) { /* PARALLEL TO */ - s1 = 0x2142; - } else if (c == 0xFF0D) { /* FULLWIDTH HYPHEN-MINUS */ - s1 = 0x215D; - } else if (c == 0xFFE0) { /* FULLWIDTH CENT SIGN */ - s1 = 0x2171; - } else if (c == 0xFFE1) { /* FULLWIDTH POUND SIGN */ - s1 = 0x2172; - } else if (c == 0xFFE2) { /* FULLWIDTH NOT SIGN */ - s1 = 0x224C; - } - } + w2 = *in++; + len--; - if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) { /* not found or X 0212 */ - s1 = -1; + if (w2 != code_tbl_m[i][index]) { + /* Didn't match */ + for (int j = 1; j < index; j++) { + MB_CONVERT_ERROR(buf, out, limit, code_tbl_m[i][j], mb_wchar_to_sjismac); + } + MB_CONVERT_ERROR(buf, out, limit, w2, mb_wchar_to_sjismac); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + goto next_iteration; + } - /* CP932 vendor ext1 (13ku) */ - for (c1 = 0; c1 < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; c1++) { - if (c == cp932ext1_ucs_table[c1]) { - s1 = (((c1 / 94) + 0x2D) << 8) + (c1 % 94) + 0x21; - break; + index++; + } + + /* Successful match, emit SJIS-mac bytes */ + s = code_tbl_m[i][0]; + unsigned int c1 = (s / 94) + 0x21, c2 = (s % 94) + 0x21, s1, s2; + SJIS_ENCODE(c1, c2, s1, s2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, s1, s2); + goto next_iteration; + } + } + + /* No valid transcoding hint sequence found */ + in--; len++; + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + continue; } } - if (s1 <= 0) { - /* CP932 vendor ext2 (115ku - 119ku) */ - for (c1 = 0; c1 < cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; c1++) { - if (c == cp932ext2_ucs_table[c1]) { - s1 = (((c1 / 94) + 0x79) << 8) + (c1 % 94) + 0x21; - break; + if (!s) { + if (w == 0xA0) { + s = 0xA0; + } else if (w == 0xA5) { /* YEN SIGN */ + /* Unicode has codepoint 0xFFE5 for a fullwidth Yen sign; + * convert codepoint 0xA5 to halfwidth Yen sign */ + s = 0x5C; /* HALFWIDTH YEN SIGN */ + } else if (w == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ + s = 0x2140; + } else { + for (int i = 0; i < wchar2sjis_mac_r_tbl_len; i++) { + if (w >= wchar2sjis_mac_r_tbl[i][0] && w <= wchar2sjis_mac_r_tbl[i][1]) { + s = w - wchar2sjis_mac_r_tbl[i][0] + wchar2sjis_mac_r_tbl[i][2]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + goto found_kuten_code; + } + } + + for (int i = 0; i < wchar2sjis_mac_r_map_len; i++) { + if (w >= wchar2sjis_mac_r_map[i][0] && w <= wchar2sjis_mac_r_map[i][1]) { + s = wchar2sjis_mac_code_map[i][w - wchar2sjis_mac_r_map[i][0]]; + if (s) { + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + goto found_kuten_code; + } + } + } + + for (int i = 0; i < wchar2sjis_mac_wchar_tbl_len; i++) { + if (w == wchar2sjis_mac_wchar_tbl[i][0]) { + s = wchar2sjis_mac_wchar_tbl[i][1]; + s = (((s / 94) + 0x21) << 8) | ((s % 94) + 0x21); + goto found_kuten_code; + } } } } - if (c == 0) { - s1 = 0; +found_kuten_code: + if ((!s && w) || s >= 0x8080) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_sjismac); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0xFF) { + out = mb_convert_buf_add(out, s); + } else { + unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2; + SJIS_ENCODE(c1, c2, s1, s2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, s1, s2); } - } - if ((filter->to == &mbfl_encoding_sjis_docomo && mbfilter_unicode2sjis_emoji_docomo(c, &s1, filter)) || - (filter->to == &mbfl_encoding_sjis_kddi && mbfilter_unicode2sjis_emoji_kddi_sjis(c, &s1, filter)) || - (filter->to == &mbfl_encoding_sjis_sb && mbfilter_unicode2sjis_emoji_sb(c, &s1, filter))) { - s1 = (((s1 / 94) + 0x21) << 8) | ((s1 % 94) + 0x21); +next_iteration: ; } - if (filter->status) { - return 0; - } + MB_CONVERT_BUF_STORE(buf, out, limit); +} - if (s1 >= 0) { - if (s1 < 0x100) { /* Latin/Kana */ - CK((*filter->output_function)(s1, filter->data)); - } else { /* Kanji */ - c1 = (s1 >> 8) & 0xff; - c2 = s1 & 0xff; - SJIS_ENCODE(c1, c2, s1, s2); - CK((*filter->output_function)(s1, filter->data)); - CK((*filter->output_function)(s2, filter->data)); +int mbfilter_sjis_emoji_docomo2unicode(int s, int *snd) +{ + /* All three mobile vendors had emoji for numbers on a telephone keypad + * Unicode doesn't have those, but it has a combining character which puts + * a 'keypad button' around the following character, making it look like + * a key on a telephone or keyboard. That combining char is codepoint 0x20E3. */ + if (s >= mb_tbl_code2uni_docomo1_min && s <= mb_tbl_code2uni_docomo1_max) { + if ((s >= DOCOMO_KEYPAD(1) && s <= DOCOMO_KEYPAD(9)) || s == DOCOMO_KEYPAD(0) || s == DOCOMO_KEYPAD_HASH) { + EMIT_KEYPAD_EMOJI(convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min])); + } else { + *snd = 0; + return convert_emoji_cp(mb_tbl_code2uni_docomo1[s - mb_tbl_code2uni_docomo1_min]); } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); } - return 0; } -int mbfl_filt_conv_sjis_mobile_flush(mbfl_convert_filter *filter) +int mbfilter_sjis_emoji_sb2unicode(int s, int *snd) { - int c1 = filter->cache; - if (filter->status == 1 && (c1 == '#' || (c1 >= '0' && c1 <= '9'))) { - filter->cache = filter->status = 0; - CK((*filter->output_function)(c1, filter->data)); - } else if (filter->status == 2) { - /* First of a pair of Regional Indicator codepoints came at the end of a string */ - filter->cache = filter->status = 0; - mbfl_filt_conv_illegal_output(c1, filter); - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); + if (s >= mb_tbl_code2uni_sb1_min && s <= mb_tbl_code2uni_sb1_max) { + if (s == 0x2817 || (s >= 0x2823 && s <= 0x282C)) { + EMIT_KEYPAD_EMOJI(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]); + } else { + *snd = 0; + return convert_emoji_cp(mb_tbl_code2uni_sb1[s - mb_tbl_code2uni_sb1_min]); + } + } else if (s >= mb_tbl_code2uni_sb2_min && s <= mb_tbl_code2uni_sb2_max) { + *snd = 0; + return convert_emoji_cp(mb_tbl_code2uni_sb2[s - mb_tbl_code2uni_sb2_min]); + } else if (s >= mb_tbl_code2uni_sb3_min && s <= mb_tbl_code2uni_sb3_max) { + if (s >= 0x2B02 && s <= 0x2B0B) { + EMIT_FLAG_EMOJI(nflags_sb[s - 0x2B02]); + } else { + *snd = 0; + return convert_emoji_cp(mb_tbl_code2uni_sb3[s - mb_tbl_code2uni_sb3_min]); + } } - return 0; } @@ -7345,198 +6232,13 @@ process_codepoint: ; out = mb_convert_buf_add(out, s); } else { unsigned int c1 = (s >> 8) & 0xFF, c2 = s & 0xFF, s1, s2; - SJIS_ENCODE(c1, c2, s1, s2); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); - out = mb_convert_buf_add2(out, s1, s2); - } - } - - MB_CONVERT_BUF_STORE(buf, out, limit); -} - -static int mbfl_filt_conv_cp932_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, s, s1, s2, w; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (c > 0xa0 && c < 0xe0) { /* kana */ - CK((*filter->output_function)(0xfec0 + c, filter->data)); - } else if (c > 0x80 && c < 0xfd && c != 0xa0) { /* kanji first char */ - filter->status = 1; - filter->cache = c; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* kanji second char */ - filter->status = 0; - c1 = filter->cache; - if (c >= 0x40 && c <= 0xfc && c != 0x7f) { - w = 0; - SJIS_DECODE(c1, c, s1, s2); - s = (s1 - 0x21)*94 + s2 - 0x21; - if (s <= 137) { - if (s == 31) { - w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */ - } else if (s == 32) { - w = 0xff5e; /* FULLWIDTH TILDE */ - } else if (s == 33) { - w = 0x2225; /* PARALLEL TO */ - } else if (s == 60) { - w = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */ - } else if (s == 80) { - w = 0xffe0; /* FULLWIDTH CENT SIGN */ - } else if (s == 81) { - w = 0xffe1; /* FULLWIDTH POUND SIGN */ - } else if (s == 137) { - w = 0xffe2; /* FULLWIDTH NOT SIGN */ - } - } - if (w == 0) { - if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */ - w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min]; - } else if (s >= 0 && s < jisx0208_ucs_table_size) { /* X 0208 */ - w = jisx0208_ucs_table[s]; - } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { /* vendor ext2 (89ku - 92ku) */ - w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min]; - } else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) { /* vendor ext3 (115ku - 119ku) */ - w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min]; - } else if (s >= (94*94) && s < (114*94)) { /* user (95ku - 114ku) */ - w = s - (94*94) + 0xe000; - } - } - - if (w <= 0) { - w = MBFL_BAD_INPUT; - } - - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_cp932_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - (*filter->output_function)(MBFL_BAD_INPUT, filter->data); - filter->status = 0; - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter) -{ - int c1, c2, s1, s2; - - s1 = 0; - s2 = 0; - if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { - s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; - } else if (c == 0x203E) { - s1 = 0x7E; - } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { - s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min]; - } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) { - s1 = ucs_i_jis_table[c - ucs_i_jis_table_min]; - } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) { - s1 = ucs_r_jis_table[c - ucs_r_jis_table_min]; - } else if (c >= 0xe000 && c < (0xe000 + 20*94)) { /* user (95ku - 114ku) */ - s1 = c - 0xe000; - c1 = s1/94 + 0x7f; - c2 = s1%94 + 0x21; - s1 = (c1 << 8) | c2; - s2 = 1; - } - if (s1 <= 0) { - if (c == 0xa5) { /* YEN SIGN */ - s1 = 0x5C; - } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ - s1 = 0x2140; - } else if (c == 0x2225) { /* PARALLEL TO */ - s1 = 0x2142; - } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */ - s1 = 0x215d; - } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */ - s1 = 0x2171; - } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */ - s1 = 0x2172; - } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */ - s1 = 0x224c; - } - } - if ((s1 <= 0) || (s1 >= 0x8080 && s2 == 0)) { /* not found or X 0212 */ - s1 = -1; - c1 = 0; - c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; - while (c1 < c2) { /* CP932 vendor ext1 (13ku) */ - if (c == cp932ext1_ucs_table[c1]) { - s1 = ((c1/94 + 0x2d) << 8) + (c1%94 + 0x21); - break; - } - c1++; - } - if (s1 <= 0) { - c1 = 0; - c2 = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; - while (c1 < c2) { /* CP932 vendor ext3 (115ku - 119ku) */ - if (c == cp932ext3_ucs_table[c1]) { - s1 = ((c1/94 + 0x93) << 8) + (c1%94 + 0x21); - break; - } - c1++; - } - } - if (c == 0) { - s1 = 0; - } else if (s1 <= 0) { - s1 = -1; - } - } - if (s1 >= 0) { - if (s1 < 0x100) { /* latin or kana */ - CK((*filter->output_function)(s1, filter->data)); - } else { /* kanji */ - c1 = (s1 >> 8) & 0xff; - c2 = s1 & 0xff; - SJIS_ENCODE(c1, c2, s1, s2); - CK((*filter->output_function)(s1, filter->data)); - CK((*filter->output_function)(s2, filter->data)); + SJIS_ENCODE(c1, c2, s1, s2); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, s1, s2); } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); } - return 0; -} - -static int mbfl_filt_conv_wchar_sjiswin(int c, mbfl_convert_filter *filter) -{ - if (c == 0xA5) { - CK((*filter->output_function)(0x81, filter->data)); - CK((*filter->output_function)(0x8F, filter->data)); - } else if (c == 0x203E) { - CK((*filter->output_function)(0x81, filter->data)); - CK((*filter->output_function)(0x50, filter->data)); - } else { - return mbfl_filt_conv_wchar_cp932(c, filter); - } - return 0; + MB_CONVERT_BUF_STORE(buf, out, limit); } static size_t mb_cp932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) @@ -7823,26 +6525,6 @@ static const unsigned char mblen_table_sjis_mobile[] = { /* 0x81-0x9F,0xE0-0xFC static const char *mbfl_encoding_sjis_aliases[] = {"x-sjis", "SHIFT-JIS", NULL}; -static const struct mbfl_convert_vtbl vtbl_sjis_wchar = { - mbfl_no_encoding_sjis, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_sjis_wchar, - mbfl_filt_conv_sjis_wchar_flush, - NULL -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_sjis = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_sjis, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_sjis, - mbfl_filt_conv_common_flush, - NULL -}; - const mbfl_encoding mbfl_encoding_sjis = { mbfl_no_encoding_sjis, "SJIS", @@ -7850,8 +6532,8 @@ const mbfl_encoding mbfl_encoding_sjis = { mbfl_encoding_sjis_aliases, mblen_table_sjis, MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_sjis_wchar, - &vtbl_wchar_sjis, + NULL, + NULL, mb_sjis_to_wchar, mb_wchar_to_sjis, NULL, @@ -7860,26 +6542,6 @@ const mbfl_encoding mbfl_encoding_sjis = { static const char *mbfl_encoding_sjis_mac_aliases[] = {"MacJapanese", "x-Mac-Japanese", NULL}; -static const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = { - mbfl_no_encoding_sjis_mac, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_sjis_mac_wchar, - mbfl_filt_conv_sjis_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_sjis_mac = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_sjis_mac, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_sjis_mac, - mbfl_filt_conv_wchar_sjis_mac_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_sjis_mac = { mbfl_no_encoding_sjis_mac, "SJIS-mac", @@ -7887,8 +6549,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = { mbfl_encoding_sjis_mac_aliases, mblen_table_sjismac, MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_sjis_mac_wchar, - &vtbl_wchar_sjis_mac, + NULL, + NULL, mb_sjismac_to_wchar, mb_wchar_to_sjismac, NULL, @@ -7899,26 +6561,6 @@ static const char *mbfl_encoding_sjis_docomo_aliases[] = {"SJIS-DOCOMO", "shift_ static const char *mbfl_encoding_sjis_kddi_aliases[] = {"SJIS-KDDI", "shift_jis-kddi", "x-sjis-emoji-kddi", NULL}; static const char *mbfl_encoding_sjis_sb_aliases[] = {"SJIS-SOFTBANK", "shift_jis-softbank", "x-sjis-emoji-softbank", NULL}; -static const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = { - mbfl_no_encoding_sjis_docomo, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_sjis_mobile_wchar, - mbfl_filt_conv_sjis_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_sjis_docomo = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_sjis_docomo, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_sjis_mobile, - mbfl_filt_conv_sjis_mobile_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_sjis_docomo = { mbfl_no_encoding_sjis_docomo, "SJIS-Mobile#DOCOMO", @@ -7926,31 +6568,11 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = { mbfl_encoding_sjis_docomo_aliases, mblen_table_sjis_mobile, MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_sjis_docomo_wchar, - &vtbl_wchar_sjis_docomo, - mb_sjis_docomo_to_wchar, - mb_wchar_to_sjis_docomo, - NULL, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_sjis_kddi_wchar = { - mbfl_no_encoding_sjis_kddi, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, NULL, - mbfl_filt_conv_sjis_mobile_wchar, - mbfl_filt_conv_sjis_wchar_flush, NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_sjis_kddi = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_sjis_kddi, - mbfl_filt_conv_common_ctor, + mb_sjis_docomo_to_wchar, + mb_wchar_to_sjis_docomo, NULL, - mbfl_filt_conv_wchar_sjis_mobile, - mbfl_filt_conv_sjis_mobile_flush, NULL, }; @@ -7961,31 +6583,11 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = { mbfl_encoding_sjis_kddi_aliases, mblen_table_sjis_mobile, MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_sjis_kddi_wchar, - &vtbl_wchar_sjis_kddi, - mb_sjis_kddi_to_wchar, - mb_wchar_to_sjis_kddi, - NULL, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_sjis_sb_wchar = { - mbfl_no_encoding_sjis_sb, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, NULL, - mbfl_filt_conv_sjis_mobile_wchar, - mbfl_filt_conv_sjis_wchar_flush, NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_sjis_sb = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_sjis_sb, - mbfl_filt_conv_common_ctor, + mb_sjis_kddi_to_wchar, + mb_wchar_to_sjis_kddi, NULL, - mbfl_filt_conv_wchar_sjis_mobile, - mbfl_filt_conv_sjis_mobile_flush, NULL, }; @@ -7996,8 +6598,8 @@ const mbfl_encoding mbfl_encoding_sjis_sb = { mbfl_encoding_sjis_sb_aliases, mblen_table_sjis_mobile, MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_sjis_sb_wchar, - &vtbl_wchar_sjis_sb, + NULL, + NULL, mb_sjis_sb_to_wchar, mb_wchar_to_sjis_sb, NULL, @@ -8013,26 +6615,6 @@ const mbfl_encoding mbfl_encoding_sjis_sb = { static const char *mbfl_encoding_sjis2004_aliases[] = {"SJIS2004","Shift_JIS-2004", NULL}; -static const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = { - mbfl_no_encoding_sjis2004, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_jis2004_wchar, - mbfl_filt_conv_jis2004_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_sjis2004 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_sjis2004, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_jis2004, - mbfl_filt_conv_wchar_jis2004_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_sjis2004 = { mbfl_no_encoding_sjis2004, "SJIS-2004", @@ -8040,8 +6622,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = { mbfl_encoding_sjis2004_aliases, mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */ MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_sjis2004_wchar, - &vtbl_wchar_sjis2004, + NULL, + NULL, mb_sjis2004_to_wchar, mb_wchar_to_sjis2004, NULL, @@ -8075,252 +6657,64 @@ const mbfl_encoding mbfl_encoding_sjis2004 = { * our mappings for "CP932". * • When converting Shift-JIS to CP932, the conversion goes through Unicode. * Shift-JIS 0x7E converts to U+203E, so mapping U+203E to 0x7E means that - * 0x7E will go to 0x7E when converting Shift-JIS to CP932. - */ - -static const unsigned char mblen_table_sjiswin[] = { /* 0x81-0x9F,0xE0-0xFF */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 -}; - -static const char *mbfl_encoding_cp932_aliases[] = {"MS932", "Windows-31J", "MS_Kanji", NULL}; -static const char *mbfl_encoding_sjiswin_aliases[] = {"SJIS-ms", "SJIS-open", NULL}; - -static const struct mbfl_convert_vtbl vtbl_cp932_wchar = { - mbfl_no_encoding_cp932, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_cp932_wchar, - mbfl_filt_conv_cp932_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_cp932 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_cp932, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_cp932, - mbfl_filt_conv_common_flush, - NULL, -}; - -const mbfl_encoding mbfl_encoding_cp932 = { - mbfl_no_encoding_cp932, - "CP932", - "Shift_JIS", - mbfl_encoding_cp932_aliases, - mblen_table_sjiswin, - MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_cp932_wchar, - &vtbl_wchar_cp932, - mb_cp932_to_wchar, - mb_wchar_to_cp932, - NULL, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = { - mbfl_no_encoding_sjiswin, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_cp932_wchar, - mbfl_filt_conv_cp932_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_sjiswin = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_sjiswin, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_sjiswin, - mbfl_filt_conv_common_flush, - NULL, -}; - -const mbfl_encoding mbfl_encoding_sjiswin = { - mbfl_no_encoding_sjiswin, - "SJIS-win", - "Shift_JIS", - mbfl_encoding_sjiswin_aliases, - mblen_table_sjiswin, - MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_sjiswin_wchar, - &vtbl_wchar_sjiswin, - mb_cp932_to_wchar, - mb_wchar_to_sjiswin, - NULL, - NULL, -}; - -/* - * EUC variants - */ - -static int mbfl_filt_conv_eucjp_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, s, w = 0; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (c > 0xa0 && c < 0xff) { /* X 0208 first char */ - filter->status = 1; - filter->cache = c; - } else if (c == 0x8e) { /* kana first char */ - filter->status = 2; - } else if (c == 0x8f) { /* X 0212 first char */ - filter->status = 3; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* got first half */ - filter->status = 0; - c1 = filter->cache; - if (c > 0xa0 && c < 0xff) { - s = (c1 - 0xa1)*94 + c - 0xa1; - if (s >= 0 && s < jisx0208_ucs_table_size) { - w = jisx0208_ucs_table[s]; - if (!w) - w = MBFL_BAD_INPUT; - } else { - w = MBFL_BAD_INPUT; - } - - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 2: /* got 0x8e */ - filter->status = 0; - if (c > 0xa0 && c < 0xe0) { - w = 0xfec0 + c; - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 3: /* got 0x8f, JIS X 0212 first byte */ - filter->status++; - filter->cache = c; - break; - - case 4: /* got 0x8f, JIS X 0212 second byte */ - filter->status = 0; - c1 = filter->cache; - if (c > 0xA0 && c < 0xFF && c1 > 0xA0 && c1 < 0xFF) { - s = (c1 - 0xa1)*94 + c - 0xa1; - if (s >= 0 && s < jisx0212_ucs_table_size) { - w = jisx0212_ucs_table[s]; - if (!w) - w = MBFL_BAD_INPUT; - } else { - w = MBFL_BAD_INPUT; - } - - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_eucjp_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - (*filter->output_function)(MBFL_BAD_INPUT, filter->data); - filter->status = 0; - } + * 0x7E will go to 0x7E when converting Shift-JIS to CP932. + */ - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } +static const unsigned char mblen_table_sjiswin[] = { /* 0x81-0x9F,0xE0-0xFF */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +}; - return 0; -} +static const char *mbfl_encoding_cp932_aliases[] = {"MS932", "Windows-31J", "MS_Kanji", NULL}; +static const char *mbfl_encoding_sjiswin_aliases[] = {"SJIS-ms", "SJIS-open", NULL}; -static int mbfl_filt_conv_wchar_eucjp(int c, mbfl_convert_filter *filter) -{ - int s = 0; +const mbfl_encoding mbfl_encoding_cp932 = { + mbfl_no_encoding_cp932, + "CP932", + "Shift_JIS", + mbfl_encoding_cp932_aliases, + mblen_table_sjiswin, + MBFL_ENCTYPE_GL_UNSAFE, + NULL, + NULL, + mb_cp932_to_wchar, + mb_wchar_to_cp932, + NULL, + NULL, +}; - if (c == 0xAF) { /* U+00AF is MACRON */ - s = 0xA2B4; /* Use JIS X 0212 overline */ - } else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { - s = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; - } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { - s = ucs_a2_jis_table[c - ucs_a2_jis_table_min]; - } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) { - s = ucs_i_jis_table[c - ucs_i_jis_table_min]; - } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) { - s = ucs_r_jis_table[c - ucs_r_jis_table_min]; - } - if (s <= 0) { - if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ - s = 0x2140; - } else if (c == 0x2225) { /* PARALLEL TO */ - s = 0x2142; - } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */ - s = 0x215d; - } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */ - s = 0x2171; - } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */ - s = 0x2172; - } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */ - s = 0x224c; - } else if (c == 0) { - s = 0; - } else { - s = -1; - } - } - if (s >= 0) { - if (s < 0x80) { /* latin */ - CK((*filter->output_function)(s, filter->data)); - } else if (s < 0x100) { /* kana */ - CK((*filter->output_function)(0x8e, filter->data)); - CK((*filter->output_function)(s, filter->data)); - } else if (s < 0x8080) { /* X 0208 */ - CK((*filter->output_function)(((s >> 8) & 0xff) | 0x80, filter->data)); - CK((*filter->output_function)((s & 0xff) | 0x80, filter->data)); - } else { /* X 0212 */ - CK((*filter->output_function)(0x8f, filter->data)); - CK((*filter->output_function)(((s >> 8) & 0xff) | 0x80, filter->data)); - CK((*filter->output_function)((s & 0xff) | 0x80, filter->data)); - } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } +const mbfl_encoding mbfl_encoding_sjiswin = { + mbfl_no_encoding_sjiswin, + "SJIS-win", + "Shift_JIS", + mbfl_encoding_sjiswin_aliases, + mblen_table_sjiswin, + MBFL_ENCTYPE_GL_UNSAFE, + NULL, + NULL, + mb_cp932_to_wchar, + mb_wchar_to_sjiswin, + NULL, + NULL, +}; - return 0; -} +/* + * EUC variants + */ static size_t mb_eucjp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { @@ -8428,267 +6822,20 @@ static void mb_wchar_to_eucjp(uint32_t *in, size_t len, mb_convert_buf *buf, boo continue; } } - - if (s < 0x80) { - out = mb_convert_buf_add(out, s); - } else if (s < 0x100) { - out = mb_convert_buf_add2(out, 0x8E, s); - } else if (s < 0x8080) { - out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80); - } else { - MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3); - out = mb_convert_buf_add3(out, 0x8F, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80); - } - } - - MB_CONVERT_BUF_STORE(buf, out, limit); -} - -static int mbfl_filt_conv_eucjpwin_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, s, w, n; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (c >= 0xa1 && c <= 0xfe) { /* CP932 first char */ - filter->status = 1; - filter->cache = c; - } else if (c == 0x8e) { /* kana first char */ - filter->status = 2; - } else if (c == 0x8f) { /* X 0212 first char */ - filter->status = 3; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* got first half */ - filter->status = 0; - c1 = filter->cache; - if (c > 0xa0 && c < 0xff) { - w = 0; - s = (c1 - 0xa1)*94 + c - 0xa1; - if (s <= 137) { - if (s == 31) { - w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */ - } else if (s == 32) { - w = 0xff5e; /* FULLWIDTH TILDE */ - } else if (s == 33) { - w = 0x2225; /* PARALLEL TO */ - } else if (s == 60) { - w = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */ - } else if (s == 80) { - w = 0xffe0; /* FULLWIDTH CENT SIGN */ - } else if (s == 81) { - w = 0xffe1; /* FULLWIDTH POUND SIGN */ - } else if (s == 137) { - w = 0xffe2; /* FULLWIDTH NOT SIGN */ - } - } - - if (w == 0) { - if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */ - w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min]; - } else if (s >= 0 && s < jisx0208_ucs_table_size) { /* X 0208 */ - w = jisx0208_ucs_table[s]; - } else if (s >= (84 * 94)) { /* user (85ku - 94ku) */ - w = s - (84 * 94) + 0xe000; - } - } - - if (w <= 0) { - w = MBFL_BAD_INPUT; - } - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 2: /* got 0x8e, X0201 kana */ - filter->status = 0; - if (c > 0xa0 && c < 0xe0) { - w = 0xfec0 + c; - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 3: /* got 0x8f, X 0212 first char */ - filter->status++; - filter->cache = c; - break; - - case 4: /* got 0x8f, X 0212 second char */ - filter->status = 0; - c1 = filter->cache; - if (c1 > 0xa0 && c1 < 0xff && c > 0xa0 && c < 0xff) { - s = (c1 - 0xa1)*94 + c - 0xa1; - - if (s >= 0 && s < jisx0212_ucs_table_size) { - w = jisx0212_ucs_table[s]; - - if (w == 0x007e) { - w = 0xff5e; /* FULLWIDTH TILDE */ - } - } else if (s >= (82*94) && s < (84*94)) { /* vender ext3 (83ku - 84ku) <-> CP932 (115ku -120ku) */ - s = (c1 << 8) | c; - w = 0; - n = 0; - while (n < cp932ext3_eucjp_table_size) { - if (s == cp932ext3_eucjp_table[n]) { - if (n < (cp932ext3_ucs_table_max - cp932ext3_ucs_table_min)) { - w = cp932ext3_ucs_table[n]; - } - break; - } - n++; - } - } else if (s >= (84*94)) { /* user (85ku - 94ku) */ - w = s - (84*94) + (0xe000 + (94*10)); - } else { - w = 0; - } - - if (w == 0x00A6) { - w = 0xFFE4; /* FULLWIDTH BROKEN BAR */ - } - - if (w <= 0) { - w = MBFL_BAD_INPUT; - } - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_eucjpwin_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - (*filter->output_function)(MBFL_BAD_INPUT, filter->data); - filter->status = 0; - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_eucjpwin(int c, mbfl_convert_filter *filter) -{ - int c1, c2, s1 = 0; - - if (c == 0xAF) { /* U+00AF is MACRON */ - s1 = 0xA2B4; /* Use JIS X 0212 overline */ - } else if (c == 0x203E) { - s1 = 0x7E; - } else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { - s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; - } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { - s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min]; - } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) { - s1 = ucs_i_jis_table[c - ucs_i_jis_table_min]; - } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) { - s1 = ucs_r_jis_table[c - ucs_r_jis_table_min]; - } else if (c >= 0xe000 && c < (0xe000 + 10*94)) { /* user (X0208 85ku - 94ku) */ - s1 = c - 0xe000; - c1 = s1/94 + 0x75; - c2 = s1%94 + 0x21; - s1 = (c1 << 8) | c2; - } else if (c >= (0xe000 + 10*94) && c < (0xe000 + 20*94)) { /* user (X0212 85ku - 94ku) */ - s1 = c - (0xe000 + 10*94); - c1 = s1/94 + 0xf5; - c2 = s1%94 + 0xa1; - s1 = (c1 << 8) | c2; - } - - if (s1 == 0xa2f1) { - s1 = 0x2d62; /* NUMERO SIGN */ - } - - if (s1 <= 0) { - if (c == 0xa5) { /* YEN SIGN */ - s1 = 0x5C; - } else if (c == 0x2014) { - s1 = 0x213D; - } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ - s1 = 0x2140; - } else if (c == 0x2225) { /* PARALLEL TO */ - s1 = 0x2142; - } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */ - s1 = 0x215d; - } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */ - s1 = 0x2171; - } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */ - s1 = 0x2172; - } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */ - s1 = 0x224c; - } else { - s1 = -1; - c1 = 0; - c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; - while (c1 < c2) { /* CP932 vendor ext1 (13ku) */ - const int oh = cp932ext1_ucs_table_min / 94; - - if (c == cp932ext1_ucs_table[c1]) { - s1 = ((c1 / 94 + oh + 0x21) << 8) + (c1 % 94 + 0x21); - break; - } - c1++; - } - if (s1 < 0) { - c1 = 0; - c2 = cp932ext3_ucs_table_max - cp932ext3_ucs_table_min; - while (c1 < c2) { /* CP932 vendor ext3 (115ku - 119ku) */ - if (c == cp932ext3_ucs_table[c1]) { - if (c1 < cp932ext3_eucjp_table_size) { - s1 = cp932ext3_eucjp_table[c1]; - } - break; - } - c1++; - } - } - } - - if (c == 0) { - s1 = 0; - } else if (s1 <= 0) { - s1 = -1; - } - } - - if (s1 >= 0) { - if (s1 < 0x80) { /* latin */ - CK((*filter->output_function)(s1, filter->data)); - } else if (s1 < 0x100) { /* kana */ - CK((*filter->output_function)(0x8e, filter->data)); - CK((*filter->output_function)(s1, filter->data)); - } else if (s1 < 0x8080) { /* X 0208 */ - CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data)); - CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data)); - } else { /* X 0212 */ - CK((*filter->output_function)(0x8f, filter->data)); - CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data)); - CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data)); + + if (s < 0x80) { + out = mb_convert_buf_add(out, s); + } else if (s < 0x100) { + out = mb_convert_buf_add2(out, 0x8E, s); + } else if (s < 0x8080) { + out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80); + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 3); + out = mb_convert_buf_add3(out, 0x8F, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80); } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); } - return 0; + MB_CONVERT_BUF_STORE(buf, out, limit); } static size_t mb_eucjpwin_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) @@ -8884,175 +7031,6 @@ static void mb_wchar_to_eucjpwin(uint32_t *in, size_t len, mb_convert_buf *buf, MB_CONVERT_BUF_STORE(buf, out, limit); } -static int mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, s, w; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (c >= 0xA1 && c <= 0xFE) { /* CP932, first byte */ - filter->status = 1; - filter->cache = c; - } else if (c == 0x8e) { /* kana first char */ - filter->status = 2; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* got first half */ - filter->status = 0; - c1 = filter->cache; - if (c > 0xa0 && c < 0xff) { - w = 0; - s = (c1 - 0xa1)*94 + c - 0xa1; - if (s <= 137) { - if (s == 31) { - w = 0xff3c; /* FULLWIDTH REVERSE SOLIDUS */ - } else if (s == 32) { - w = 0xff5e; /* FULLWIDTH TILDE */ - } else if (s == 33) { - w = 0x2225; /* PARALLEL TO */ - } else if (s == 60) { - w = 0xff0d; /* FULLWIDTH HYPHEN-MINUS */ - } else if (s == 80) { - w = 0xffe0; /* FULLWIDTH CENT SIGN */ - } else if (s == 81) { - w = 0xffe1; /* FULLWIDTH POUND SIGN */ - } else if (s == 137) { - w = 0xffe2; /* FULLWIDTH NOT SIGN */ - } - } - if (w == 0) { - if (s >= cp932ext1_ucs_table_min && s < cp932ext1_ucs_table_max) { /* vendor ext1 (13ku) */ - w = cp932ext1_ucs_table[s - cp932ext1_ucs_table_min]; - } else if (s >= 0 && s < jisx0208_ucs_table_size) { /* X 0208 */ - w = jisx0208_ucs_table[s]; - } else if (s >= cp932ext2_ucs_table_min && s < cp932ext2_ucs_table_max) { /* vendor ext2 (89ku - 92ku) */ - w = cp932ext2_ucs_table[s - cp932ext2_ucs_table_min]; - } - } - if (w <= 0) { - w = MBFL_BAD_INPUT; - } - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 2: /* got 0x8e, X0201 kana */ - filter->status = 0; - if (c > 0xa0 && c < 0xe0) { - w = 0xfec0 + c; - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_cp51932_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - /* Input string was truncated */ - (*filter->output_function)(MBFL_BAD_INPUT, filter->data); - filter->status = 0; - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter) -{ - int c1, c2, s1; - - s1 = 0; - if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { - s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; - } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { - s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min]; - } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) { - s1 = ucs_i_jis_table[c - ucs_i_jis_table_min]; - } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) { - s1 = ucs_r_jis_table[c - ucs_r_jis_table_min]; - } - if (s1 >= 0x8080) s1 = -1; /* we don't support JIS X0213 */ - if (s1 <= 0) { - if (c == 0xa5) { /* YEN SIGN */ - s1 = 0x216F; /* FULLWIDTH YEN SIGN */ - } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ - s1 = 0x2140; - } else if (c == 0x2225) { /* PARALLEL TO */ - s1 = 0x2142; - } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */ - s1 = 0x215d; - } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */ - s1 = 0x2171; - } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */ - s1 = 0x2172; - } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */ - s1 = 0x224c; - } else { - s1 = -1; - c1 = 0; - c2 = cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; - while (c1 < c2) { /* CP932 vendor ext1 (13ku) */ - if (c == cp932ext1_ucs_table[c1]) { - s1 = ((c1/94 + 0x2d) << 8) + (c1%94 + 0x21); - break; - } - c1++; - } - if (s1 < 0) { - c1 = 0; - c2 = cp932ext2_ucs_table_max - cp932ext2_ucs_table_min; - while (c1 < c2) { /* CP932 vendor ext3 (115ku - 119ku) */ - if (c == cp932ext2_ucs_table[c1]) { - s1 = ((c1/94 + 0x79) << 8) +(c1%94 + 0x21); - break; - } - c1++; - } - } - } - if (c == 0) { - s1 = 0; - } else if (s1 <= 0) { - s1 = -1; - } - } - - if (s1 >= 0) { - if (s1 < 0x80) { /* latin */ - CK((*filter->output_function)(s1, filter->data)); - } else if (s1 < 0x100) { /* kana */ - CK((*filter->output_function)(0x8e, filter->data)); - CK((*filter->output_function)(s1, filter->data)); - } else if (s1 < 0x8080) { /* X 0208 */ - CK((*filter->output_function)(((s1 >> 8) & 0xff) | 0x80, filter->data)); - CK((*filter->output_function)((s1 & 0xff) | 0x80, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - static size_t mb_cp51932_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { unsigned char *p = *in, *e = p + *in_len; @@ -9357,188 +7335,60 @@ process_codepoint: ; } } } - - /* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */ - if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) { - int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len); - if (k >= 0) { - s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k]; - } - } - - /* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */ - if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) { - int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len); - if (k >= 0) { - s = jisx0213_u5_jis_tbl[k]; - } - } - - if (!s) { - /* CJK Compatibility Forms: U+FE30-U+FE4F */ - if (w == 0xFE45) { - s = 0x233E; - } else if (w == 0xFE46) { - s = 0x233D; - } else if (w >= 0xF91D && w <= 0xF9DC) { - /* CJK Compatibility Ideographs: U+F900-U+F92A */ - int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len); - if (k >= 0) { - s = ucs_r2b_jisx0213_cmap_val[k]; - } - } - } - - if (!s && w) { - MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp2004); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len); - } else if (s <= 0x7F) { - out = mb_convert_buf_add(out, s); - } else if (s <= 0xFF) { - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); - out = mb_convert_buf_add2(out, 0x8E, s); - } else if (s <= 0x7EFF) { - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); - out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) + 0x80, (s & 0xFF) + 0x80); - } else { - unsigned int s2 = s & 0xFF; - int k = ((s >> 8) & 0xFF) - 0x7F; - ZEND_ASSERT(k < jisx0213_p2_ofst_len); - s = jisx0213_p2_ofst[k] + 0x21; - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3); - out = mb_convert_buf_add3(out, 0x8F, s | 0x80, s2 | 0x80); - } - } - - MB_CONVERT_BUF_STORE(buf, out, limit); -} - -static int mbfl_filt_conv_euccn_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, w; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if ((c >= 0xA1 && c <= 0xA9) || (c >= 0xB0 && c <= 0xF7)) { /* dbcs lead byte */ - filter->status = 1; - filter->cache = c; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* dbcs second byte */ - filter->status = 0; - c1 = filter->cache; - if (c > 0xA0 && c < 0xFF) { - w = (c1 - 0x81)*192 + c - 0x40; - ZEND_ASSERT(w < cp936_ucs_table_size); - if (w == 0x1864) { - w = 0x30FB; - } else if (w == 0x186A) { - w = 0x2015; - } else if ((w >= 0x1921 && w <= 0x192A) || w == 0x1963 || (w >= 0x1C59 && w <= 0x1C7E) || (w >= 0x1DBB && w <= 0x1DC4)) { - w = 0; - } else { - w = cp936_ucs_table[w]; - } - - if (w <= 0) { - w = MBFL_BAD_INPUT; - } - - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_euccn(int c, mbfl_convert_filter *filter) -{ - int s = 0; - - if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) { - if (c == 0xB7 || c == 0x144 || c == 0x148 || c == 0x251 || c == 0x261) { - s = 0; - } else { - s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min]; - } - } else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) { - if (c == 0x2015) { - s = 0xA1AA; - } else if (c == 0x2014 || (c >= 0x2170 && c <= 0x2179)) { - s = 0; - } else { - s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min]; - } - } else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) { - if (c == 0x30FB) { - s = 0xA1A4; - } else { - s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min]; - } - } else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) { - s = ucs_i_cp936_table[c - ucs_i_cp936_table_min]; - } else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) { - if (c == 0xFF04) { - s = 0xA1E7; - } else if (c == 0xFF5E) { - s = 0xA1AB; - } else if (c >= 0xFF01 && c <= 0xFF5D) { - s = c - 0xFF01 + 0xA3A1; - } else if (c >= 0xFFE0 && c <= 0xFFE5) { - s = ucs_hff_s_cp936_table[c - 0xFFE0]; + + /* Check for Japanese chars in compressed mapping area: U+1E00-U+4DBF */ + if (!s && w >= ucs_c1_jisx0213_min && w <= ucs_c1_jisx0213_max) { + int k = mbfl_bisec_srch(w, ucs_c1_jisx0213_tbl, ucs_c1_jisx0213_tbl_len); + if (k >= 0) { + s = ucs_c1_jisx0213_ofst[k] + w - ucs_c1_jisx0213_tbl[2*k]; + } } - } - /* exclude CP936 extensions */ - if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) { - s = 0; - } + /* Check for Japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */ + if (!s && w >= jisx0213_u5_tbl_min && w <= jisx0213_u5_tbl_max) { + int k = mbfl_bisec_srch2(w - 0x20000, jisx0213_u5_jis_key, jisx0213_u5_tbl_len); + if (k >= 0) { + s = jisx0213_u5_jis_tbl[k]; + } + } - if (s <= 0) { - if (c < 0x80) { - s = c; - } else if (s <= 0) { - s = -1; + if (!s) { + /* CJK Compatibility Forms: U+FE30-U+FE4F */ + if (w == 0xFE45) { + s = 0x233E; + } else if (w == 0xFE46) { + s = 0x233D; + } else if (w >= 0xF91D && w <= 0xF9DC) { + /* CJK Compatibility Ideographs: U+F900-U+F92A */ + int k = mbfl_bisec_srch2(w, ucs_r2b_jisx0213_cmap_key, ucs_r2b_jisx0213_cmap_len); + if (k >= 0) { + s = ucs_r2b_jisx0213_cmap_val[k]; + } + } } - } - if (s >= 0) { - if (s < 0x80) { /* latin */ - CK((*filter->output_function)(s, filter->data)); + if (!s && w) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_eucjp2004); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s <= 0x7F) { + out = mb_convert_buf_add(out, s); + } else if (s <= 0xFF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, 0x8E, s); + } else if (s <= 0x7EFF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) + 0x80, (s & 0xFF) + 0x80); } else { - CK((*filter->output_function)((s >> 8) & 0xFF, filter->data)); - CK((*filter->output_function)(s & 0xFF, filter->data)); + unsigned int s2 = s & 0xFF; + int k = ((s >> 8) & 0xFF) - 0x7F; + ZEND_ASSERT(k < jisx0213_p2_ofst_len); + s = jisx0213_p2_ofst[k] + 0x21; + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 3); + out = mb_convert_buf_add3(out, 0x8F, s | 0x80, s2 | 0x80); } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - -static int mbfl_filt_conv_euccn_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status == 1) { - /* 2-byte character was truncated */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); } - return 0; + MB_CONVERT_BUF_STORE(buf, out, limit); } static size_t mb_euccn_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) @@ -9645,169 +7495,6 @@ static void mb_wchar_to_euccn(uint32_t *in, size_t len, mb_convert_buf *buf, boo MB_CONVERT_BUF_STORE(buf, out, limit); } -static int mbfl_filt_conv_euctw_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, s, w; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3) { /* 2-byte character, first byte */ - filter->status = 1; - filter->cache = c; - } else if (c == 0x8E) { /* 4-byte character, first byte */ - filter->status = 2; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* 2-byte character, second byte */ - filter->status = 0; - c1 = filter->cache; - if (c > 0xA0 && c < 0xFF) { - w = (c1 - 0xA1)*94 + (c - 0xA1); - if (w >= 0 && w < cns11643_1_ucs_table_size) { - w = cns11643_1_ucs_table[w]; - } else { - w = 0; - } - - if (w <= 0) { - w = MBFL_BAD_INPUT; - } - - CK((*filter->output_function)(w, filter->data)); - } else { - filter->status = filter->cache = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 2: /* got 0x8e, second byte */ - if (c == 0xA1 || c == 0xA2 || c == 0xAE) { - filter->status = 3; - filter->cache = c - 0xA1; - } else { - filter->status = filter->cache = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 3: /* got 0x8e, third byte */ - filter->status = 0; - c1 = filter->cache; - if (c >= 0xA1 && ((c1 == 0 && ((c >= 0xA1 && c <= 0xA6) || (c >= 0xC2 && c <= 0xFD)) && c != 0xC3) || - (c1 == 1 && c <= 0xF2) || (c1 == 13 && c <= 0xE7))) { - filter->status = 4; - filter->cache = (c1 << 8) + c - 0xA1; - } else { - filter->status = filter->cache = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 4: /* multi-byte character, fourth byte */ - filter->status = 0; - c1 = filter->cache; - if (c1 <= 0xDFF && c > 0xA0 && c < 0xFF) { - int plane = (c1 & 0xF00) >> 8; /* This is actually the CNS-11643 plane minus one */ - s = (c1 & 0xFF)*94 + c - 0xA1; - w = 0; - if (s >= 0) { - /* A later version of CNS-11643 moved all the characters in "plane 14" to "plane 3", - * and added tens of thousands more characters in planes 4, 5, 6, and 7 - * We only support the older version of CNS-11643 - * This is the same as iconv from glibc 2.2 */ - if (plane == 0 && s < cns11643_1_ucs_table_size) { - w = cns11643_1_ucs_table[s]; - } else if (plane == 1 && s < cns11643_2_ucs_table_size) { - w = cns11643_2_ucs_table[s]; - } else if (plane == 13 && s < cns11643_14_ucs_table_size) { - w = cns11643_14_ucs_table[s]; - } - } - - if (w <= 0) { - w = MBFL_BAD_INPUT; - } - - CK((*filter->output_function)(w, filter->data)); - } else { - filter->status = filter->cache = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_euctw(int c, mbfl_convert_filter *filter) -{ - int s = 0; - - if (c >= ucs_a1_cns11643_table_min && c < ucs_a1_cns11643_table_max) { - s = ucs_a1_cns11643_table[c - ucs_a1_cns11643_table_min]; - } else if (c >= ucs_a2_cns11643_table_min && c < ucs_a2_cns11643_table_max) { - s = ucs_a2_cns11643_table[c - ucs_a2_cns11643_table_min]; - } else if (c >= ucs_a3_cns11643_table_min && c < ucs_a3_cns11643_table_max) { - s = ucs_a3_cns11643_table[c - ucs_a3_cns11643_table_min]; - } else if (c >= ucs_i_cns11643_table_min && c < ucs_i_cns11643_table_max) { - s = ucs_i_cns11643_table[c - ucs_i_cns11643_table_min]; - } else if (c >= ucs_r_cns11643_table_min && c < ucs_r_cns11643_table_max) { - s = ucs_r_cns11643_table[c - ucs_r_cns11643_table_min]; - } - - if (s <= 0) { - if (c == 0) { - s = 0; - } else if (s <= 0) { - s = -1; - } - } - - if (s >= 0) { - int plane = (s & 0x1F0000) >> 16; - if (plane <= 1) { - if (s < 0x80) { /* latin */ - CK((*filter->output_function)(s, filter->data)); - } else { - s = (s & 0xFFFF) | 0x8080; - CK((*filter->output_function)((s >> 8) & 0xFF, filter->data)); - CK((*filter->output_function)(s & 0xFF, filter->data)); - } - } else { - s = (0x8EA00000 + (plane << 16)) | ((s & 0xFFFF) | 0x8080); - CK((*filter->output_function)(0x8e , filter->data)); - CK((*filter->output_function)((s >> 16) & 0xFF, filter->data)); - CK((*filter->output_function)((s >> 8) & 0xFF, filter->data)); - CK((*filter->output_function)(s & 0xFF, filter->data)); - } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - return 0; -} - -static int mbfl_filt_conv_euctw_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - /* 2-byte or 4-byte character was truncated */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - static size_t mb_euctw_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { unsigned char *p = *in, *e = p + *in_len; @@ -9870,172 +7557,59 @@ static size_t mb_euctw_to_wchar(unsigned char **in, size_t *in_len, uint32_t *bu *out++ = MBFL_BAD_INPUT; } else { *out++ = MBFL_BAD_INPUT; - } - } - - *in_len = e - p; - *in = p; - return out - buf; -} - -static void mb_wchar_to_euctw(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) -{ - unsigned char *out, *limit; - MB_CONVERT_BUF_LOAD(buf, out, limit); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); - - while (len--) { - uint32_t w = *in++; - unsigned int s = 0; - - if (w >= ucs_a1_cns11643_table_min && w < ucs_a1_cns11643_table_max) { - s = ucs_a1_cns11643_table[w - ucs_a1_cns11643_table_min]; - } else if (w >= ucs_a2_cns11643_table_min && w < ucs_a2_cns11643_table_max) { - s = ucs_a2_cns11643_table[w - ucs_a2_cns11643_table_min]; - } else if (w >= ucs_a3_cns11643_table_min && w < ucs_a3_cns11643_table_max) { - s = ucs_a3_cns11643_table[w - ucs_a3_cns11643_table_min]; - } else if (w >= ucs_i_cns11643_table_min && w < ucs_i_cns11643_table_max) { - s = ucs_i_cns11643_table[w - ucs_i_cns11643_table_min]; - } else if (w >= ucs_r_cns11643_table_min && w < ucs_r_cns11643_table_max) { - s = ucs_r_cns11643_table[w - ucs_r_cns11643_table_min]; - } - - if (!s) { - if (w == 0) { - out = mb_convert_buf_add(out, 0); - } else { - MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_euctw); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); - } - } else { - unsigned int plane = s >> 16; - if (plane <= 1) { - if (s < 0x80) { - out = mb_convert_buf_add(out, s); - } else { - out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80); - } - } else { - MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4); - out = mb_convert_buf_add4(out, 0x8E, 0xA0 + plane, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80); - } - } - } - - MB_CONVERT_BUF_STORE(buf, out, limit); -} - -static int mbfl_filt_conv_euckr_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, w, flag; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (((c >= 0xA1 && c <= 0xAC) || (c >= 0xB0 && c <= 0xFD)) && c != 0xC9) { /* dbcs lead byte */ - filter->status = 1; - filter->cache = c; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* dbcs second byte */ - filter->status = 0; - c1 = filter->cache; - flag = 0; - if (c1 >= 0xa1 && c1 <= 0xc6) { - flag = 1; - } else if (c1 >= 0xc7 && c1 <= 0xfe && c1 != 0xc9) { - flag = 2; - } - if (flag > 0 && c >= 0xa1 && c <= 0xfe) { - if (flag == 1) { /* 1st: 0xa1..0xc6, 2nd: 0x41..0x7a, 0x81..0xfe */ - w = (c1 - 0x81)*190 + c - 0x41; - ZEND_ASSERT(w < uhc1_ucs_table_size); - w = uhc1_ucs_table[w]; - } else { /* 1st: 0xc7..0xc8,0xca..0xfe, 2nd: 0xa1..0xfe */ - w = (c1 - 0xc7)*94 + c - 0xa1; - ZEND_ASSERT(w < uhc3_ucs_table_size); - w = uhc3_ucs_table[w]; - } - - if (w <= 0) { - w = MBFL_BAD_INPUT; - } - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); + } } - return 0; + *in_len = e - p; + *in = p; + return out - buf; } -static int mbfl_filt_conv_wchar_euckr(int c, mbfl_convert_filter *filter) +static void mb_wchar_to_euctw(uint32_t *in, size_t len, mb_convert_buf *buf, bool end) { - int s = 0; - - if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) { - s = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min]; - } else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) { - s = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min]; - } else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) { - s = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min]; - } else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) { - s = ucs_i_uhc_table[c - ucs_i_uhc_table_min]; - } else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) { - s = ucs_s_uhc_table[c - ucs_s_uhc_table_min]; - } else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) { - s = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min]; - } else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) { - s = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min]; - } + unsigned char *out, *limit; + MB_CONVERT_BUF_LOAD(buf, out, limit); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); - /* exclude UHC extension area (although we are using the UHC conversion tables) */ - if (((s >> 8) & 0xFF) < 0xA1 || (s & 0xFF) < 0xA1) { - s = 0; - } + while (len--) { + uint32_t w = *in++; + unsigned int s = 0; - if (s <= 0) { - if (c < 0x80) { - s = c; - } else { - s = -1; + if (w >= ucs_a1_cns11643_table_min && w < ucs_a1_cns11643_table_max) { + s = ucs_a1_cns11643_table[w - ucs_a1_cns11643_table_min]; + } else if (w >= ucs_a2_cns11643_table_min && w < ucs_a2_cns11643_table_max) { + s = ucs_a2_cns11643_table[w - ucs_a2_cns11643_table_min]; + } else if (w >= ucs_a3_cns11643_table_min && w < ucs_a3_cns11643_table_max) { + s = ucs_a3_cns11643_table[w - ucs_a3_cns11643_table_min]; + } else if (w >= ucs_i_cns11643_table_min && w < ucs_i_cns11643_table_max) { + s = ucs_i_cns11643_table[w - ucs_i_cns11643_table_min]; + } else if (w >= ucs_r_cns11643_table_min && w < ucs_r_cns11643_table_max) { + s = ucs_r_cns11643_table[w - ucs_r_cns11643_table_min]; } - } - if (s >= 0) { - if (s < 0x80) { /* latin */ - CK((*filter->output_function)(s, filter->data)); + if (!s) { + if (w == 0) { + out = mb_convert_buf_add(out, 0); + } else { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_euctw); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len * 2); + } } else { - CK((*filter->output_function)((s >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(s & 0xff, filter->data)); + unsigned int plane = s >> 16; + if (plane <= 1) { + if (s < 0x80) { + out = mb_convert_buf_add(out, s); + } else { + out = mb_convert_buf_add2(out, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80); + } + } else { + MB_CONVERT_BUF_ENSURE(buf, out, limit, (len * 2) + 4); + out = mb_convert_buf_add4(out, 0x8E, 0xA0 + plane, ((s >> 8) & 0xFF) | 0x80, (s & 0xFF) | 0x80); + } } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - -static int mbfl_filt_conv_euckr_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status == 1) { - /* 2-byte character was truncated */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); } - return 0; + MB_CONVERT_BUF_STORE(buf, out, limit); } static size_t mb_euckr_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) @@ -10129,101 +7703,6 @@ static void mb_wchar_to_euckr(uint32_t *in, size_t len, mb_convert_buf *buf, boo MB_CONVERT_BUF_STORE(buf, out, limit); } -static int mbfl_filt_conv_uhc_wchar(int c, mbfl_convert_filter *filter) -{ - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (c > 0x80 && c < 0xfe && c != 0xc9) { /* dbcs lead byte */ - filter->status = 1; - filter->cache = c; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* dbcs second byte */ - filter->status = 0; - int c1 = filter->cache, w = 0; - - if (c1 >= 0x81 && c1 <= 0xc6 && c >= 0x41 && c <= 0xfe) { - w = (c1 - 0x81)*190 + (c - 0x41); - if (w >= 0 && w < uhc1_ucs_table_size) { - w = uhc1_ucs_table[w]; - } - } else if (c1 >= 0xc7 && c1 < 0xfe && c >= 0xa1 && c <= 0xfe) { - w = (c1 - 0xc7)*94 + (c - 0xa1); - if (w >= 0 && w < uhc3_ucs_table_size) { - w = uhc3_ucs_table[w]; - } - } - - if (w == 0) { - w = MBFL_BAD_INPUT; - } - CK((*filter->output_function)(w, filter->data)); - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_uhc_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status == 1) { - /* 2-byte character was truncated */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_uhc(int c, mbfl_convert_filter *filter) -{ - int s = 0; - - if (c >= ucs_a1_uhc_table_min && c < ucs_a1_uhc_table_max) { - s = ucs_a1_uhc_table[c - ucs_a1_uhc_table_min]; - } else if (c >= ucs_a2_uhc_table_min && c < ucs_a2_uhc_table_max) { - s = ucs_a2_uhc_table[c - ucs_a2_uhc_table_min]; - } else if (c >= ucs_a3_uhc_table_min && c < ucs_a3_uhc_table_max) { - s = ucs_a3_uhc_table[c - ucs_a3_uhc_table_min]; - } else if (c >= ucs_i_uhc_table_min && c < ucs_i_uhc_table_max) { - s = ucs_i_uhc_table[c - ucs_i_uhc_table_min]; - } else if (c >= ucs_s_uhc_table_min && c < ucs_s_uhc_table_max) { - s = ucs_s_uhc_table[c - ucs_s_uhc_table_min]; - } else if (c >= ucs_r1_uhc_table_min && c < ucs_r1_uhc_table_max) { - s = ucs_r1_uhc_table[c - ucs_r1_uhc_table_min]; - } else if (c >= ucs_r2_uhc_table_min && c < ucs_r2_uhc_table_max) { - s = ucs_r2_uhc_table[c - ucs_r2_uhc_table_min]; - } - - if (s == 0 && c != 0) { - s = -1; - } - - if (s >= 0) { - if (s < 0x80) { /* latin */ - CK((*filter->output_function)(s, filter->data)); - } else { - CK((*filter->output_function)((s >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(s & 0xff, filter->data)); - } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - static size_t mb_uhc_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { unsigned char *p = *in, *e = p + *in_len; @@ -10345,26 +7824,6 @@ static const unsigned char mblen_table_eucjp[] = { /* 0xA1-0xFE */ static const char *mbfl_encoding_euc_jp_aliases[] = {"EUC", "EUC_JP", "eucJP", "x-euc-jp", NULL}; -static const struct mbfl_convert_vtbl vtbl_eucjp_wchar = { - mbfl_no_encoding_euc_jp, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_eucjp_wchar, - mbfl_filt_conv_eucjp_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_eucjp = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_euc_jp, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_eucjp, - mbfl_filt_conv_common_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_euc_jp = { mbfl_no_encoding_euc_jp, "EUC-JP", @@ -10372,8 +7831,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = { mbfl_encoding_euc_jp_aliases, mblen_table_eucjp, 0, - &vtbl_eucjp_wchar, - &vtbl_wchar_eucjp, + NULL, + NULL, mb_eucjp_to_wchar, mb_wchar_to_eucjp, NULL, @@ -10382,26 +7841,6 @@ const mbfl_encoding mbfl_encoding_euc_jp = { static const char *mbfl_encoding_eucjp2004_aliases[] = {"EUC_JP-2004", NULL}; -static const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = { - mbfl_no_encoding_eucjp2004, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_jis2004_wchar, - mbfl_filt_conv_jis2004_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_eucjp2004 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_eucjp2004, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_jis2004, - mbfl_filt_conv_wchar_jis2004_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_eucjp2004 = { mbfl_no_encoding_eucjp2004, "EUC-JP-2004", @@ -10409,8 +7848,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = { mbfl_encoding_eucjp2004_aliases, mblen_table_eucjp, 0, - &vtbl_eucjp2004_wchar, - &vtbl_wchar_eucjp2004, + NULL, + NULL, mb_eucjp2004_to_wchar, mb_wchar_to_eucjp2004, NULL, @@ -10419,26 +7858,6 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = { static const char *mbfl_encoding_eucjp_win_aliases[] = {"eucJP-open", "eucJP-ms", NULL}; -static const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = { - mbfl_no_encoding_eucjp_win, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_eucjpwin_wchar, - mbfl_filt_conv_eucjpwin_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_eucjpwin = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_eucjp_win, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_eucjpwin, - mbfl_filt_conv_common_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_eucjp_win = { mbfl_no_encoding_eucjp_win, "eucJP-win", @@ -10446,8 +7865,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = { mbfl_encoding_eucjp_win_aliases, mblen_table_eucjp, 0, - &vtbl_eucjpwin_wchar, - &vtbl_wchar_eucjpwin, + NULL, + NULL, mb_eucjpwin_to_wchar, mb_wchar_to_eucjpwin, NULL, @@ -10456,26 +7875,6 @@ const mbfl_encoding mbfl_encoding_eucjp_win = { static const char *mbfl_encoding_cp51932_aliases[] = {"cp51932", NULL}; -static const struct mbfl_convert_vtbl vtbl_cp51932_wchar = { - mbfl_no_encoding_cp51932, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_cp51932_wchar, - mbfl_filt_conv_cp51932_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_cp51932 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_cp51932, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_cp51932, - mbfl_filt_conv_common_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_cp51932 = { mbfl_no_encoding_cp51932, "CP51932", @@ -10483,8 +7882,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = { mbfl_encoding_cp51932_aliases, mblen_table_eucjp, 0, - &vtbl_cp51932_wchar, - &vtbl_wchar_cp51932, + NULL, + NULL, mb_cp51932_to_wchar, mb_wchar_to_cp51932, NULL, @@ -10512,26 +7911,6 @@ static const unsigned char mblen_table_euccn[] = { /* 0xA1-0xFE */ static const char *mbfl_encoding_euc_cn_aliases[] = {"CN-GB", "EUC_CN", "eucCN", "x-euc-cn", "gb2312", NULL}; -static const struct mbfl_convert_vtbl vtbl_euccn_wchar = { - mbfl_no_encoding_euc_cn, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_euccn_wchar, - mbfl_filt_conv_euccn_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_euccn = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_euc_cn, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_euccn, - mbfl_filt_conv_common_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_euc_cn = { mbfl_no_encoding_euc_cn, "EUC-CN", @@ -10539,35 +7918,15 @@ const mbfl_encoding mbfl_encoding_euc_cn = { mbfl_encoding_euc_cn_aliases, mblen_table_euccn, 0, - &vtbl_euccn_wchar, - &vtbl_wchar_euccn, - mb_euccn_to_wchar, - mb_wchar_to_euccn, NULL, NULL, -}; - -static const char *mbfl_encoding_euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL}; - -static const struct mbfl_convert_vtbl vtbl_euctw_wchar = { - mbfl_no_encoding_euc_tw, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, + mb_euccn_to_wchar, + mb_wchar_to_euccn, NULL, - mbfl_filt_conv_euctw_wchar, - mbfl_filt_conv_euctw_wchar_flush, NULL, }; -static const struct mbfl_convert_vtbl vtbl_wchar_euctw = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_euc_tw, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_euctw, - mbfl_filt_conv_common_flush, - NULL, -}; +static const char *mbfl_encoding_euc_tw_aliases[] = {"EUC_TW", "eucTW", "x-euc-tw", NULL}; const mbfl_encoding mbfl_encoding_euc_tw = { mbfl_no_encoding_euc_tw, @@ -10576,8 +7935,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = { mbfl_encoding_euc_tw_aliases, mblen_table_euccn, 0, - &vtbl_euctw_wchar, - &vtbl_wchar_euctw, + NULL, + NULL, mb_euctw_to_wchar, mb_wchar_to_euctw, NULL, @@ -10586,26 +7945,6 @@ const mbfl_encoding mbfl_encoding_euc_tw = { static const char *mbfl_encoding_euc_kr_aliases[] = {"EUC_KR", "eucKR", "x-euc-kr", NULL}; -static const struct mbfl_convert_vtbl vtbl_euckr_wchar = { - mbfl_no_encoding_euc_kr, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_euckr_wchar, - mbfl_filt_conv_euckr_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_euckr = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_euc_kr, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_euckr, - mbfl_filt_conv_common_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_euc_kr = { mbfl_no_encoding_euc_kr, "EUC-KR", @@ -10613,8 +7952,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = { mbfl_encoding_euc_kr_aliases, mblen_table_euccn, 0, - &vtbl_euckr_wchar, - &vtbl_wchar_euckr, + NULL, + NULL, mb_euckr_to_wchar, mb_wchar_to_euckr, NULL, @@ -10646,26 +7985,6 @@ static const unsigned char mblen_table_81_to_fe[] = { /* 0x81-0xFE */ static const char *mbfl_encoding_uhc_aliases[] = {"CP949", NULL}; -static const struct mbfl_convert_vtbl vtbl_uhc_wchar = { - mbfl_no_encoding_uhc, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_uhc_wchar, - mbfl_filt_conv_uhc_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_uhc = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_uhc, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_uhc, - mbfl_filt_conv_common_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_uhc = { mbfl_no_encoding_uhc, "UHC", @@ -10673,8 +7992,8 @@ const mbfl_encoding mbfl_encoding_uhc = { mbfl_encoding_uhc_aliases, mblen_table_81_to_fe, 0, - &vtbl_uhc_wchar, - &vtbl_wchar_uhc, + NULL, + NULL, mb_uhc_to_wchar, mb_wchar_to_uhc, NULL, @@ -10685,284 +8004,6 @@ const mbfl_encoding mbfl_encoding_uhc = { * GB18030/CP936 */ -static int mbfl_filt_conv_gb18030_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, c2, c3, w = -1; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (c > 0x80 && c < 0xff) { /* dbcs/qbcs lead byte */ - filter->status = 1; - filter->cache = c; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* dbcs/qbcs second byte */ - c1 = filter->cache; - filter->status = 0; - - if (c1 >= 0x81 && c1 <= 0x84 && c >= 0x30 && c <= 0x39) { - /* 4 byte range: Unicode BMP */ - filter->status = 2; - filter->cache = (c1 << 8) | c; - return 0; - } else if (c1 >= 0x90 && c1 <= 0xe3 && c >= 0x30 && c <= 0x39) { - /* 4 byte range: Unicode 16 planes */ - filter->status = 2; - filter->cache = (c1 << 8) | c; - return 0; - } else if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) && (c >= 0xa1 && c <= 0xfe)) { - /* UDA part 1,2: U+E000-U+E4C5 */ - w = 94*(c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa) + (c - 0xa1) + 0xe000; - CK((*filter->output_function)(w, filter->data)); - } else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) { - /* UDA part3 : U+E4C6-U+E765*/ - w = 96*(c1 - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6; - CK((*filter->output_function)(w, filter->data)); - } - - c2 = (c1 << 8) | c; - - if (w <= 0 && ( - (c2 >= 0xa2ab && c2 <= 0xa9f0 + (0xe80f-0xe801)) || - (c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814-0xe810)) || - (c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864-0xe844)) - )) { - for (size_t offset = 0; offset < mbfl_gb18030_pua_tbl_max; offset++) { - if (c2 >= mbfl_gb18030_pua_tbl[offset][2] && c2 <= mbfl_gb18030_pua_tbl[offset][2] + mbfl_gb18030_pua_tbl[offset][1] - mbfl_gb18030_pua_tbl[offset][0]) { - w = c2 - mbfl_gb18030_pua_tbl[offset][2] + mbfl_gb18030_pua_tbl[offset][0]; - CK((*filter->output_function)(w, filter->data)); - break; - } - } - } - - if (w <= 0) { - if ((c1 >= 0xa1 && c1 <= 0xa9 && c >= 0xa1 && c <= 0xfe) || - (c1 >= 0xb0 && c1 <= 0xf7 && c >= 0xa1 && c <= 0xfe) || - (c1 >= 0x81 && c1 <= 0xa0 && c >= 0x40 && c <= 0xfe && c != 0x7f) || - (c1 >= 0xaa && c1 <= 0xfe && c >= 0x40 && c <= 0xa0 && c != 0x7f) || - (c1 >= 0xa8 && c1 <= 0xa9 && c >= 0x40 && c <= 0xa0 && c != 0x7f)) { - w = (c1 - 0x81)*192 + c - 0x40; - ZEND_ASSERT(w < cp936_ucs_table_size); - CK((*filter->output_function)(cp936_ucs_table[w], filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - } - break; - - case 2: /* qbcs third byte */ - c1 = (filter->cache >> 8) & 0xff; - c2 = filter->cache & 0xff; - filter->status = filter->cache = 0; - if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c2 >= 0x30 && c2 <= 0x39 && c >= 0x81 && c <= 0xfe) { - filter->cache = (c1 << 16) | (c2 << 8) | c; - filter->status = 3; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 3: /* qbcs fourth byte */ - c1 = (filter->cache >> 16) & 0xff; - c2 = (filter->cache >> 8) & 0xff; - c3 = filter->cache & 0xff; - filter->status = filter->cache = 0; - if (((c1 >= 0x81 && c1 <= 0x84) || (c1 >= 0x90 && c1 <= 0xe3)) && c2 >= 0x30 && c2 <= 0x39 && c3 >= 0x81 && c3 <= 0xfe && c >= 0x30 && c <= 0x39) { - if (c1 >= 0x90 && c1 <= 0xe3) { - w = ((((c1 - 0x90)*10 + (c2 - 0x30))*126 + (c3 - 0x81)))*10 + (c - 0x30) + 0x10000; - if (w > 0x10FFFF) { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - return 0; - } - } else { /* Unicode BMP */ - w = (((c1 - 0x81)*10 + (c2 - 0x30))*126 + (c3 - 0x81))*10 + (c - 0x30); - if (w >= 0 && w <= 39419) { - int k = mbfl_bisec_srch(w, mbfl_gb2uni_tbl, mbfl_gb_uni_max); - w += mbfl_gb_uni_ofst[k]; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - return 0; - } - } - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_gb18030_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - /* multi-byte character was truncated */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_gb18030(int c, mbfl_convert_filter *filter) -{ - int k, k1, k2; - int c1, s = 0, s1 = 0; - - if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) { - if (c == 0x01f9) { - s = 0xa8bf; - } else { - s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min]; - } - } else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) { - if (c == 0x20ac) { /* euro-sign */ - s = 0xa2e3; - } else { - s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min]; - } - } else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) { - s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min]; - } else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) { - s = ucs_i_cp936_table[c - ucs_i_cp936_table_min]; - } else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) { - /* U+F900-FA2F CJK Compatibility Ideographs */ - if (c == 0xf92c) { - s = 0xfd9c; - } else if (c == 0xf979) { - s = 0xfd9d; - } else if (c == 0xf995) { - s = 0xfd9e; - } else if (c == 0xf9e7) { - s = 0xfd9f; - } else if (c == 0xf9f1) { - s = 0xfda0; - } else if (c >= 0xfa0c && c <= 0xfa29) { - s = ucs_ci_s_cp936_table[c - 0xfa0c]; - } - } else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) { - /* FE30h CJK Compatibility Forms */ - s = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min]; - } else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) { - /* U+FE50-FE6F Small Form Variants */ - s = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min]; - } else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) { - /* U+FF00-FFFF HW/FW Forms */ - if (c == 0xff04) { - s = 0xa1e7; - } else if (c == 0xff5e) { - s = 0xa1ab; - } else if (c >= 0xff01 && c <= 0xff5d) { - s = c - 0xff01 + 0xa3a1; - } else if (c >= 0xffe0 && c <= 0xffe5) { - s = ucs_hff_s_cp936_table[c-0xffe0]; - } - } - - /* While GB18030 and CP936 are very similar, some mappings are different between these encodings; - * do a binary search in a table of differing codepoints to see if we have one */ - if (s <= 0 && c >= mbfl_gb18030_c_tbl_key[0] && c <= mbfl_gb18030_c_tbl_key[mbfl_gb18030_c_tbl_max-1]) { - k1 = mbfl_bisec_srch2(c, mbfl_gb18030_c_tbl_key, mbfl_gb18030_c_tbl_max); - if (k1 >= 0) { - s = mbfl_gb18030_c_tbl_val[k1]; - } - } - - if (c >= 0xe000 && c <= 0xe864) { /* PUA */ - if (c < 0xe766) { - if (c < 0xe4c6) { - c1 = c - 0xe000; - s = (c1 % 94) + 0xa1; - c1 /= 94; - s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2) << 8; - } else { - c1 = c - 0xe4c6; - s = ((c1 / 96) + 0xa1) << 8; - c1 %= 96; - s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40); - } - } else { - /* U+E766..U+E864 */ - k1 = 0; - k2 = mbfl_gb18030_pua_tbl_max; - while (k1 < k2) { - k = (k1 + k2) >> 1; - if (c < mbfl_gb18030_pua_tbl[k][0]) { - k2 = k; - } else if (c > mbfl_gb18030_pua_tbl[k][1]) { - k1 = k + 1; - } else { - s = c - mbfl_gb18030_pua_tbl[k][0] + mbfl_gb18030_pua_tbl[k][2]; - break; - } - } - } - } - - /* If we have not yet found a suitable mapping for this codepoint, it requires a 4-byte code */ - if (s <= 0 && c >= 0x0080 && c <= 0xffff) { - /* BMP */ - s = mbfl_bisec_srch(c, mbfl_uni2gb_tbl, mbfl_gb_uni_max); - if (s >= 0) { - c1 = c - mbfl_gb_uni_ofst[s]; - s = (c1 % 10) + 0x30; - c1 /= 10; - s |= ((c1 % 126) + 0x81) << 8; - c1 /= 126; - s |= ((c1 % 10) + 0x30) << 16; - c1 /= 10; - s1 = c1 + 0x81; - } - } else if (c >= 0x10000 && c <= 0x10ffff) { - /* Code set 3: Unicode U+10000..U+10FFFF */ - c1 = c - 0x10000; - s = (c1 % 10) + 0x30; - c1 /= 10; - s |= ((c1 % 126) + 0x81) << 8; - c1 /= 126; - s |= ((c1 % 10) + 0x30) << 16; - c1 /= 10; - s1 = c1 + 0x90; - } - - if (c == 0) { - s = 0; - } else if (s == 0) { - s = -1; - } - - if (s >= 0) { - if (s <= 0x80) { /* latin */ - CK((*filter->output_function)(s, filter->data)); - } else if (s1 > 0) { /* qbcs */ - CK((*filter->output_function)(s1 & 0xff, filter->data)); - CK((*filter->output_function)((s >> 16) & 0xff, filter->data)); - CK((*filter->output_function)((s >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(s & 0xff, filter->data)); - } else { /* dbcs */ - CK((*filter->output_function)((s >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(s & 0xff, filter->data)); - } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - static const unsigned short gb18030_pua_tbl3[] = { /* 0xFE50 */ 0x0000,0xE816,0xE817,0xE818,0x0000,0x0000,0x0000,0x0000, @@ -11184,216 +8225,37 @@ static void mb_wchar_to_gb18030(uint32_t *in, size_t len, mb_convert_buf *buf, b c1 /= 10; s |= ((c1 % 126) + 0x81) << 8; c1 /= 126; - s |= ((c1 % 10) + 0x30) << 16; - c1 /= 10; - s |= (c1 + 0x81) << 24; - } - } else if (w >= 0x10000 && w <= 0x10FFFF) { - /* Code set 3: Unicode U+10000-U+10FFFF */ - unsigned int c1 = w - 0x10000; - s = (c1 % 10) + 0x30; - c1 /= 10; - s |= ((c1 % 126) + 0x81) << 8; - c1 /= 126; - s |= ((c1 % 10) + 0x30) << 16; - c1 /= 10; - s |= (c1 + 0x90) << 24; - } - - if (!s) { - MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030); - MB_CONVERT_BUF_ENSURE(buf, out, limit, len); - } else if (s < 0x80) { - out = mb_convert_buf_add(out, s); - } else if (s > 0xFFFFFF) { - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); - out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF); - } else { - MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); - out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); - } - } - - MB_CONVERT_BUF_STORE(buf, out, limit); -} - -static int mbfl_filt_conv_cp936_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, c2, w = -1; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (c == 0x80) { /* euro sign */ - CK((*filter->output_function)(0x20ac, filter->data)); - } else if (c < 0xff) { /* dbcs lead byte */ - filter->status = 1; - filter->cache = c; - } else { /* 0xff */ - CK((*filter->output_function)(0xf8f5, filter->data)); - } - break; - - case 1: /* dbcs second byte */ - filter->status = 0; - c1 = filter->cache; - - if (((c1 >= 0xaa && c1 <= 0xaf) || (c1 >= 0xf8 && c1 <= 0xfe)) && - (c >= 0xa1 && c <= 0xfe)) { - /* UDA part1,2: U+E000-U+E4C5 */ - w = 94*(c1 >= 0xf8 ? c1 - 0xf2 : c1 - 0xaa) + (c - 0xa1) + 0xe000; - CK((*filter->output_function)(w, filter->data)); - } else if (c1 >= 0xa1 && c1 <= 0xa7 && c >= 0x40 && c < 0xa1 && c != 0x7f) { - /* UDA part3 : U+E4C6-U+E765*/ - w = 96*(c1 - 0xa1) + c - (c >= 0x80 ? 0x41 : 0x40) + 0xe4c6; - CK((*filter->output_function)(w, filter->data)); - } - - c2 = (c1 << 8) | c; - - if (w <= 0 && ( - (c2 >= 0xa2ab && c2 <= 0xa9f0 + (0xe80f-0xe801)) || - (c2 >= 0xd7fa && c2 <= 0xd7fa + (0xe814-0xe810)) || - (c2 >= 0xfe50 && c2 <= 0xfe80 + (0xe864-0xe844)) - )) { - size_t k; - for (k = 0; k < mbfl_cp936_pua_tbl_max; k++) { - if (c2 >= mbfl_cp936_pua_tbl[k][2] && - c2 <= mbfl_cp936_pua_tbl[k][2] + - mbfl_cp936_pua_tbl[k][1] - mbfl_cp936_pua_tbl[k][0]) { - w = c2 - mbfl_cp936_pua_tbl[k][2] + mbfl_cp936_pua_tbl[k][0]; - CK((*filter->output_function)(w, filter->data)); - break; - } - } - } - - if (w <= 0) { - if (c1 < 0xff && c1 > 0x80 && c >= 0x40 && c < 0xff && c != 0x7f) { - w = (c1 - 0x81)*192 + c - 0x40; - ZEND_ASSERT(w < cp936_ucs_table_size); - CK((*filter->output_function)(cp936_ucs_table[w], filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_cp936_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - /* 2-byte character was truncated */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_cp936(int c, mbfl_convert_filter *filter) -{ - int k, k1, k2; - int c1, s = 0; - - if (c >= ucs_a1_cp936_table_min && c < ucs_a1_cp936_table_max) { - /* U+0000 - U+0451 */ - s = ucs_a1_cp936_table[c - ucs_a1_cp936_table_min]; - } else if (c >= ucs_a2_cp936_table_min && c < ucs_a2_cp936_table_max) { - /* U+2000 - U+26FF */ - if (c == 0x203e) { - s = 0xa3fe; - } else if (c == 0x2218) { - s = 0xa1e3; - } else if (c == 0x223c) { - s = 0xa1ab; - } else { - s = ucs_a2_cp936_table[c - ucs_a2_cp936_table_min]; - } - } else if (c >= ucs_a3_cp936_table_min && c < ucs_a3_cp936_table_max) { - /* U+2F00 - U+33FF */ - s = ucs_a3_cp936_table[c - ucs_a3_cp936_table_min]; - } else if (c >= ucs_i_cp936_table_min && c < ucs_i_cp936_table_max) { - /* U+4D00-9FFF CJK Unified Ideographs (+ Extension A) */ - s = ucs_i_cp936_table[c - ucs_i_cp936_table_min]; - } else if (c >= 0xe000 && c <= 0xe864) { /* PUA */ - if (c < 0xe766) { - if (c < 0xe4c6) { - c1 = c - 0xe000; - s = (c1 % 94) + 0xa1; c1 /= 94; - s |= (c1 < 0x06 ? c1 + 0xaa : c1 + 0xf2) << 8; - } else { - c1 = c - 0xe4c6; - s = ((c1 / 96) + 0xa1) << 8; c1 %= 96; - s |= c1 + (c1 >= 0x3f ? 0x41 : 0x40); - } - } else { - /* U+E766..U+E864 */ - k1 = 0; k2 = mbfl_cp936_pua_tbl_max; - while (k1 < k2) { - k = (k1 + k2) >> 1; - if (c < mbfl_cp936_pua_tbl[k][0]) { - k2 = k; - } else if (c > mbfl_cp936_pua_tbl[k][1]) { - k1 = k + 1; - } else { - s = c - mbfl_cp936_pua_tbl[k][0] + mbfl_cp936_pua_tbl[k][2]; - break; - } + s |= ((c1 % 10) + 0x30) << 16; + c1 /= 10; + s |= (c1 + 0x81) << 24; } + } else if (w >= 0x10000 && w <= 0x10FFFF) { + /* Code set 3: Unicode U+10000-U+10FFFF */ + unsigned int c1 = w - 0x10000; + s = (c1 % 10) + 0x30; + c1 /= 10; + s |= ((c1 % 126) + 0x81) << 8; + c1 /= 126; + s |= ((c1 % 10) + 0x30) << 16; + c1 /= 10; + s |= (c1 + 0x90) << 24; } - } else if (c == 0xf8f5) { - s = 0xff; - } else if (c >= ucs_ci_cp936_table_min && c < ucs_ci_cp936_table_max) { - /* U+F900-FA2F CJK Compatibility Ideographs */ - s = ucs_ci_cp936_table[c - ucs_ci_cp936_table_min]; - } else if (c >= ucs_cf_cp936_table_min && c < ucs_cf_cp936_table_max) { - s = ucs_cf_cp936_table[c - ucs_cf_cp936_table_min]; - } else if (c >= ucs_sfv_cp936_table_min && c < ucs_sfv_cp936_table_max) { - s = ucs_sfv_cp936_table[c - ucs_sfv_cp936_table_min]; /* U+FE50-FE6F Small Form Variants */ - } else if (c >= ucs_hff_cp936_table_min && c < ucs_hff_cp936_table_max) { - /* U+FF00-FFFF HW/FW Forms */ - if (c == 0xff04) { - s = 0xa1e7; - } else if (c == 0xff5e) { - s = 0xa1ab; - } else if (c >= 0xff01 && c <= 0xff5d) { - s = c - 0xff01 + 0xa3a1; - } else if (c >= 0xffe0 && c <= 0xffe5) { - s = ucs_hff_s_cp936_table[c-0xffe0]; - } - } - - if (s <= 0) { - if (c == 0) { - s = 0; - } else if (s <= 0) { - s = -1; - } - } - if (s >= 0) { - if (s <= 0x80 || s == 0xff) { /* latin */ - CK((*filter->output_function)(s, filter->data)); + if (!s) { + MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_gb18030); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len); + } else if (s < 0x80) { + out = mb_convert_buf_add(out, s); + } else if (s > 0xFFFFFF) { + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 4); + out = mb_convert_buf_add4(out, (s >> 24) & 0xFF, (s >> 16) & 0xFF, (s >> 8) & 0xFF, s & 0xFF); } else { - CK((*filter->output_function)((s >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(s & 0xff, filter->data)); + MB_CONVERT_BUF_ENSURE(buf, out, limit, len + 2); + out = mb_convert_buf_add2(out, (s >> 8) & 0xFF, s & 0xFF); } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); } - return 0; + MB_CONVERT_BUF_STORE(buf, out, limit); } static size_t mb_cp936_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) @@ -11915,26 +8777,6 @@ static zend_string* mb_cut_gb18030(unsigned char *str, size_t from, size_t len, static const char *mbfl_encoding_gb18030_aliases[] = {"gb-18030", "gb-18030-2000", NULL}; -static const struct mbfl_convert_vtbl vtbl_gb18030_wchar = { - mbfl_no_encoding_gb18030, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_gb18030_wchar, - mbfl_filt_conv_gb18030_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_gb18030 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_gb18030, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_gb18030, - mbfl_filt_conv_common_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_gb18030 = { mbfl_no_encoding_gb18030, "GB18030", @@ -11942,8 +8784,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = { mbfl_encoding_gb18030_aliases, NULL, MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_gb18030_wchar, - &vtbl_wchar_gb18030, + NULL, + NULL, mb_gb18030_to_wchar, mb_wchar_to_gb18030, NULL, @@ -11952,26 +8794,6 @@ const mbfl_encoding mbfl_encoding_gb18030 = { static const char *mbfl_encoding_cp936_aliases[] = {"CP-936", "GBK", NULL}; -static const struct mbfl_convert_vtbl vtbl_cp936_wchar = { - mbfl_no_encoding_cp936, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_cp936_wchar, - mbfl_filt_conv_cp936_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_cp936 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_cp936, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_cp936, - mbfl_filt_conv_common_flush, - NULL, -}; - const mbfl_encoding mbfl_encoding_cp936 = { mbfl_no_encoding_cp936, "CP936", @@ -11979,8 +8801,8 @@ const mbfl_encoding mbfl_encoding_cp936 = { mbfl_encoding_cp936_aliases, mblen_table_81_to_fe, MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_cp936_wchar, - &vtbl_wchar_cp936, + NULL, + NULL, mb_cp936_to_wchar, mb_wchar_to_cp936, NULL, @@ -12025,247 +8847,6 @@ static inline int is_in_cp950_pua(int c1, int c) return 0; } -static int mbfl_filt_conv_big5_wchar(int c, mbfl_convert_filter *filter) -{ - int c1, w; - - switch (filter->status) { - case 0: - if (c >= 0 && c < 0x80) { /* latin */ - CK((*filter->output_function)(c, filter->data)); - } else if (filter->from->no_encoding != mbfl_no_encoding_cp950 && c > 0xA0 && c <= 0xF9 && c != 0xC8) { - filter->status = 1; - filter->cache = c; - } else if (filter->from->no_encoding == mbfl_no_encoding_cp950 && c > 0x80 && c <= 0xFE) { - filter->status = 1; - filter->cache = c; - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - case 1: /* dbcs second byte */ - filter->status = 0; - c1 = filter->cache; - if ((c > 0x3f && c < 0x7f) || (c > 0xa0 && c < 0xff)) { - if (c < 0x7f) { - w = (c1 - 0xa1)*157 + (c - 0x40); - } else { - w = (c1 - 0xa1)*157 + (c - 0xa1) + 0x3f; - } - if (w >= 0 && w < big5_ucs_table_size) { - w = big5_ucs_table[w]; - } else { - w = 0; - } - - if (filter->from->no_encoding == mbfl_no_encoding_cp950) { - /* PUA for CP950 */ - if (is_in_cp950_pua(c1, c)) { - int c2 = (c1 << 8) | c; - - size_t k; - for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) { - if (c2 >= cp950_pua_tbl[k][2] && c2 <= cp950_pua_tbl[k][3]) { - break; - } - } - - if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) { - w = 157*(c1 - (cp950_pua_tbl[k][2]>>8)) + c - (c >= 0xa1 ? 0x62 : 0x40) + cp950_pua_tbl[k][0]; - } else { - w = c2 - cp950_pua_tbl[k][2] + cp950_pua_tbl[k][0]; - } - } else if (c1 == 0xA1) { - if (c == 0x45) { - w = 0x2027; - } else if (c == 0x4E) { - w = 0xFE51; - } else if (c == 0x5A) { - w = 0x2574; - } else if (c == 0xC2) { - w = 0x00AF; - } else if (c == 0xC3) { - w = 0xFFE3; - } else if (c == 0xC5) { - w = 0x02CD; - } else if (c == 0xE3) { - w = 0xFF5E; - } else if (c == 0xF2) { - w = 0x2295; - } else if (c == 0xF3) { - w = 0x2299; - } else if (c == 0xFE) { - w = 0xFF0F; - } - } else if (c1 == 0xA2) { - if (c == 0x40) { - w = 0xFF3C; - } else if (c == 0x41) { - w = 0x2215; - } else if (c == 0x42) { - w = 0xFE68; - } else if (c == 0x46) { - w = 0xFFE0; - } else if (c == 0x47) { - w = 0xFFE1; - } else if (c == 0xCC) { - w = 0x5341; - } else if (c == 0xCE) { - w = 0x5345; - } - } - } - - if (w <= 0) { - w = MBFL_BAD_INPUT; - } - CK((*filter->output_function)(w, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_big5_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status == 1) { - /* 2-byte character was truncated */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter) -{ - int s = 0; - - if (c >= ucs_a1_big5_table_min && c < ucs_a1_big5_table_max) { - s = ucs_a1_big5_table[c - ucs_a1_big5_table_min]; - } else if (c >= ucs_a2_big5_table_min && c < ucs_a2_big5_table_max) { - s = ucs_a2_big5_table[c - ucs_a2_big5_table_min]; - } else if (c >= ucs_a3_big5_table_min && c < ucs_a3_big5_table_max) { - s = ucs_a3_big5_table[c - ucs_a3_big5_table_min]; - } else if (c >= ucs_i_big5_table_min && c < ucs_i_big5_table_max) { - s = ucs_i_big5_table[c - ucs_i_big5_table_min]; - } else if (c >= ucs_r1_big5_table_min && c < ucs_r1_big5_table_max) { - s = ucs_r1_big5_table[c - ucs_r1_big5_table_min]; - } else if (c >= ucs_r2_big5_table_min && c < ucs_r2_big5_table_max) { - s = ucs_r2_big5_table[c - ucs_r2_big5_table_min]; - } - - if (filter->to->no_encoding == mbfl_no_encoding_cp950) { - if (c >= 0xe000 && c <= 0xf848) { /* PUA for CP950 */ - size_t k; - for (k = 0; k < sizeof(cp950_pua_tbl) / (sizeof(unsigned short)*4); k++) { - if (c <= cp950_pua_tbl[k][1]) { - break; - } - } - - int c1 = c - cp950_pua_tbl[k][0]; - if ((cp950_pua_tbl[k][2] & 0xff) == 0x40) { - int c2 = cp950_pua_tbl[k][2] >> 8; - s = ((c1 / 157) + c2) << 8; - c1 %= 157; - s |= c1 + (c1 >= 0x3f ? 0x62 : 0x40); - } else { - s = c1 + cp950_pua_tbl[k][2]; - } - } else if (c == 0x00A2) { - s = 0; - } else if (c == 0x00A3) { - s = 0; - } else if (c == 0x00AF) { - s = 0xA1C2; - } else if (c == 0x02CD) { - s = 0xA1C5; - } else if (c == 0x0401) { - s = 0; - } else if (c >= 0x0414 && c <= 0x041C) { - s = 0; - } else if (c >= 0x0423 && c <= 0x044F) { - s = 0; - } else if (c == 0x0451) { - s = 0; - } else if (c == 0x2022) { - s = 0; - } else if (c == 0x2027) { - s = 0xA145; - } else if (c == 0x203E) { - s = 0; - } else if (c == 0x2215) { - s = 0xA241; - } else if (c == 0x223C) { - s = 0; - } else if (c == 0x2295) { - s = 0xA1F2; - } else if (c == 0x2299) { - s = 0xA1F3; - } else if (c >= 0x2460 && c <= 0x247D) { - s = 0; - } else if (c == 0x2574) { - s = 0xA15A; - } else if (c == 0x2609) { - s = 0; - } else if (c == 0x2641) { - s = 0; - } else if (c == 0x3005 || (c >= 0x302A && c <= 0x30FF)) { - s = 0; - } else if (c == 0xFE51) { - s = 0xA14E; - } else if (c == 0xFE68) { - s = 0xA242; - } else if (c == 0xFF3C) { - s = 0xA240; - } else if (c == 0xFF5E) { - s = 0xA1E3; - } else if (c == 0xFF64) { - s = 0; - } else if (c == 0xFFE0) { - s = 0xA246; - } else if (c == 0xFFE1) { - s = 0xA247; - } else if (c == 0xFFE3) { - s = 0xA1C3; - } else if (c == 0xFF0F) { - s = 0xA1FE; - } - } - - if (s <= 0) { - if (c == 0) { - s = 0; - } else { - s = -1; - } - } - - if (s >= 0) { - if (s <= 0x80) { /* latin */ - CK((*filter->output_function)(s, filter->data)); - } else { - CK((*filter->output_function)((s >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(s & 0xff, filter->data)); - } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - static size_t mb_big5_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { unsigned char *p = *in, *e = p + *in_len; @@ -12539,26 +9120,6 @@ static void mb_wchar_to_cp950(uint32_t *in, size_t len, mb_convert_buf *buf, boo static const char *mbfl_encoding_big5_aliases[] = {"CN-BIG5", "BIG-FIVE", "BIGFIVE", NULL}; -static const struct mbfl_convert_vtbl vtbl_big5_wchar = { - mbfl_no_encoding_big5, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_big5_wchar, - mbfl_filt_conv_big5_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_big5 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_big5, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_big5, - mbfl_filt_conv_common_flush, - NULL -}; - const mbfl_encoding mbfl_encoding_big5 = { mbfl_no_encoding_big5, "BIG-5", @@ -12566,31 +9127,11 @@ const mbfl_encoding mbfl_encoding_big5 = { mbfl_encoding_big5_aliases, mblen_table_81_to_fe, MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_big5_wchar, - &vtbl_wchar_big5, - mb_big5_to_wchar, - mb_wchar_to_big5, NULL, NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_cp950_wchar = { - mbfl_no_encoding_cp950, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_big5_wchar, - mbfl_filt_conv_big5_wchar_flush, - NULL, -}; - -static const struct mbfl_convert_vtbl vtbl_wchar_cp950 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_cp950, - mbfl_filt_conv_common_ctor, + mb_big5_to_wchar, + mb_wchar_to_big5, NULL, - mbfl_filt_conv_wchar_big5, - mbfl_filt_conv_common_flush, NULL, }; @@ -12601,8 +9142,8 @@ const mbfl_encoding mbfl_encoding_cp950 = { NULL, mblen_table_81_to_fe, MBFL_ENCTYPE_GL_UNSAFE, - &vtbl_cp950_wchar, - &vtbl_wchar_cp950, + NULL, + NULL, mb_cp950_to_wchar, mb_wchar_to_cp950, NULL, diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cjk.h b/ext/mbstring/libmbfl/filters/mbfilter_cjk.h index bb0e672bef44d..f7e2184986c1f 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cjk.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_cjk.h @@ -42,8 +42,4 @@ int mbfilter_sjis_emoji_docomo2unicode(int s, int *snd); int mbfilter_sjis_emoji_kddi2unicode(int s, int *snd); int mbfilter_sjis_emoji_sb2unicode(int s, int *snd); -int mbfilter_unicode2sjis_emoji_docomo(int c, int *s1, mbfl_convert_filter *filter); -int mbfilter_unicode2sjis_emoji_kddi_sjis(int c, int *s1, mbfl_convert_filter *filter); -int mbfilter_unicode2sjis_emoji_sb(int c, int *s1, mbfl_convert_filter *filter); - #endif /* MBFL_MBFILTER_CJK_H */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.h b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.h index f48ec7cb3d4c0..6729edb272d61 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.h @@ -33,10 +33,5 @@ #include "mbfilter.h" extern const mbfl_encoding mbfl_encoding_cp51932; -extern const struct mbfl_convert_vtbl vtbl_cp51932_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_cp51932; - -int mbfl_filt_conv_cp51932_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter); #endif /* MBFL_MBFILTER_CP51932_H */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c index 7ced00fa536e1..ebb44cc3154fc 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c @@ -21,61 +21,10 @@ static inline uint32_t coalesce(uint32_t a, uint32_t b) return a ? a : b; } -/* Helper for single-byte encodings which use a conversion table */ -static int mbfl_conv_singlebyte_table(int c, mbfl_convert_filter *filter, int tbl_min, const unsigned short tbl[]) -{ - if (c >= 0 && c < tbl_min) { - CK((*filter->output_function)(c, filter->data)); - } else if (c < 0) { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } else { - CK((*filter->output_function)(coalesce(tbl[c - tbl_min], MBFL_BAD_INPUT), filter->data)); - } - return 0; -} - -static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int tbl_min, const unsigned short tbl[]) -{ - if (c >= 0 && c < tbl_min) { - CK((*filter->output_function)(c, filter->data)); - } else if (c < 0 || c == MBFL_BAD_INPUT) { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } else { - for (int i = 0; i < 256 - tbl_min; i++) { - if (c == tbl[i]) { - CK((*filter->output_function)(i + tbl_min, filter->data)); - return 0; - } - } - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - return 0; -} - /* Initialize data structures for a single-byte encoding */ #define DEF_SB(id, name, mime_name, aliases) \ - static int mbfl_filt_conv_##id##_wchar(int c, mbfl_convert_filter *filter); \ - static int mbfl_filt_conv_wchar_##id(int c, mbfl_convert_filter *filter); \ static size_t mb_##id##_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); \ static void mb_wchar_to_##id(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); \ - static const struct mbfl_convert_vtbl vtbl_##id##_wchar = { \ - mbfl_no_encoding_##id, \ - mbfl_no_encoding_wchar, \ - mbfl_filt_conv_common_ctor, \ - NULL, \ - mbfl_filt_conv_##id##_wchar, \ - mbfl_filt_conv_common_flush, \ - NULL \ - }; \ - static const struct mbfl_convert_vtbl vtbl_wchar_##id = { \ - mbfl_no_encoding_wchar, \ - mbfl_no_encoding_##id, \ - mbfl_filt_conv_common_ctor, \ - NULL, \ - mbfl_filt_conv_wchar_##id, \ - mbfl_filt_conv_common_flush, \ - NULL \ - }; \ const mbfl_encoding mbfl_encoding_##id = { \ mbfl_no_encoding_##id, \ name, \ @@ -83,8 +32,8 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int aliases, \ NULL, \ MBFL_ENCTYPE_SBCS, \ - &vtbl_##id##_wchar, \ - &vtbl_wchar_##id, \ + NULL, \ + NULL, \ mb_##id##_to_wchar, \ mb_wchar_to_##id, \ NULL, \ @@ -93,12 +42,6 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int /* For single-byte encodings which use a conversion table */ #define DEF_SB_TBL(id, name, mime_name, aliases, tbl_min, tbl) \ - static int mbfl_filt_conv_##id##_wchar(int c, mbfl_convert_filter *filter) { \ - return mbfl_conv_singlebyte_table(c, filter, tbl_min, tbl); \ - } \ - static int mbfl_filt_conv_wchar_##id(int c, mbfl_convert_filter *filter) { \ - return mbfl_conv_reverselookup_table(c, filter, tbl_min, tbl); \ - } \ static size_t mb_##id##_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) \ { \ unsigned char *p = *in, *e = p + *in_len; \ @@ -140,22 +83,6 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int static const char *ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "IBM-367", "cp367", "csASCII", NULL}; DEF_SB(ascii, "ASCII", "US-ASCII", ascii_aliases); -static int mbfl_filt_conv_ascii_wchar(int c, mbfl_convert_filter *filter) -{ - CK((*filter->output_function)((c < 0x80) ? c : MBFL_BAD_INPUT, filter->data)); - return 0; -} - -static int mbfl_filt_conv_wchar_ascii(int c, mbfl_convert_filter *filter) -{ - if (c >= 0 && c < 0x80 && c != MBFL_BAD_INPUT) { - CK((*filter->output_function)(c, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - return 0; -} - static size_t mb_ascii_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { unsigned char *p = *in, *e = p + *in_len; @@ -195,21 +122,6 @@ static void mb_wchar_to_ascii(uint32_t *in, size_t len, mb_convert_buf *buf, boo static const char *iso8859_1_aliases[] = {"ISO8859-1", "latin1", NULL}; DEF_SB(8859_1, "ISO-8859-1", "ISO-8859-1", iso8859_1_aliases); -static int mbfl_filt_conv_8859_1_wchar(int c, mbfl_convert_filter *filter) -{ - return (*filter->output_function)(c, filter->data); -} - -static int mbfl_filt_conv_wchar_8859_1(int c, mbfl_convert_filter *filter) -{ - if (c >= 0 && c < 0x100 && c != MBFL_BAD_INPUT) { - CK((*filter->output_function)(c, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - return 0; -} - static size_t mb_8859_1_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { unsigned char *p = *in, *e = p + *in_len; @@ -494,38 +406,6 @@ static const unsigned short cp1252_ucs_table[] = { }; DEF_SB(cp1252, "Windows-1252", "Windows-1252", cp1252_aliases); -static int mbfl_filt_conv_wchar_cp1252(int c, mbfl_convert_filter *filter) -{ - if (c < 0 || c == MBFL_BAD_INPUT) { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } else if (c >= 0x100) { - for (int n = 0; n < 32; n++) { - if (c == cp1252_ucs_table[n]) { - CK((*filter->output_function)(0x80 + n, filter->data)); - return 0; - } - } - CK(mbfl_filt_conv_illegal_output(c, filter)); - } else if (c <= 0x7F || c >= 0xA0 || c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D) { - CK((*filter->output_function)(c, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - return 0; -} - -static int mbfl_filt_conv_cp1252_wchar(int c, mbfl_convert_filter *filter) -{ - int s; - if (c >= 0x80 && c < 0xA0) { - s = coalesce(cp1252_ucs_table[c - 0x80], MBFL_BAD_INPUT); - } else { - s = c; - } - CK((*filter->output_function)(s, filter->data)); - return 0; -} - static size_t mb_cp1252_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { unsigned char *p = *in, *e = p + *in_len; @@ -701,32 +581,6 @@ static const unsigned char ucs_armscii8_table[] = { }; DEF_SB(armscii8, "ArmSCII-8", "ArmSCII-8", armscii8_aliases); -static int mbfl_filt_conv_armscii8_wchar(int c, mbfl_convert_filter *filter) -{ - CK((*filter->output_function)((c < 0xA0) ? c : coalesce(armscii8_ucs_table[c - 0xA0], MBFL_BAD_INPUT), filter->data)); - return 0; -} - -static int mbfl_filt_conv_wchar_armscii8(int c, mbfl_convert_filter *filter) -{ - if (c >= 0x28 && c <= 0x2F) { - CK((*filter->output_function)(ucs_armscii8_table[c - 0x28], filter->data)); - } else if (c < 0 || c == MBFL_BAD_INPUT) { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } else if (c < 0xA0) { - CK((*filter->output_function)(c, filter->data)); - } else { - for (int n = 0; n < 0x60; n++) { - if (c == armscii8_ucs_table[n]) { - CK((*filter->output_function)(0xA0 + n, filter->data)); - return 0; - } - } - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - return 0; -} - static size_t mb_armscii8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { unsigned char *p = *in, *e = p + *in_len; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c index 01b569482b601..7639412253554 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c @@ -30,7 +30,6 @@ #include "mbfilter.h" #include "mbfilter_ucs2.h" -static int mbfl_filt_conv_ucs2_wchar_flush(mbfl_convert_filter *filter); static size_t mb_ucs2_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static size_t mb_ucs2be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_ucs2be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); @@ -53,8 +52,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = { mbfl_encoding_ucs2_aliases, NULL, MBFL_ENCTYPE_WCS2, - &vtbl_ucs2_wchar, - &vtbl_wchar_ucs2, + NULL, + NULL, mb_ucs2_to_wchar, mb_wchar_to_ucs2be, NULL, @@ -68,8 +67,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = { mbfl_encoding_ucs2be_aliases, NULL, MBFL_ENCTYPE_WCS2, - &vtbl_ucs2be_wchar, - &vtbl_wchar_ucs2be, + NULL, + NULL, mb_ucs2be_to_wchar, mb_wchar_to_ucs2be, NULL, @@ -83,158 +82,14 @@ const mbfl_encoding mbfl_encoding_ucs2le = { mbfl_encoding_ucs2le_aliases, NULL, MBFL_ENCTYPE_WCS2, - &vtbl_ucs2le_wchar, - &vtbl_wchar_ucs2le, - mb_ucs2le_to_wchar, - mb_wchar_to_ucs2le, NULL, NULL, -}; - -const struct mbfl_convert_vtbl vtbl_ucs2_wchar = { - mbfl_no_encoding_ucs2, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_ucs2_wchar, - mbfl_filt_conv_ucs2_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_ucs2 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_ucs2, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_ucs2be, - mbfl_filt_conv_common_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_ucs2be_wchar = { - mbfl_no_encoding_ucs2be, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_ucs2be_wchar, - mbfl_filt_conv_ucs2_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_ucs2be = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_ucs2be, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_ucs2be, - mbfl_filt_conv_common_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_ucs2le_wchar = { - mbfl_no_encoding_ucs2le, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_ucs2le_wchar, - mbfl_filt_conv_ucs2_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_ucs2le = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_ucs2le, - mbfl_filt_conv_common_ctor, + mb_ucs2le_to_wchar, + mb_wchar_to_ucs2le, NULL, - mbfl_filt_conv_wchar_ucs2le, - mbfl_filt_conv_common_flush, NULL, }; -#define CK(statement) do { if ((statement) < 0) return (-1); } while (0) - -int mbfl_filt_conv_ucs2_wchar(int c, mbfl_convert_filter *filter) -{ - if (filter->status == 0) { - filter->status = 1; - filter->cache = c & 0xFF; - } else { - filter->status = 0; - int n = (filter->cache << 8) | (c & 0xFF); - if (n == 0xFFFE) { - /* Found little-endian byte order mark */ - filter->filter_function = mbfl_filt_conv_ucs2le_wchar; - } else { - filter->filter_function = mbfl_filt_conv_ucs2be_wchar; - if (n != 0xFEFF) { - CK((*filter->output_function)(n, filter->data)); - } - } - } - return 0; -} - -int mbfl_filt_conv_ucs2be_wchar(int c, mbfl_convert_filter *filter) -{ - if (filter->status == 0) { - filter->status = 1; - filter->cache = (c & 0xFF) << 8; - } else { - filter->status = 0; - CK((*filter->output_function)((c & 0xFF) | filter->cache, filter->data)); - } - return 0; -} - -int mbfl_filt_conv_wchar_ucs2be(int c, mbfl_convert_filter *filter) -{ - if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) { - CK((*filter->output_function)((c >> 8) & 0xFF, filter->data)); - CK((*filter->output_function)(c & 0xFF, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - return 0; -} - -int mbfl_filt_conv_ucs2le_wchar(int c, mbfl_convert_filter *filter) -{ - if (filter->status == 0) { - filter->status = 1; - filter->cache = c & 0xFF; - } else { - filter->status = 0; - CK((*filter->output_function)(((c & 0xFF) << 8) | filter->cache, filter->data)); - } - return 0; -} - -int mbfl_filt_conv_wchar_ucs2le(int c, mbfl_convert_filter *filter) -{ - if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) { - CK((*filter->output_function)(c & 0xFF, filter->data)); - CK((*filter->output_function)((c >> 8) & 0xFF, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - return 0; -} - -static int mbfl_filt_conv_ucs2_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - /* Input string was truncated */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - #define DETECTED_BE 1 #define DETECTED_LE 2 diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.h b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.h index bbf567a49339b..7e2993d8fbb52 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.h @@ -35,17 +35,5 @@ extern const mbfl_encoding mbfl_encoding_ucs2; extern const mbfl_encoding mbfl_encoding_ucs2be; extern const mbfl_encoding mbfl_encoding_ucs2le; -extern const struct mbfl_convert_vtbl vtbl_ucs2_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_ucs2; -extern const struct mbfl_convert_vtbl vtbl_ucs2be_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_ucs2be; -extern const struct mbfl_convert_vtbl vtbl_ucs2le_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_ucs2le; - -int mbfl_filt_conv_ucs2_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_ucs2be_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_ucs2be(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_ucs2le_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_ucs2le(int c, mbfl_convert_filter *filter); #endif /* MBFL_MBFILTER_UCS2_H */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c index 10b57061f7d9c..1731eb48add76 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c @@ -44,8 +44,6 @@ static const char *mbfl_encoding_ucs4_aliases[] = {"ISO-10646-UCS-4", "UCS4", NU static const char *mbfl_encoding_ucs4be_aliases[] = {"byte4be", NULL}; static const char *mbfl_encoding_ucs4le_aliases[] = {"byte4le", NULL}; -static int mbfl_filt_conv_ucs4_wchar_flush(mbfl_convert_filter *filter); - const mbfl_encoding mbfl_encoding_ucs4 = { mbfl_no_encoding_ucs4, "UCS-4", @@ -53,8 +51,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = { mbfl_encoding_ucs4_aliases, NULL, MBFL_ENCTYPE_WCS4, - &vtbl_ucs4_wchar, - &vtbl_wchar_ucs4, + NULL, + NULL, mb_ucs4_to_wchar, mb_wchar_to_ucs4be, NULL, @@ -68,8 +66,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = { mbfl_encoding_ucs4be_aliases, NULL, MBFL_ENCTYPE_WCS4, - &vtbl_ucs4be_wchar, - &vtbl_wchar_ucs4be, + NULL, + NULL, mb_ucs4be_to_wchar, mb_wchar_to_ucs4be, NULL, @@ -83,239 +81,14 @@ const mbfl_encoding mbfl_encoding_ucs4le = { mbfl_encoding_ucs4le_aliases, NULL, MBFL_ENCTYPE_WCS4, - &vtbl_ucs4le_wchar, - &vtbl_wchar_ucs4le, - mb_ucs4le_to_wchar, - mb_wchar_to_ucs4le, - NULL, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_ucs4_wchar = { - mbfl_no_encoding_ucs4, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_ucs4_wchar, - mbfl_filt_conv_ucs4_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_ucs4 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_ucs4, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_ucs4be, - mbfl_filt_conv_common_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_ucs4be_wchar = { - mbfl_no_encoding_ucs4be, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_ucs4be_wchar, - mbfl_filt_conv_ucs4_wchar_flush, NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_ucs4be = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_ucs4be, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_ucs4be, - mbfl_filt_conv_common_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_ucs4le_wchar = { - mbfl_no_encoding_ucs4le, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_ucs4le_wchar, - mbfl_filt_conv_ucs4_wchar_flush, NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_ucs4le = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_ucs4le, - mbfl_filt_conv_common_ctor, + mb_ucs4le_to_wchar, + mb_wchar_to_ucs4le, NULL, - mbfl_filt_conv_wchar_ucs4le, - mbfl_filt_conv_common_flush, NULL, }; - -#define CK(statement) do { if ((statement) < 0) return (-1); } while (0) - -/* - * UCS-4 => wchar - */ -int mbfl_filt_conv_ucs4_wchar(int c, mbfl_convert_filter *filter) -{ - int n, endian; - - endian = filter->status & 0xff00; - switch (filter->status & 0xff) { - case 0: - if (endian) { - n = c & 0xff; - } else { - n = (c & 0xffu) << 24; - } - filter->cache = n; - filter->status++; - break; - case 1: - if (endian) { - n = (c & 0xff) << 8; - } else { - n = (c & 0xff) << 16; - } - filter->cache |= n; - filter->status++; - break; - case 2: - if (endian) { - n = (c & 0xff) << 16; - } else { - n = (c & 0xff) << 8; - } - filter->cache |= n; - filter->status++; - break; - default: - if (endian) { - n = (c & 0xffu) << 24; - } else { - n = c & 0xff; - } - n |= filter->cache; - filter->status &= ~0xff; - if ((n & 0xffff) == 0 && ((n >> 16) & 0xffff) == 0xfffe) { - if (endian) { - filter->status = 0; /* big-endian */ - } else { - filter->status = 0x100; /* little-endian */ - } - } else if (n != 0xfeff) { - CK((*filter->output_function)(n, filter->data)); - } - break; - } - - return 0; -} - -/* - * UCS-4BE => wchar - */ -int mbfl_filt_conv_ucs4be_wchar(int c, mbfl_convert_filter *filter) -{ - int n; - - if (filter->status == 0) { - filter->status = 1; - n = (c & 0xffu) << 24; - filter->cache = n; - } else if (filter->status == 1) { - filter->status = 2; - n = (c & 0xff) << 16; - filter->cache |= n; - } else if (filter->status == 2) { - filter->status = 3; - n = (c & 0xff) << 8; - filter->cache |= n; - } else { - filter->status = 0; - n = (c & 0xff) | filter->cache; - CK((*filter->output_function)(n, filter->data)); - } - return 0; -} - -/* - * wchar => UCS-4BE - */ -int mbfl_filt_conv_wchar_ucs4be(int c, mbfl_convert_filter *filter) -{ - if (c != MBFL_BAD_INPUT) { - CK((*filter->output_function)((c >> 24) & 0xff, filter->data)); - CK((*filter->output_function)((c >> 16) & 0xff, filter->data)); - CK((*filter->output_function)((c >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(c & 0xff, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - -/* - * UCS-4LE => wchar - */ -int mbfl_filt_conv_ucs4le_wchar(int c, mbfl_convert_filter *filter) -{ - int n; - - if (filter->status == 0) { - filter->status = 1; - n = (c & 0xff); - filter->cache = n; - } else if (filter->status == 1) { - filter->status = 2; - n = (c & 0xff) << 8; - filter->cache |= n; - } else if (filter->status == 2) { - filter->status = 3; - n = (c & 0xff) << 16; - filter->cache |= n; - } else { - filter->status = 0; - n = ((c & 0xffu) << 24) | filter->cache; - CK((*filter->output_function)(n, filter->data)); - } - return 0; -} - -/* - * wchar => UCS-4LE - */ -int mbfl_filt_conv_wchar_ucs4le(int c, mbfl_convert_filter *filter) -{ - if (c != MBFL_BAD_INPUT) { - CK((*filter->output_function)(c & 0xff, filter->data)); - CK((*filter->output_function)((c >> 8) & 0xff, filter->data)); - CK((*filter->output_function)((c >> 16) & 0xff, filter->data)); - CK((*filter->output_function)((c >> 24) & 0xff, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - -static int mbfl_filt_conv_ucs4_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status & 0xF) { - /* Input string was truncated */ - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - filter->status = 0; - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - #define DETECTED_BE 1 #define DETECTED_LE 2 diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.h b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.h index b5280f1bfb336..8b825784664df 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.h @@ -33,17 +33,5 @@ extern const mbfl_encoding mbfl_encoding_ucs4; extern const mbfl_encoding mbfl_encoding_ucs4le; extern const mbfl_encoding mbfl_encoding_ucs4be; -extern const struct mbfl_convert_vtbl vtbl_ucs4_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_ucs4; -extern const struct mbfl_convert_vtbl vtbl_ucs4be_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_ucs4be; -extern const struct mbfl_convert_vtbl vtbl_ucs4le_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_ucs4le; - -int mbfl_filt_conv_ucs4_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_ucs4be_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_ucs4be(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_ucs4le_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_ucs4le(int c, mbfl_convert_filter *filter); #endif /* MBFL_MBFILTER_UCS4_H */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index 5f5958ad19b3e..29c4caeb8d94f 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -173,7 +173,6 @@ static void mb_wchar_to_utf16le_default(uint32_t *in, size_t len, mb_convert_buf #endif -static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter); static size_t mb_utf16_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static zend_string* mb_cut_utf16(unsigned char *str, size_t from, size_t len, unsigned char *end); static zend_string* mb_cut_utf16be(unsigned char *str, size_t from, size_t len, unsigned char *end); @@ -188,8 +187,8 @@ const mbfl_encoding mbfl_encoding_utf16 = { mbfl_encoding_utf16_aliases, NULL, 0, - &vtbl_utf16_wchar, - &vtbl_wchar_utf16, + NULL, + NULL, mb_utf16_to_wchar, mb_wchar_to_utf16be, NULL, @@ -203,8 +202,8 @@ const mbfl_encoding mbfl_encoding_utf16be = { NULL, NULL, 0, - &vtbl_utf16be_wchar, - &vtbl_wchar_utf16be, + NULL, + NULL, mb_utf16be_to_wchar, mb_wchar_to_utf16be, NULL, @@ -218,270 +217,14 @@ const mbfl_encoding mbfl_encoding_utf16le = { NULL, NULL, 0, - &vtbl_utf16le_wchar, - &vtbl_wchar_utf16le, + NULL, + NULL, mb_utf16le_to_wchar, mb_wchar_to_utf16le, NULL, mb_cut_utf16le }; -const struct mbfl_convert_vtbl vtbl_utf16_wchar = { - mbfl_no_encoding_utf16, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_utf16_wchar, - mbfl_filt_conv_utf16_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf16 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf16, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_utf16be, - mbfl_filt_conv_common_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_utf16be_wchar = { - mbfl_no_encoding_utf16be, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_utf16be_wchar, - mbfl_filt_conv_utf16_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf16be = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf16be, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_utf16be, - mbfl_filt_conv_common_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_utf16le_wchar = { - mbfl_no_encoding_utf16le, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_utf16le_wchar, - mbfl_filt_conv_utf16_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf16le = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf16le, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_utf16le, - mbfl_filt_conv_common_flush, - NULL, -}; - -#define CK(statement) do { if ((statement) < 0) return (-1); } while (0) - -int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter) -{ - /* Start with the assumption that the string is big-endian; - * If we find a little-endian BOM, then we will change that assumption */ - if (filter->status == 0) { - filter->cache = c & 0xFF; - filter->status = 1; - } else { - int n = (filter->cache << 8) | (c & 0xFF); - filter->cache = filter->status = 0; - if (n == 0xFFFE) { - /* Switch to little-endian mode */ - filter->filter_function = mbfl_filt_conv_utf16le_wchar; - } else { - filter->filter_function = mbfl_filt_conv_utf16be_wchar; - if (n >= 0xD800 && n <= 0xDBFF) { - filter->cache = n & 0x3FF; /* Pick out 10 data bits */ - filter->status = 2; - return 0; - } else if (n >= 0xDC00 && n <= 0xDFFF) { - /* This is wrong; second part of surrogate pair has come first */ - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } else if (n != 0xFEFF) { - CK((*filter->output_function)(n, filter->data)); - } - } - } - - return 0; -} - -int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter) -{ - int n; - - switch (filter->status) { - case 0: /* First byte */ - filter->cache = c & 0xFF; - filter->status = 1; - break; - - case 1: /* Second byte */ - n = (filter->cache << 8) | (c & 0xFF); - if (n >= 0xD800 && n <= 0xDBFF) { - filter->cache = n & 0x3FF; /* Pick out 10 data bits */ - filter->status = 2; - } else if (n >= 0xDC00 && n <= 0xDFFF) { - /* This is wrong; second part of surrogate pair has come first */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } else { - filter->status = 0; - CK((*filter->output_function)(n, filter->data)); - } - break; - - case 2: /* Second part of surrogate, first byte */ - filter->cache = (filter->cache << 8) | (c & 0xFF); - filter->status = 3; - break; - - case 3: /* Second part of surrogate, second byte */ - n = ((filter->cache & 0xFF) << 8) | (c & 0xFF); - if (n >= 0xD800 && n <= 0xDBFF) { - /* Wrong; that's the first half of a surrogate pair, not the second */ - filter->cache = n & 0x3FF; - filter->status = 2; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } else if (n >= 0xDC00 && n <= 0xDFFF) { - filter->status = 0; - n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000; - CK((*filter->output_function)(n, filter->data)); - } else { - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - CK((*filter->output_function)(n, filter->data)); - } - } - - return 0; -} - -int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter) -{ - int n; - - if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) { - CK((*filter->output_function)((c >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(c & 0xff, filter->data)); - } else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) { - n = ((c >> 10) - 0x40) | 0xd800; - CK((*filter->output_function)((n >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(n & 0xff, filter->data)); - n = (c & 0x3ff) | 0xdc00; - CK((*filter->output_function)((n >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(n & 0xff, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - -int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter) -{ - int n; - - switch (filter->status) { - case 0: - filter->cache = c & 0xff; - filter->status = 1; - break; - - case 1: - if ((c & 0xfc) == 0xd8) { - /* Looks like we have a surrogate pair here */ - filter->cache += ((c & 0x3) << 8); - filter->status = 2; - } else if ((c & 0xfc) == 0xdc) { - /* This is wrong; the second part of the surrogate pair has come first */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } else { - filter->status = 0; - CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data)); - } - break; - - case 2: - filter->cache = (filter->cache << 10) + (c & 0xff); - filter->status = 3; - break; - - case 3: - n = (filter->cache & 0xFF) | ((c & 0xFF) << 8); - if (n >= 0xD800 && n <= 0xDBFF) { - /* We previously saw the first part of a surrogate pair and were - * expecting the second part; this is another first part */ - filter->cache = n & 0x3FF; - filter->status = 2; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } else if (n >= 0xDC00 && n <= 0xDFFF) { - n = filter->cache + ((c & 0x3) << 8) + 0x10000; - filter->status = 0; - CK((*filter->output_function)(n, filter->data)); - } else { - /* The first part of a surrogate pair was followed by some other codepoint - * which is not part of a surrogate pair at all */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - CK((*filter->output_function)(n, filter->data)); - } - break; - } - - return 0; -} - -int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter) -{ - int n; - - if (c >= 0 && c < MBFL_WCSPLANE_UCS2MAX) { - CK((*filter->output_function)(c & 0xff, filter->data)); - CK((*filter->output_function)((c >> 8) & 0xff, filter->data)); - } else if (c >= MBFL_WCSPLANE_SUPMIN && c < MBFL_WCSPLANE_SUPMAX) { - n = ((c >> 10) - 0x40) | 0xd800; - CK((*filter->output_function)(n & 0xff, filter->data)); - CK((*filter->output_function)((n >> 8) & 0xff, filter->data)); - n = (c & 0x3ff) | 0xdc00; - CK((*filter->output_function)(n & 0xff, filter->data)); - CK((*filter->output_function)((n >> 8) & 0xff, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - -static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - /* Input string was truncated */ - filter->status = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - #define DETECTED_BE 1 #define DETECTED_LE 2 diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.h b/ext/mbstring/libmbfl/filters/mbfilter_utf16.h index 291628549debe..227912a495564 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.h @@ -34,19 +34,6 @@ extern const mbfl_encoding mbfl_encoding_utf16; extern const mbfl_encoding mbfl_encoding_utf16be; extern const mbfl_encoding mbfl_encoding_utf16le; -extern const struct mbfl_convert_vtbl vtbl_utf16_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf16; -extern const struct mbfl_convert_vtbl vtbl_utf16be_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf16be; -extern const struct mbfl_convert_vtbl vtbl_utf16le_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf16le; - -int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter); - #ifdef ZEND_INTRIN_AVX2_FUNC_PTR void init_convert_utf16(void); #endif diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c index 81057d8c6e95d..e82d5df5706c1 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c @@ -30,7 +30,6 @@ #include "mbfilter.h" #include "mbfilter_utf32.h" -static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter); static size_t mb_utf32_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static size_t mb_utf32be_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_utf32be(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); @@ -46,8 +45,8 @@ const mbfl_encoding mbfl_encoding_utf32 = { mbfl_encoding_utf32_aliases, NULL, MBFL_ENCTYPE_WCS4, - &vtbl_utf32_wchar, - &vtbl_wchar_utf32, + NULL, + NULL, mb_utf32_to_wchar, mb_wchar_to_utf32be, NULL, @@ -61,8 +60,8 @@ const mbfl_encoding mbfl_encoding_utf32be = { NULL, NULL, MBFL_ENCTYPE_WCS4, - &vtbl_utf32be_wchar, - &vtbl_wchar_utf32be, + NULL, + NULL, mb_utf32be_to_wchar, mb_wchar_to_utf32be, NULL, @@ -76,178 +75,14 @@ const mbfl_encoding mbfl_encoding_utf32le = { NULL, NULL, MBFL_ENCTYPE_WCS4, - &vtbl_utf32le_wchar, - &vtbl_wchar_utf32le, - mb_utf32le_to_wchar, - mb_wchar_to_utf32le, - NULL, NULL, -}; - -const struct mbfl_convert_vtbl vtbl_utf32_wchar = { - mbfl_no_encoding_utf32, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, NULL, - mbfl_filt_conv_utf32_wchar, - mbfl_filt_conv_utf32_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf32 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf32, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_utf32be, - mbfl_filt_conv_common_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_utf32be_wchar = { - mbfl_no_encoding_utf32be, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_utf32be_wchar, - mbfl_filt_conv_utf32_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf32be = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf32be, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_utf32be, - mbfl_filt_conv_common_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_utf32le_wchar = { - mbfl_no_encoding_utf32le, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_utf32le_wchar, - mbfl_filt_conv_utf32_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf32le = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf32le, - mbfl_filt_conv_common_ctor, + mb_utf32le_to_wchar, + mb_wchar_to_utf32le, NULL, - mbfl_filt_conv_wchar_utf32le, - mbfl_filt_conv_common_flush, NULL, }; -#define CK(statement) do { if ((statement) < 0) return (-1); } while (0) - -static int emit_char_if_valid(int n, mbfl_convert_filter *filter) -{ - if (n >= 0 && n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) { - CK((*filter->output_function)(n, filter->data)); - } else { - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - return 0; -} - -int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter) -{ - if (filter->status < 3) { - filter->cache = (filter->cache << 8) | (c & 0xFF); - filter->status++; - } else { - int n = ((unsigned int)filter->cache << 8) | (c & 0xFF); - filter->cache = filter->status = 0; - - if (n == 0xFFFE0000) { - /* Found a little-endian byte order mark */ - filter->filter_function = mbfl_filt_conv_utf32le_wchar; - } else { - filter->filter_function = mbfl_filt_conv_utf32be_wchar; - if (n != 0xFEFF) { - CK(emit_char_if_valid(n, filter)); - } - } - } - - return 0; -} - -int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter) -{ - if (filter->status < 3) { - filter->cache = (filter->cache << 8) | (c & 0xFF); - filter->status++; - } else { - int n = ((unsigned int)filter->cache << 8) | (c & 0xFF); - filter->cache = filter->status = 0; - CK(emit_char_if_valid(n, filter)); - } - return 0; -} - -int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter) -{ - if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) { - CK((*filter->output_function)((c >> 24) & 0xff, filter->data)); - CK((*filter->output_function)((c >> 16) & 0xff, filter->data)); - CK((*filter->output_function)((c >> 8) & 0xff, filter->data)); - CK((*filter->output_function)(c & 0xff, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - -int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter) -{ - if (filter->status < 3) { - filter->cache |= ((c & 0xFFU) << (8 * filter->status)); - filter->status++; - } else { - int n = ((c & 0xFFU) << 24) | filter->cache; - filter->cache = filter->status = 0; - CK(emit_char_if_valid(n, filter)); - } - return 0; -} - -int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter) -{ - if (c >= 0 && c < MBFL_WCSPLANE_UTF32MAX) { - CK((*filter->output_function)(c & 0xff, filter->data)); - CK((*filter->output_function)((c >> 8) & 0xff, filter->data)); - CK((*filter->output_function)((c >> 16) & 0xff, filter->data)); - CK((*filter->output_function)((c >> 24) & 0xff, filter->data)); - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - -static int mbfl_filt_conv_utf32_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - /* Input string was truncated */ - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - } - filter->cache = filter->status = 0; - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - #define DETECTED_BE 1 #define DETECTED_LE 2 diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.h b/ext/mbstring/libmbfl/filters/mbfilter_utf32.h index 58c69d72f16d3..5f75851116987 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.h @@ -34,17 +34,4 @@ extern const mbfl_encoding mbfl_encoding_utf32; extern const mbfl_encoding mbfl_encoding_utf32be; extern const mbfl_encoding mbfl_encoding_utf32le; -extern const struct mbfl_convert_vtbl vtbl_utf32_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf32; -extern const struct mbfl_convert_vtbl vtbl_utf32be_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf32be; -extern const struct mbfl_convert_vtbl vtbl_utf32le_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf32le; - -int mbfl_filt_conv_utf32_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_utf32be_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_utf32be(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_utf32le_wchar(int c, mbfl_convert_filter *filter); -int mbfl_filt_conv_wchar_utf32le(int c, mbfl_convert_filter *filter); - #endif /* MBFL_MBFILTER_UTF32_H */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 41ffb97e58f16..80ac36be6dd47 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -52,14 +52,6 @@ const unsigned char mblen_table_utf8[] = { }; extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n); -extern int mbfl_filt_conv_sjis_mobile_flush(mbfl_convert_filter *filter); - -static int mbfl_filt_conv_utf8_mobile_wchar(int c, mbfl_convert_filter *filter); -static int mbfl_filt_conv_wchar_utf8_mobile(int c, mbfl_convert_filter *filter); - -static int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter); -static int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter); -static int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter); static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_utf8(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); @@ -143,34 +135,14 @@ const mbfl_encoding mbfl_encoding_utf8 = { mbfl_encoding_utf8_aliases, mblen_table_utf8, 0, - &vtbl_utf8_wchar, - &vtbl_wchar_utf8, + NULL, + NULL, mb_utf8_to_wchar, mb_wchar_to_utf8, NULL, mb_cut_utf8 }; -const struct mbfl_convert_vtbl vtbl_utf8_wchar = { - mbfl_no_encoding_utf8, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_utf8_wchar, - mbfl_filt_conv_utf8_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf8 = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf8, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_utf8, - mbfl_filt_conv_common_flush, - NULL, -}; - static const char *mbfl_encoding_utf8_docomo_aliases[] = {"UTF-8-DOCOMO", "UTF8-DOCOMO", NULL}; static const char *mbfl_encoding_utf8_kddi_b_aliases[] = {"UTF-8-Mobile#KDDI", "UTF-8-KDDI", "UTF8-KDDI", NULL}; static const char *mbfl_encoding_utf8_sb_aliases[] = {"UTF-8-SOFTBANK", "UTF8-SOFTBANK", NULL}; @@ -182,8 +154,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = { mbfl_encoding_utf8_docomo_aliases, mblen_table_utf8, 0, - &vtbl_utf8_docomo_wchar, - &vtbl_wchar_utf8_docomo, + NULL, + NULL, mb_utf8_docomo_to_wchar, mb_wchar_to_utf8_docomo, NULL, @@ -197,8 +169,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = { NULL, mblen_table_utf8, 0, - &vtbl_utf8_kddi_a_wchar, - &vtbl_wchar_utf8_kddi_a, + NULL, + NULL, mb_utf8_kddi_a_to_wchar, mb_wchar_to_utf8_kddi_a, NULL, @@ -212,8 +184,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = { mbfl_encoding_utf8_kddi_b_aliases, mblen_table_utf8, 0, - &vtbl_utf8_kddi_b_wchar, - &vtbl_wchar_utf8_kddi_b, + NULL, + NULL, mb_utf8_kddi_b_to_wchar, mb_wchar_to_utf8_kddi_b, NULL, @@ -227,222 +199,14 @@ const mbfl_encoding mbfl_encoding_utf8_sb = { mbfl_encoding_utf8_sb_aliases, mblen_table_utf8, 0, - &vtbl_utf8_sb_wchar, - &vtbl_wchar_utf8_sb, + NULL, + NULL, mb_utf8_sb_to_wchar, mb_wchar_to_utf8_sb, NULL, mb_cut_utf8, }; -const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = { - mbfl_no_encoding_utf8_docomo, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_utf8_mobile_wchar, - mbfl_filt_conv_utf8_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf8_docomo = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf8_docomo, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_utf8_mobile, - mbfl_filt_conv_sjis_mobile_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_utf8_kddi_a_wchar = { - mbfl_no_encoding_utf8_kddi_a, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_utf8_mobile_wchar, - mbfl_filt_conv_utf8_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_a = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf8_kddi_a, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_utf8_mobile, - mbfl_filt_conv_sjis_mobile_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_utf8_kddi_b_wchar = { - mbfl_no_encoding_utf8_kddi_b, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_utf8_mobile_wchar, - mbfl_filt_conv_utf8_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_b = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf8_kddi_b, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_utf8_mobile, - mbfl_filt_conv_sjis_mobile_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_utf8_sb_wchar = { - mbfl_no_encoding_utf8_sb, - mbfl_no_encoding_wchar, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_utf8_mobile_wchar, - mbfl_filt_conv_utf8_wchar_flush, - NULL, -}; - -const struct mbfl_convert_vtbl vtbl_wchar_utf8_sb = { - mbfl_no_encoding_wchar, - mbfl_no_encoding_utf8_sb, - mbfl_filt_conv_common_ctor, - NULL, - mbfl_filt_conv_wchar_utf8_mobile, - mbfl_filt_conv_sjis_mobile_flush, - NULL, -}; - -#define CK(statement) do { if ((statement) < 0) return (-1); } while (0) - -static int mbfl_filt_put_invalid_char(mbfl_convert_filter *filter) -{ - filter->status = filter->cache = 0; - CK((*filter->output_function)(MBFL_BAD_INPUT, filter->data)); - return 0; -} - -static int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter) -{ - int s, c1; - -retry: - switch (filter->status) { - case 0x00: - if (c < 0x80) { - CK((*filter->output_function)(c, filter->data)); - } else if (c >= 0xc2 && c <= 0xdf) { /* 2byte code first char: 0xc2-0xdf */ - filter->status = 0x10; - filter->cache = c & 0x1f; - } else if (c >= 0xe0 && c <= 0xef) { /* 3byte code first char: 0xe0-0xef */ - filter->status = 0x20; - filter->cache = c & 0xf; - } else if (c >= 0xf0 && c <= 0xf4) { /* 3byte code first char: 0xf0-0xf4 */ - filter->status = 0x30; - filter->cache = c & 0x7; - } else { - CK(mbfl_filt_put_invalid_char(filter)); - } - break; - case 0x10: /* 2byte code 2nd char: 0x80-0xbf */ - case 0x21: /* 3byte code 3rd char: 0x80-0xbf */ - case 0x32: /* 4byte code 4th char: 0x80-0xbf */ - if (c >= 0x80 && c <= 0xbf) { - s = (filter->cache<<6) | (c & 0x3f); - filter->status = filter->cache = 0; - CK((*filter->output_function)(s, filter->data)); - } else { - CK(mbfl_filt_put_invalid_char(filter)); - goto retry; - } - break; - case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */ - s = (filter->cache<<6) | (c & 0x3f); - c1 = filter->cache & 0xf; - - if ((c >= 0x80 && c <= 0xbf) && - ((c1 == 0x0 && c >= 0xa0) || - (c1 == 0xd && c < 0xa0) || - (c1 > 0x0 && c1 != 0xd))) { - filter->cache = s; - filter->status++; - } else { - CK(mbfl_filt_put_invalid_char(filter)); - goto retry; - } - break; - case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */ - s = (filter->cache<<6) | (c & 0x3f); - c1 = filter->cache & 0x7; - - if ((c >= 0x80 && c <= 0xbf) && - ((c1 == 0x0 && c >= 0x90) || - (c1 == 0x4 && c < 0x90) || - (c1 > 0x0 && c1 != 0x4))) { - filter->cache = s; - filter->status++; - } else { - CK(mbfl_filt_put_invalid_char(filter)); - goto retry; - } - break; - case 0x31: /* 4byte code 3rd char: 0x80-0xbf */ - if (c >= 0x80 && c <= 0xbf) { - filter->cache = (filter->cache<<6) | (c & 0x3f); - filter->status++; - } else { - CK(mbfl_filt_put_invalid_char(filter)); - goto retry; - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter) -{ - if (filter->status) { - (*filter->output_function)(MBFL_BAD_INPUT, filter->data); - filter->status = 0; - } - - if (filter->flush_function) { - (*filter->flush_function)(filter->data); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter) -{ - if (c >= 0 && c < 0x110000) { - if (c < 0x80) { - CK((*filter->output_function)(c, filter->data)); - } else if (c < 0x800) { - CK((*filter->output_function)(((c >> 6) & 0x1f) | 0xc0, filter->data)); - CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data)); - } else if (c < 0x10000) { - CK((*filter->output_function)(((c >> 12) & 0x0f) | 0xe0, filter->data)); - CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data)); - CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data)); - } else { - CK((*filter->output_function)(((c >> 18) & 0x07) | 0xf0, filter->data)); - CK((*filter->output_function)(((c >> 12) & 0x3f) | 0x80, filter->data)); - CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data)); - CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data)); - } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state) { unsigned char *p = *in, *e = p + *in_len; @@ -581,143 +345,6 @@ static zend_string* mb_cut_utf8(unsigned char *str, size_t from, size_t len, uns return zend_string_init_fast((char*)start, _end - start); } -static int mbfl_filt_conv_utf8_mobile_wchar(int c, mbfl_convert_filter *filter) -{ - int s, s1 = 0, c1 = 0, snd = 0; - -retry: - switch (filter->status & 0xff) { - case 0x00: - if (c < 0x80) { - CK((*filter->output_function)(c, filter->data)); - } else if (c >= 0xc2 && c <= 0xdf) { /* 2byte code first char: 0xc2-0xdf */ - filter->status = 0x10; - filter->cache = c & 0x1f; - } else if (c >= 0xe0 && c <= 0xef) { /* 3byte code first char: 0xe0-0xef */ - filter->status = 0x20; - filter->cache = c & 0xf; - } else if (c >= 0xf0 && c <= 0xf4) { /* 3byte code first char: 0xf0-0xf4 */ - filter->status = 0x30; - filter->cache = c & 0x7; - } else { - CK(mbfl_filt_put_invalid_char(filter)); - } - break; - - case 0x10: /* 2byte code 2nd char: 0x80-0xbf */ - case 0x21: /* 3byte code 3rd char: 0x80-0xbf */ - case 0x32: /* 4byte code 4th char: 0x80-0xbf */ - filter->status = 0; - if (c >= 0x80 && c <= 0xbf) { - s = (filter->cache << 6) | (c & 0x3f); - filter->cache = 0; - - if (filter->from->no_encoding == mbfl_no_encoding_utf8_docomo && mbfilter_conv_r_map_tbl(s, &s1, 4, mbfl_docomo2uni_pua)) { - s = mbfilter_sjis_emoji_docomo2unicode(s1, &snd); - } else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_a && mbfilter_conv_r_map_tbl(s, &s1, 7, mbfl_kddi2uni_pua)) { - s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd); - } else if (filter->from->no_encoding == mbfl_no_encoding_utf8_kddi_b && mbfilter_conv_r_map_tbl(s, &s1, 8, mbfl_kddi2uni_pua_b)) { - s = mbfilter_sjis_emoji_kddi2unicode(s1, &snd); - } else if (filter->from->no_encoding == mbfl_no_encoding_utf8_sb && mbfilter_conv_r_map_tbl(s, &s1, 6, mbfl_sb2uni_pua)) { - s = mbfilter_sjis_emoji_sb2unicode(s1, &snd); - } - - if (snd > 0) { - CK((*filter->output_function)(snd, filter->data)); - } - CK((*filter->output_function)(s, filter->data)); - } else { - CK(mbfl_filt_put_invalid_char(filter)); - goto retry; - } - break; - - case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */ - s = (filter->cache << 6) | (c & 0x3f); - c1 = filter->cache & 0xf; - - if ((c >= 0x80 && c <= 0xbf) && - ((c1 == 0x0 && c >= 0xa0) || - (c1 == 0xd && c < 0xa0) || - (c1 > 0x0 && c1 != 0xd))) { - filter->cache = s; - filter->status++; - } else { - CK(mbfl_filt_put_invalid_char(filter)); - goto retry; - } - break; - - case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */ - s = (filter->cache << 6) | (c & 0x3f); - c1 = filter->cache & 0x7; - - if ((c >= 0x80 && c <= 0xbf) && - ((c1 == 0x0 && c >= 0x90) || - (c1 == 0x4 && c < 0x90) || - (c1 > 0x0 && c1 != 0x4))) { - filter->cache = s; - filter->status++; - } else { - CK(mbfl_filt_put_invalid_char(filter)); - goto retry; - } - break; - - case 0x31: /* 4byte code 3rd char: 0x80-0xbf */ - if (c >= 0x80 && c <= 0xbf) { - filter->cache = (filter->cache << 6) | (c & 0x3f); - filter->status++; - } else { - CK(mbfl_filt_put_invalid_char(filter)); - goto retry; - } - break; - - EMPTY_SWITCH_DEFAULT_CASE(); - } - - return 0; -} - -static int mbfl_filt_conv_wchar_utf8_mobile(int c, mbfl_convert_filter *filter) -{ - if (c >= 0 && c < 0x110000) { - int s1, c1; - - if ((filter->to->no_encoding == mbfl_no_encoding_utf8_docomo && mbfilter_unicode2sjis_emoji_docomo(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 4, mbfl_docomo2uni_pua)) || - (filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_a && mbfilter_unicode2sjis_emoji_kddi_sjis(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 7, mbfl_kddi2uni_pua)) || - (filter->to->no_encoding == mbfl_no_encoding_utf8_kddi_b && mbfilter_unicode2sjis_emoji_kddi_sjis(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 8, mbfl_kddi2uni_pua_b)) || - (filter->to->no_encoding == mbfl_no_encoding_utf8_sb && mbfilter_unicode2sjis_emoji_sb(c, &s1, filter) > 0 && mbfilter_conv_map_tbl(s1, &c1, 6, mbfl_sb2uni_pua))) { - c = c1; - } - - if (filter->status) { - return 0; - } - - if (c < 0x80) { - CK((*filter->output_function)(c, filter->data)); - } else if (c < 0x800) { - CK((*filter->output_function)(((c >> 6) & 0x1f) | 0xc0, filter->data)); - CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data)); - } else if (c < 0x10000) { - CK((*filter->output_function)(((c >> 12) & 0x0f) | 0xe0, filter->data)); - CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data)); - CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data)); - } else { - CK((*filter->output_function)(((c >> 18) & 0x07) | 0xf0, filter->data)); - CK((*filter->output_function)(((c >> 12) & 0x3f) | 0x80, filter->data)); - CK((*filter->output_function)(((c >> 6) & 0x3f) | 0x80, filter->data)); - CK((*filter->output_function)((c & 0x3f) | 0x80, filter->data)); - } - } else { - CK(mbfl_filt_conv_illegal_output(c, filter)); - } - - return 0; -} - /* Regional Indicator Unicode codepoints are from 0x1F1E6-0x1F1FF * These correspond to the letters A-Z * To display the flag emoji for a country, two unicode codepoints are combined, diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.h b/ext/mbstring/libmbfl/filters/mbfilter_utf8.h index a1282515f34f1..e574aebf89582 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.h +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.h @@ -31,21 +31,9 @@ #define MBFL_MBFILTER_UTF8_H extern const mbfl_encoding mbfl_encoding_utf8; -extern const struct mbfl_convert_vtbl vtbl_utf8_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf8; - extern const mbfl_encoding mbfl_encoding_utf8_docomo; extern const mbfl_encoding mbfl_encoding_utf8_kddi_a; extern const mbfl_encoding mbfl_encoding_utf8_kddi_b; extern const mbfl_encoding mbfl_encoding_utf8_sb; -extern const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf8_docomo; -extern const struct mbfl_convert_vtbl vtbl_utf8_kddi_a_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_a; -extern const struct mbfl_convert_vtbl vtbl_utf8_kddi_b_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf8_kddi_b; -extern const struct mbfl_convert_vtbl vtbl_utf8_sb_wchar; -extern const struct mbfl_convert_vtbl vtbl_wchar_utf8_sb; - #endif /* MBFL_MBFILTER_UTF8_H */