| To: vim-dev@vim.org |
| Subject: Patch 7.2.312 |
| Fcc: outbox |
| From: Bram Moolenaar <Bram@moolenaar.net> |
| Mime-Version: 1.0 |
| Content-Type: text/plain; charset=UTF-8 |
| Content-Transfer-Encoding: 8bit |
| |
| |
| Patch 7.2.312 |
| Problem: iconv() returns an invalid character sequence when conversion |
| fails. It should return an empty string. (Yongwei Wu) |
| Solution: Be more strict about invalid characters in the input. |
| Files: src/mbyte.c |
| |
| |
| |
| |
| |
| *** 133,154 **** |
| static int dbcs_ptr2cells_len __ARGS((char_u *p, int size)); |
| static int dbcs_ptr2char __ARGS((char_u *p)); |
| |
| ! /* Lookup table to quickly get the length in bytes of a UTF-8 character from |
| ! * the first byte of a UTF-8 string. Bytes which are illegal when used as the |
| ! * first byte have a one, because these will be used separately. */ |
| static char utf8len_tab[256] = |
| { |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| ! 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /*bogus*/ |
| ! 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /*bogus*/ |
| 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
| 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1, |
| }; |
| |
| /* |
| * XIM often causes trouble. Define XIM_DEBUG to get a log of XIM callbacks |
| * in the "xim.log" file. |
| */ |
| --- 133,172 ---- |
| static int dbcs_ptr2cells_len __ARGS((char_u *p, int size)); |
| static int dbcs_ptr2char __ARGS((char_u *p)); |
| |
| ! /* |
| ! * Lookup table to quickly get the length in bytes of a UTF-8 character from |
| ! * the first byte of a UTF-8 string. |
| ! * Bytes which are illegal when used as the first byte have a 1. |
| ! * The NUL byte has length 1. |
| ! */ |
| static char utf8len_tab[256] = |
| { |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| ! 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| ! 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
| 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1, |
| }; |
| |
| /* |
| + * Like utf8len_tab above, but using a zero for illegal lead bytes. |
| + */ |
| + static char utf8len_tab_zero[256] = |
| + { |
| + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
| + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
| + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0, |
| + }; |
| + |
| + /* |
| * XIM often causes trouble. Define XIM_DEBUG to get a log of XIM callbacks |
| * in the "xim.log" file. |
| */ |
| |
| *** 1352,1358 **** |
| if (size > 0 && *p >= 0x80) |
| { |
| if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) |
| ! return 1; |
| c = utf_ptr2char(p); |
| /* An illegal byte is displayed as <xx>. */ |
| if (utf_ptr2len(p) == 1 || c == NUL) |
| --- 1370,1376 ---- |
| if (size > 0 && *p >= 0x80) |
| { |
| if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) |
| ! return 1; /* truncated */ |
| c = utf_ptr2char(p); |
| /* An illegal byte is displayed as <xx>. */ |
| if (utf_ptr2len(p) == 1 || c == NUL) |
| |
| *** 1473,1479 **** |
| if (p[0] < 0x80) /* be quick for ASCII */ |
| return p[0]; |
| |
| ! len = utf8len_tab[p[0]]; |
| if (len > 1 && (p[1] & 0xc0) == 0x80) |
| { |
| if (len == 2) |
| --- 1491,1497 ---- |
| if (p[0] < 0x80) /* be quick for ASCII */ |
| return p[0]; |
| |
| ! len = utf8len_tab_zero[p[0]]; |
| if (len > 1 && (p[1] & 0xc0) == 0x80) |
| { |
| if (len == 2) |
| |
| *** 1723,1728 **** |
| --- 1741,1747 ---- |
| /* |
| * Return length of UTF-8 character, obtained from the first byte. |
| * "b" must be between 0 and 255! |
| + * Returns 1 for an invalid first byte value. |
| */ |
| int |
| utf_byte2len(b) |
| |
| *** 1737,1742 **** |
| --- 1756,1762 ---- |
| * Returns 1 for "". |
| * Returns 1 for an illegal byte sequence (also in incomplete byte seq.). |
| * Returns number > "size" for an incomplete byte sequence. |
| + * Never returns zero. |
| */ |
| int |
| utf_ptr2len_len(p, size) |
| |
| *** 1747,1757 **** |
| int i; |
| int m; |
| |
| ! if (*p == NUL) |
| ! return 1; |
| ! m = len = utf8len_tab[*p]; |
| if (len > size) |
| m = size; /* incomplete byte sequence. */ |
| for (i = 1; i < m; ++i) |
| if ((p[i] & 0xc0) != 0x80) |
| return 1; |
| --- 1767,1779 ---- |
| int i; |
| int m; |
| |
| ! len = utf8len_tab[*p]; |
| ! if (len == 1) |
| ! return 1; /* NUL, ascii or illegal lead byte */ |
| if (len > size) |
| m = size; /* incomplete byte sequence. */ |
| + else |
| + m = len; |
| for (i = 1; i < m; ++i) |
| if ((p[i] & 0xc0) != 0x80) |
| return 1; |
| |
| *** 2505,2510 **** |
| --- 2527,2533 ---- |
| /* |
| * mb_head_off() function pointer. |
| * Return offset from "p" to the first byte of the character it points into. |
| + * If "p" points to the NUL at the end of the string return 0. |
| * Returns 0 when already at the first byte of a character. |
| */ |
| int |
| |
| *** 2524,2530 **** |
| |
| /* It can't be a trailing byte when not using DBCS, at the start of the |
| * string or the previous byte can't start a double-byte. */ |
| ! if (p <= base || MB_BYTE2LEN(p[-1]) == 1) |
| return 0; |
| |
| /* This is slow: need to start at the base and go forward until the |
| --- 2547,2553 ---- |
| |
| /* It can't be a trailing byte when not using DBCS, at the start of the |
| * string or the previous byte can't start a double-byte. */ |
| ! if (p <= base || MB_BYTE2LEN(p[-1]) == 1 || *p == NUL) |
| return 0; |
| |
| /* This is slow: need to start at the base and go forward until the |
| |
| *** 2552,2558 **** |
| * lead byte in the current cell. */ |
| if (p <= base |
| || (enc_dbcs == DBCS_JPNU && p[-1] == 0x8e) |
| ! || MB_BYTE2LEN(p[-1]) == 1) |
| return 0; |
| |
| /* This is slow: need to start at the base and go forward until the |
| --- 2575,2582 ---- |
| * lead byte in the current cell. */ |
| if (p <= base |
| || (enc_dbcs == DBCS_JPNU && p[-1] == 0x8e) |
| ! || MB_BYTE2LEN(p[-1]) == 1 |
| ! || *p == NUL) |
| return 0; |
| |
| /* This is slow: need to start at the base and go forward until the |
| |
| *** 2578,2583 **** |
| --- 2602,2608 ---- |
| char_u *q; |
| char_u *s; |
| int c; |
| + int len; |
| #ifdef FEAT_ARABIC |
| char_u *j; |
| #endif |
| |
| *** 2597,2604 **** |
| --q; |
| /* Check for illegal sequence. Do allow an illegal byte after where we |
| * started. */ |
| ! if (utf8len_tab[*q] != (int)(s - q + 1) |
| ! && utf8len_tab[*q] != (int)(p - q + 1)) |
| return 0; |
| |
| if (q <= base) |
| --- 2622,2629 ---- |
| --q; |
| /* Check for illegal sequence. Do allow an illegal byte after where we |
| * started. */ |
| ! len = utf8len_tab[*q]; |
| ! if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) |
| return 0; |
| |
| if (q <= base) |
| |
| *** 2810,2818 **** |
| |
| while (end == NULL ? *p != NUL : p < end) |
| { |
| ! if ((*p & 0xc0) == 0x80) |
| return FALSE; /* invalid lead byte */ |
| - l = utf8len_tab[*p]; |
| if (end != NULL && p + l > end) |
| return FALSE; /* incomplete byte sequence */ |
| ++p; |
| --- 2835,2843 ---- |
| |
| while (end == NULL ? *p != NUL : p < end) |
| { |
| ! l = utf8len_tab_zero[*p]; |
| ! if (l == 0) |
| return FALSE; /* invalid lead byte */ |
| if (end != NULL && p + l > end) |
| return FALSE; /* incomplete byte sequence */ |
| ++p; |
| |
| *** 6117,6128 **** |
| d = retval; |
| for (i = 0; i < len; ++i) |
| { |
| ! l = utf_ptr2len(ptr + i); |
| if (l == 0) |
| *d++ = NUL; |
| else if (l == 1) |
| { |
| ! if (unconvlenp != NULL && utf8len_tab[ptr[i]] > len - i) |
| { |
| /* Incomplete sequence at the end. */ |
| *unconvlenp = len - i; |
| --- 6142,6161 ---- |
| d = retval; |
| for (i = 0; i < len; ++i) |
| { |
| ! l = utf_ptr2len_len(ptr + i, len - i); |
| if (l == 0) |
| *d++ = NUL; |
| else if (l == 1) |
| { |
| ! int l_w = utf8len_tab_zero[ptr[i]]; |
| ! |
| ! if (l_w == 0) |
| ! { |
| ! /* Illegal utf-8 byte cannot be converted */ |
| ! vim_free(retval); |
| ! return NULL; |
| ! } |
| ! if (unconvlenp != NULL && l_w > len - i) |
| { |
| /* Incomplete sequence at the end. */ |
| *unconvlenp = len - i; |
| |
| |
| |
| *** 683,684 **** |
| --- 683,686 ---- |
| { /* Add new patch number below this line */ |
| + /**/ |
| + 312, |
| /**/ |
| |
| -- |
| hundred-and-one symptoms of being an internet addict: |
| 6. You refuse to go to a vacation spot with no electricity and no phone lines. |
| |
| /// Bram Moolenaar -- Bram@Moolenaar.net -- http://www.Moolenaar.net \\\ |
| /// sponsor Vim, vote for features -- http://www.Vim.org/sponsor/ \\\ |
| \\\ download, build and distribute -- http://www.A-A-P.org /// |
| \\\ help me help AIDS victims -- http://ICCF-Holland.org /// |