NLS: update handling of Unicode

This patch (as1239) updates the kernel's treatment of Unicode. The character-set conversion routines are well behind the current state of the Unicode specification: They don't recognize the existence of code points beyond plane 0 or of surrogate pairs in the UTF-16 encoding. The old wchar_t 16-bit type is retained because it's still used in lots of places. This shouldn't cause any new problems; if a conversion now results in an invalid 16-bit code then before it must have yielded an undefined code. Difficult-to-read names like "utf_mbstowcs" are replaced with more transparent names like "utf8s_to_utf16s" and the ordering of the parameters is rationalized (buffer lengths come immediately after the pointers they refer to, and the inputs precede the outputs). Fortunately the low-level conversion routines are used in only a few places; the interfaces to the higher-level uni2char and char2uni methods have been left unchanged. Signed-off-by: Alan Stern <stern@rowland.harvard.edu> Acked-by: Clemens Ladisch <clemens@ladisch.de> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
author: Alan Stern <stern@rowland.harvard.edu> 2009-04-30 10:08:18 -0400
committer: Greg Kroah-Hartman <gregkh@suse.de> 2009-06-16 00:44:43 -0400
commit: 74675a58507e769beee7d949dbed788af3c4139d (patch)
tree: d4ae3cc06dbfadecf1eaf6ed0aef249fc87b07e6 /fs
parent: a853a3d4eb2edb066248a39f0634f6f5858816a0 (diff)
7 files changed, 150 insertions, 124 deletions
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9367b6297d84..89cd2deeb4af 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in,
 {
        struct nls_table *nls = BEFS_SB(sb)->nls;
        int i, o;
-        wchar_t uni;
+        unicode_t uni;
        int unilen, utflen;
        char *result;
        /* The utf8->nls conversion won't make the final nls string bigger
@@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
        for (i = o = 0; i < in_len; i += utflen, o += unilen) {
                /* convert from UTF-8 to Unicode */
-                utflen = utf8_mbtowc(&uni, &in[i], in_len - i);
+                utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
-                if (utflen < 0) {
+                if (utflen < 0)
                        goto conv_err;
-                }
                /* convert from Unicode to nls */
+                if (uni > MAX_WCHAR_T)
+                        goto conv_err;
                unilen = nls->uni2char(uni, &result[o], in_len - o);
-                if (unilen < 0) {
+                if (unilen < 0)
                        goto conv_err;
-                }
        }
        result[o] = '\0';
        *out_len = o;
@@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in,
                /* convert from nls to unicode */
                unilen = nls->char2uni(&in[i], in_len - i, &uni);
-                if (unilen < 0) {
+                if (unilen < 0)
                        goto conv_err;
-                }
                /* convert from unicode to UTF-8 */
-                utflen = utf8_wctomb(&result[o], uni, 3);
+                utflen = utf32_to_utf8(uni, &result[o], 3);
-                if (utflen <= 0) {
+                if (utflen <= 0)
                        goto conv_err;
-                }
        }
        result[o] = '\0';
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index f3500294eec5..7c14c8cbbaba 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -22,6 +22,19 @@
 #include <asm/uaccess.h>
 #include "fat.h"
+/*
+ * Maximum buffer size of short name.
+ * [(MSDOS_NAME + '.') * max one char + nul]
+ * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
+ */
+#define FAT_MAX_SHORT_SIZE      ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
+/*
+ * Maximum buffer size of unicode chars from slots.
+ * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
+ */
+#define FAT_MAX_UNI_CHARS       ((MSDOS_SLOTS - 1) * 13 + 1)
+#define FAT_MAX_UNI_SIZE        (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
 static inline loff_t fat_make_i_pos(struct super_block *sb,
                                    struct buffer_head *bh,
                                    struct msdos_dir_entry *de)
@@ -171,7 +184,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
                                unsigned char *buf, int size)
 {
        if (sbi->options.utf8)
-                return utf8_wcstombs(buf, uni, size);
+                return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
+                                UTF16_HOST_ENDIAN, buf, size);
        else
                return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
                                   sbi->nls_io);
@@ -325,19 +339,6 @@ parse_long:
 }
 /*
- * Maximum buffer size of short name.
- * [(MSDOS_NAME + '.') * max one char + nul]
- * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
- */
-#define FAT_MAX_SHORT_SIZE      ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
-/*
- * Maximum buffer size of unicode chars from slots.
- * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
- */
-#define FAT_MAX_UNI_CHARS       ((MSDOS_SLOTS - 1) * 13 + 1)
-#define FAT_MAX_UNI_SIZE        (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
-/*
 * Return values: negative -> error, 0 -> not found, positive -> found,
 * value is the total amount of slots, including the shortname entry.
 */
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b50ecbe97f83..f92ad9995356 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -502,11 +502,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
        if (utf8) {
                int name_len = strlen(name);
-                *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
+                *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);
                /*
                 * We stripped '.'s before and set len appropriately,
-                 * but utf8_mbstowcs doesn't care about len
+                 * but utf8s_to_utf16s doesn't care about len
                 */
                *outlen -= (name_len - len);
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 92c14b850e9c..a048de81c093 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
        return (op - ascii);
 }
-/* Convert big endian wide character string to utf8 */
-static int
-wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen)
-{
-        const __u8 *ip;
-        __u8 *op;
-        int size;
-        __u16 c;
-        op = s;
-        ip = pwcs;
-        while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) {
-                c = (*ip << 8) | ip[1];
-                if (c > 0x7f) {
-                        size = utf8_wctomb(op, c, maxlen);
-                        if (size == -1) {
-                                /* Ignore character and move on */
-                                maxlen--;
-                        } else {
-                                op += size;
-                                maxlen -= size;
-                        }
-                } else {
-                        *op++ = (__u8) c;
-                }
-                ip += 2;
-                inlen--;
-        }
-        return (op - s);
-}
 int
 get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
 {
@@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
        nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
        if (utf8) {
-                len = wcsntombs_be(outname, de->name,
+                len = utf16s_to_utf8s((const wchar_t *) de->name,
-                                de->name_len[0] >> 1, PAGE_SIZE);
+                                de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
+                                outname, PAGE_SIZE);
        } else {
                len = uni16_to_x8(outname, (__be16 *) de->name,
                                de->name_len[0] >> 1, nls);
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 97645f112114..0ec6237a5970 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen,
                if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
                        int k;
+                        unicode_t u;
-                        k = utf8_mbtowc(&ec, iname, iname_end - iname);
+                        k = utf8_to_utf32(iname, iname_end - iname, &u);
-                        if (k < 0)
+                        if (k < 0 || u > MAX_WCHAR_T)
                                return -EINVAL;
                        iname += k;
+                        ec = u;
                } else {
                        if (*iname == NCP_ESC) {
                                int k;
@@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen,
                if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
                        int k;
-                        k = utf8_wctomb(iname, ec, iname_end - iname);
+                        k = utf32_to_utf8(ec, iname, iname_end - iname);
                        if (k < 0) {
                                err = -ENAMETOOLONG;
                                goto quit;
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 750abf211e26..477d37d83b31 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -15,6 +15,7 @@
 #include <linux/errno.h>
 #include <linux/kmod.h>
 #include <linux/spinlock.h>
+#include <asm/byteorder.h>
 static struct nls_table default_table;
 static struct nls_table *tables = &default_table;
@@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] =
    {0,                                                /* end of table    */}
 };
-int
+#define UNICODE_MAX     0x0010ffff
-utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
+#define PLANE_SIZE      0x00010000
+#define SURROGATE_MASK  0xfffff800
+#define SURROGATE_PAIR  0x0000d800
+#define SURROGATE_LOW   0x00000400
+#define SURROGATE_BITS  0x000003ff
+int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
 {
-        long l;
+        unsigned long l;
        int c0, c, nc;
        const struct utf8_table *t;
  
@@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
                nc++;
                if ((c0 & t->cmask) == t->cval) {
                        l &= t->lmask;
-                        if (l < t->lval)
+                        if (l < t->lval || l > UNICODE_MAX ||
+                                        (l & SURROGATE_MASK) == SURROGATE_PAIR)
                                return -1;
-                        *p = l;
+                        *pu = (unicode_t) l;
                        return nc;
                }
-                if (n <= nc)
+                if (len <= nc)
                        return -1;
                s++;
                c = (*s ^ 0x80) & 0xFF;
@@ -72,76 +81,119 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
        }
        return -1;
 }
+EXPORT_SYMBOL(utf8_to_utf32);
-int
+int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
-utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n)
 {
-        __u16 *op;
+        unsigned long l;
-        const __u8 *ip;
-        int size;
-        op = pwcs;
-        ip = s;
-        while (*ip && n > 0) {
-                if (*ip & 0x80) {
-                        size = utf8_mbtowc(op, ip, n);
-                        if (size == -1) {
-                                /* Ignore character and move on */
-                                ip++;
-                                n--;
-                        } else {
-                                op++;
-                                ip += size;
-                                n -= size;
-                        }
-                } else {
-                        *op++ = *ip++;
-                        n--;
-                }
-        }
-        return (op - pwcs);
-}
-int
-utf8_wctomb(__u8 *s, wchar_t wc, int maxlen)
-{
-        long l;
        int c, nc;
        const struct utf8_table *t;
-  
        if (!s)
                return 0;
-  
-        l = wc;
+        l = u;
+        if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
+                return -1;
        nc = 0;
        for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
                nc++;
                if (l <= t->lmask) {
                        c = t->shift;
-                        *s = t->cval | (l >> c);
+                        *s = (u8) (t->cval | (l >> c));
                        while (c > 0) {
                                c -= 6;
                                s++;
-                                *s = 0x80 | ((l >> c) & 0x3F);
+                                *s = (u8) (0x80 | ((l >> c) & 0x3F));
                        }
                        return nc;
                }
        }
        return -1;
 }
+EXPORT_SYMBOL(utf32_to_utf8);
-int
+int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
-utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
 {
-        const __u16 *ip;
+        u16 *op;
-        __u8 *op;
        int size;
+        unicode_t u;
+        op = pwcs;
+        while (*s && len > 0) {
+                if (*s & 0x80) {
+                        size = utf8_to_utf32(s, len, &u);
+                        if (size < 0) {
+                                /* Ignore character and move on */
+                                size = 1;
+                        } else if (u >= PLANE_SIZE) {
+                                u -= PLANE_SIZE;
+                                *op++ = (wchar_t) (SURROGATE_PAIR |
+                                                ((u >> 10) & SURROGATE_BITS));
+                                *op++ = (wchar_t) (SURROGATE_PAIR |
+                                                SURROGATE_LOW |
+                                                (u & SURROGATE_BITS));
+                        } else {
+                                *op++ = (wchar_t) u;
+                        }
+                        s += size;
+                        len -= size;
+                } else {
+                        *op++ = *s++;
+                        len--;
+                }
+        }
+        return op - pwcs;
+}
+EXPORT_SYMBOL(utf8s_to_utf16s);
+static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
+{
+        switch (endian) {
+        default:
+                return c;
+        case UTF16_LITTLE_ENDIAN:
+                return __le16_to_cpu(c);
+        case UTF16_BIG_ENDIAN:
+                return __be16_to_cpu(c);
+        }
+}
+int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
+                u8 *s, int maxlen)
+{
+        u8 *op;
+        int size;
+        unsigned long u, v;
        op = s;
-        ip = pwcs;
+        while (len > 0 && maxlen > 0) {
-        while (*ip && maxlen > 0) {
+                u = get_utf16(*pwcs, endian);
-                if (*ip > 0x7f) {
+                if (!u)
-                        size = utf8_wctomb(op, *ip, maxlen);
+                        break;
+                pwcs++;
+                len--;
+                if (u > 0x7f) {
+                        if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
+                                if (u & SURROGATE_LOW) {
+                                        /* Ignore character and move on */
+                                        continue;
+                                }
+                                if (len <= 0)
+                                        break;
+                                v = get_utf16(*pwcs, endian);
+                                if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
+                                                !(v & SURROGATE_LOW)) {
+                                        /* Ignore character and move on */
+                                        continue;
+                                }
+                                u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
+                                                + (v & SURROGATE_BITS);
+                                pwcs++;
+                                len--;
+                        }
+                        size = utf32_to_utf8(u, op, maxlen);
                        if (size == -1) {
                                /* Ignore character and move on */
                        } else {
@@ -149,13 +201,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
                                maxlen -= size;
                        }
                } else {
-                        *op++ = (__u8) *ip;
+                        *op++ = (u8) u;
                        maxlen--;
                }
-                ip++;
        }
-        return (op - s);
+        return op - s;
 }
+EXPORT_SYMBOL(utf16s_to_utf8s);
 int register_nls(struct nls_table * nls)
 {
@@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls);
 EXPORT_SYMBOL(unload_nls);
 EXPORT_SYMBOL(load_nls);
 EXPORT_SYMBOL(load_nls_default);
-EXPORT_SYMBOL(utf8_mbtowc);
-EXPORT_SYMBOL(utf8_mbstowcs);
-EXPORT_SYMBOL(utf8_wctomb);
-EXPORT_SYMBOL(utf8_wcstombs);
 MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index aa2c42fdd977..0d60a44acacd 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
 {
        int n;
-        if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) {
+        if (boundlen <= 0)
+                return -ENAMETOOLONG;
+        n = utf32_to_utf8(uni, out, boundlen);
+        if (n < 0) {
                *out = '?';
                return -EINVAL;
        }
@@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
 static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
 {
        int n;
+        unicode_t u;
-        if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) {
+        n = utf8_to_utf32(rawstring, boundlen, &u);
+        if (n < 0 || u > MAX_WCHAR_T) {
                *uni = 0x003f;  /* ? */
-                n = -EINVAL;
+                return -EINVAL;
        }
+        *uni = (wchar_t) u;
        return n;
 }
author	Alan Stern <stern@rowland.harvard.edu>	2009-04-30 10:08:18 -0400
committer	Greg Kroah-Hartman <gregkh@suse.de>	2009-06-16 00:44:43 -0400
commit	74675a58507e769beee7d949dbed788af3c4139d (patch)
tree	d4ae3cc06dbfadecf1eaf6ed0aef249fc87b07e6 /fs
parent	a853a3d4eb2edb066248a39f0634f6f5858816a0 (diff)