aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorAlan Stern <stern@rowland.harvard.edu>2009-04-30 10:08:18 -0400
committerGreg Kroah-Hartman <gregkh@suse.de>2009-06-16 00:44:43 -0400
commit74675a58507e769beee7d949dbed788af3c4139d (patch)
treed4ae3cc06dbfadecf1eaf6ed0aef249fc87b07e6 /fs
parenta853a3d4eb2edb066248a39f0634f6f5858816a0 (diff)
NLS: update handling of Unicode
This patch (as1239) updates the kernel's treatment of Unicode. The character-set conversion routines are well behind the current state of the Unicode specification: They don't recognize the existence of code points beyond plane 0 or of surrogate pairs in the UTF-16 encoding. The old wchar_t 16-bit type is retained because it's still used in lots of places. This shouldn't cause any new problems; if a conversion now results in an invalid 16-bit code then before it must have yielded an undefined code. Difficult-to-read names like "utf_mbstowcs" are replaced with more transparent names like "utf8s_to_utf16s" and the ordering of the parameters is rationalized (buffer lengths come immediate after the pointers they refer to, and the inputs precede the outputs). Fortunately the low-level conversion routines are used in only a few places; the interfaces to the higher-level uni2char and char2uni methods have been left unchanged. Signed-off-by: Alan Stern <stern@rowland.harvard.edu> Acked-by: Clemens Ladisch <clemens@ladisch.de> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Diffstat (limited to 'fs')
-rw-r--r--fs/befs/linuxvfs.c20
-rw-r--r--fs/fat/dir.c29
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/isofs/joliet.c36
-rw-r--r--fs/ncpfs/ncplib_kernel.c8
-rw-r--r--fs/nls/nls_base.c164
-rw-r--r--fs/nls/nls_utf8.c13
7 files changed, 150 insertions, 124 deletions
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 9367b6297d84..89cd2deeb4af 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in,
513{ 513{
514 struct nls_table *nls = BEFS_SB(sb)->nls; 514 struct nls_table *nls = BEFS_SB(sb)->nls;
515 int i, o; 515 int i, o;
516 wchar_t uni; 516 unicode_t uni;
517 int unilen, utflen; 517 int unilen, utflen;
518 char *result; 518 char *result;
519 /* The utf8->nls conversion won't make the final nls string bigger 519 /* The utf8->nls conversion won't make the final nls string bigger
@@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
539 for (i = o = 0; i < in_len; i += utflen, o += unilen) { 539 for (i = o = 0; i < in_len; i += utflen, o += unilen) {
540 540
541 /* convert from UTF-8 to Unicode */ 541 /* convert from UTF-8 to Unicode */
542 utflen = utf8_mbtowc(&uni, &in[i], in_len - i); 542 utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
543 if (utflen < 0) { 543 if (utflen < 0)
544 goto conv_err; 544 goto conv_err;
545 }
546 545
547 /* convert from Unicode to nls */ 546 /* convert from Unicode to nls */
547 if (uni > MAX_WCHAR_T)
548 goto conv_err;
548 unilen = nls->uni2char(uni, &result[o], in_len - o); 549 unilen = nls->uni2char(uni, &result[o], in_len - o);
549 if (unilen < 0) { 550 if (unilen < 0)
550 goto conv_err; 551 goto conv_err;
551 }
552 } 552 }
553 result[o] = '\0'; 553 result[o] = '\0';
554 *out_len = o; 554 *out_len = o;
@@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in,
619 619
620 /* convert from nls to unicode */ 620 /* convert from nls to unicode */
621 unilen = nls->char2uni(&in[i], in_len - i, &uni); 621 unilen = nls->char2uni(&in[i], in_len - i, &uni);
622 if (unilen < 0) { 622 if (unilen < 0)
623 goto conv_err; 623 goto conv_err;
624 }
625 624
626 /* convert from unicode to UTF-8 */ 625 /* convert from unicode to UTF-8 */
627 utflen = utf8_wctomb(&result[o], uni, 3); 626 utflen = utf32_to_utf8(uni, &result[o], 3);
628 if (utflen <= 0) { 627 if (utflen <= 0)
629 goto conv_err; 628 goto conv_err;
630 }
631 } 629 }
632 630
633 result[o] = '\0'; 631 result[o] = '\0';
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index f3500294eec5..7c14c8cbbaba 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -22,6 +22,19 @@
22#include <asm/uaccess.h> 22#include <asm/uaccess.h>
23#include "fat.h" 23#include "fat.h"
24 24
25/*
26 * Maximum buffer size of short name.
27 * [(MSDOS_NAME + '.') * max one char + nul]
28 * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
29 */
30#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
31/*
32 * Maximum buffer size of unicode chars from slots.
33 * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
34 */
35#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1)
36#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
37
25static inline loff_t fat_make_i_pos(struct super_block *sb, 38static inline loff_t fat_make_i_pos(struct super_block *sb,
26 struct buffer_head *bh, 39 struct buffer_head *bh,
27 struct msdos_dir_entry *de) 40 struct msdos_dir_entry *de)
@@ -171,7 +184,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
171 unsigned char *buf, int size) 184 unsigned char *buf, int size)
172{ 185{
173 if (sbi->options.utf8) 186 if (sbi->options.utf8)
174 return utf8_wcstombs(buf, uni, size); 187 return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
188 UTF16_HOST_ENDIAN, buf, size);
175 else 189 else
176 return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, 190 return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
177 sbi->nls_io); 191 sbi->nls_io);
@@ -325,19 +339,6 @@ parse_long:
325} 339}
326 340
327/* 341/*
328 * Maximum buffer size of short name.
329 * [(MSDOS_NAME + '.') * max one char + nul]
330 * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
331 */
332#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
333/*
334 * Maximum buffer size of unicode chars from slots.
335 * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
336 */
337#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1)
338#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
339
340/*
341 * Return values: negative -> error, 0 -> not found, positive -> found, 342 * Return values: negative -> error, 0 -> not found, positive -> found,
342 * value is the total amount of slots, including the shortname entry. 343 * value is the total amount of slots, including the shortname entry.
343 */ 344 */
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index b50ecbe97f83..f92ad9995356 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -502,11 +502,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
502 if (utf8) { 502 if (utf8) {
503 int name_len = strlen(name); 503 int name_len = strlen(name);
504 504
505 *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX); 505 *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);
506 506
507 /* 507 /*
508 * We stripped '.'s before and set len appropriately, 508 * We stripped '.'s before and set len appropriately,
509 * but utf8_mbstowcs doesn't care about len 509 * but utf8s_to_utf16s doesn't care about len
510 */ 510 */
511 *outlen -= (name_len - len); 511 *outlen -= (name_len - len);
512 512
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 92c14b850e9c..a048de81c093 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
37 return (op - ascii); 37 return (op - ascii);
38} 38}
39 39
40/* Convert big endian wide character string to utf8 */
41static int
42wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen)
43{
44 const __u8 *ip;
45 __u8 *op;
46 int size;
47 __u16 c;
48
49 op = s;
50 ip = pwcs;
51 while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) {
52 c = (*ip << 8) | ip[1];
53 if (c > 0x7f) {
54 size = utf8_wctomb(op, c, maxlen);
55 if (size == -1) {
56 /* Ignore character and move on */
57 maxlen--;
58 } else {
59 op += size;
60 maxlen -= size;
61 }
62 } else {
63 *op++ = (__u8) c;
64 }
65 ip += 2;
66 inlen--;
67 }
68 return (op - s);
69}
70
71int 40int
72get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) 41get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
73{ 42{
@@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
79 nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; 48 nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
80 49
81 if (utf8) { 50 if (utf8) {
82 len = wcsntombs_be(outname, de->name, 51 len = utf16s_to_utf8s((const wchar_t *) de->name,
83 de->name_len[0] >> 1, PAGE_SIZE); 52 de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
53 outname, PAGE_SIZE);
84 } else { 54 } else {
85 len = uni16_to_x8(outname, (__be16 *) de->name, 55 len = uni16_to_x8(outname, (__be16 *) de->name,
86 de->name_len[0] >> 1, nls); 56 de->name_len[0] >> 1, nls);
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 97645f112114..0ec6237a5970 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen,
1113 1113
1114 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { 1114 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
1115 int k; 1115 int k;
1116 unicode_t u;
1116 1117
1117 k = utf8_mbtowc(&ec, iname, iname_end - iname); 1118 k = utf8_to_utf32(iname, iname_end - iname, &u);
1118 if (k < 0) 1119 if (k < 0 || u > MAX_WCHAR_T)
1119 return -EINVAL; 1120 return -EINVAL;
1120 iname += k; 1121 iname += k;
1122 ec = u;
1121 } else { 1123 } else {
1122 if (*iname == NCP_ESC) { 1124 if (*iname == NCP_ESC) {
1123 int k; 1125 int k;
@@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen,
1214 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { 1216 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
1215 int k; 1217 int k;
1216 1218
1217 k = utf8_wctomb(iname, ec, iname_end - iname); 1219 k = utf32_to_utf8(ec, iname, iname_end - iname);
1218 if (k < 0) { 1220 if (k < 0) {
1219 err = -ENAMETOOLONG; 1221 err = -ENAMETOOLONG;
1220 goto quit; 1222 goto quit;
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 750abf211e26..477d37d83b31 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -15,6 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/kmod.h> 16#include <linux/kmod.h>
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <asm/byteorder.h>
18 19
19static struct nls_table default_table; 20static struct nls_table default_table;
20static struct nls_table *tables = &default_table; 21static struct nls_table *tables = &default_table;
@@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] =
43 {0, /* end of table */} 44 {0, /* end of table */}
44}; 45};
45 46
46int 47#define UNICODE_MAX 0x0010ffff
47utf8_mbtowc(wchar_t *p, const __u8 *s, int n) 48#define PLANE_SIZE 0x00010000
49
50#define SURROGATE_MASK 0xfffff800
51#define SURROGATE_PAIR 0x0000d800
52#define SURROGATE_LOW 0x00000400
53#define SURROGATE_BITS 0x000003ff
54
55int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
48{ 56{
49 long l; 57 unsigned long l;
50 int c0, c, nc; 58 int c0, c, nc;
51 const struct utf8_table *t; 59 const struct utf8_table *t;
52 60
@@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
57 nc++; 65 nc++;
58 if ((c0 & t->cmask) == t->cval) { 66 if ((c0 & t->cmask) == t->cval) {
59 l &= t->lmask; 67 l &= t->lmask;
60 if (l < t->lval) 68 if (l < t->lval || l > UNICODE_MAX ||
69 (l & SURROGATE_MASK) == SURROGATE_PAIR)
61 return -1; 70 return -1;
62 *p = l; 71 *pu = (unicode_t) l;
63 return nc; 72 return nc;
64 } 73 }
65 if (n <= nc) 74 if (len <= nc)
66 return -1; 75 return -1;
67 s++; 76 s++;
68 c = (*s ^ 0x80) & 0xFF; 77 c = (*s ^ 0x80) & 0xFF;
@@ -72,76 +81,119 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
72 } 81 }
73 return -1; 82 return -1;
74} 83}
84EXPORT_SYMBOL(utf8_to_utf32);
75 85
76int 86int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
77utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n)
78{ 87{
79 __u16 *op; 88 unsigned long l;
80 const __u8 *ip;
81 int size;
82
83 op = pwcs;
84 ip = s;
85 while (*ip && n > 0) {
86 if (*ip & 0x80) {
87 size = utf8_mbtowc(op, ip, n);
88 if (size == -1) {
89 /* Ignore character and move on */
90 ip++;
91 n--;
92 } else {
93 op++;
94 ip += size;
95 n -= size;
96 }
97 } else {
98 *op++ = *ip++;
99 n--;
100 }
101 }
102 return (op - pwcs);
103}
104
105int
106utf8_wctomb(__u8 *s, wchar_t wc, int maxlen)
107{
108 long l;
109 int c, nc; 89 int c, nc;
110 const struct utf8_table *t; 90 const struct utf8_table *t;
111 91
112 if (!s) 92 if (!s)
113 return 0; 93 return 0;
114 94
115 l = wc; 95 l = u;
96 if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
97 return -1;
98
116 nc = 0; 99 nc = 0;
117 for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { 100 for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
118 nc++; 101 nc++;
119 if (l <= t->lmask) { 102 if (l <= t->lmask) {
120 c = t->shift; 103 c = t->shift;
121 *s = t->cval | (l >> c); 104 *s = (u8) (t->cval | (l >> c));
122 while (c > 0) { 105 while (c > 0) {
123 c -= 6; 106 c -= 6;
124 s++; 107 s++;
125 *s = 0x80 | ((l >> c) & 0x3F); 108 *s = (u8) (0x80 | ((l >> c) & 0x3F));
126 } 109 }
127 return nc; 110 return nc;
128 } 111 }
129 } 112 }
130 return -1; 113 return -1;
131} 114}
115EXPORT_SYMBOL(utf32_to_utf8);
132 116
133int 117int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
134utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
135{ 118{
136 const __u16 *ip; 119 u16 *op;
137 __u8 *op;
138 int size; 120 int size;
121 unicode_t u;
122
123 op = pwcs;
124 while (*s && len > 0) {
125 if (*s & 0x80) {
126 size = utf8_to_utf32(s, len, &u);
127 if (size < 0) {
128 /* Ignore character and move on */
129 size = 1;
130 } else if (u >= PLANE_SIZE) {
131 u -= PLANE_SIZE;
132 *op++ = (wchar_t) (SURROGATE_PAIR |
133 ((u >> 10) & SURROGATE_BITS));
134 *op++ = (wchar_t) (SURROGATE_PAIR |
135 SURROGATE_LOW |
136 (u & SURROGATE_BITS));
137 } else {
138 *op++ = (wchar_t) u;
139 }
140 s += size;
141 len -= size;
142 } else {
143 *op++ = *s++;
144 len--;
145 }
146 }
147 return op - pwcs;
148}
149EXPORT_SYMBOL(utf8s_to_utf16s);
150
151static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
152{
153 switch (endian) {
154 default:
155 return c;
156 case UTF16_LITTLE_ENDIAN:
157 return __le16_to_cpu(c);
158 case UTF16_BIG_ENDIAN:
159 return __be16_to_cpu(c);
160 }
161}
162
163int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
164 u8 *s, int maxlen)
165{
166 u8 *op;
167 int size;
168 unsigned long u, v;
139 169
140 op = s; 170 op = s;
141 ip = pwcs; 171 while (len > 0 && maxlen > 0) {
142 while (*ip && maxlen > 0) { 172 u = get_utf16(*pwcs, endian);
143 if (*ip > 0x7f) { 173 if (!u)
144 size = utf8_wctomb(op, *ip, maxlen); 174 break;
175 pwcs++;
176 len--;
177 if (u > 0x7f) {
178 if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
179 if (u & SURROGATE_LOW) {
180 /* Ignore character and move on */
181 continue;
182 }
183 if (len <= 0)
184 break;
185 v = get_utf16(*pwcs, endian);
186 if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
187 !(v & SURROGATE_LOW)) {
188 /* Ignore character and move on */
189 continue;
190 }
191 u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
192 + (v & SURROGATE_BITS);
193 pwcs++;
194 len--;
195 }
196 size = utf32_to_utf8(u, op, maxlen);
145 if (size == -1) { 197 if (size == -1) {
146 /* Ignore character and move on */ 198 /* Ignore character and move on */
147 } else { 199 } else {
@@ -149,13 +201,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
149 maxlen -= size; 201 maxlen -= size;
150 } 202 }
151 } else { 203 } else {
152 *op++ = (__u8) *ip; 204 *op++ = (u8) u;
153 maxlen--; 205 maxlen--;
154 } 206 }
155 ip++;
156 } 207 }
157 return (op - s); 208 return op - s;
158} 209}
210EXPORT_SYMBOL(utf16s_to_utf8s);
159 211
160int register_nls(struct nls_table * nls) 212int register_nls(struct nls_table * nls)
161{ 213{
@@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls);
467EXPORT_SYMBOL(unload_nls); 519EXPORT_SYMBOL(unload_nls);
468EXPORT_SYMBOL(load_nls); 520EXPORT_SYMBOL(load_nls);
469EXPORT_SYMBOL(load_nls_default); 521EXPORT_SYMBOL(load_nls_default);
470EXPORT_SYMBOL(utf8_mbtowc);
471EXPORT_SYMBOL(utf8_mbstowcs);
472EXPORT_SYMBOL(utf8_wctomb);
473EXPORT_SYMBOL(utf8_wcstombs);
474 522
475MODULE_LICENSE("Dual BSD/GPL"); 523MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index aa2c42fdd977..0d60a44acacd 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
15{ 15{
16 int n; 16 int n;
17 17
18 if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) { 18 if (boundlen <= 0)
19 return -ENAMETOOLONG;
20
21 n = utf32_to_utf8(uni, out, boundlen);
22 if (n < 0) {
19 *out = '?'; 23 *out = '?';
20 return -EINVAL; 24 return -EINVAL;
21 } 25 }
@@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
25static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) 29static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
26{ 30{
27 int n; 31 int n;
32 unicode_t u;
28 33
29 if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) { 34 n = utf8_to_utf32(rawstring, boundlen, &u);
35 if (n < 0 || u > MAX_WCHAR_T) {
30 *uni = 0x003f; /* ? */ 36 *uni = 0x003f; /* ? */
31 n = -EINVAL; 37 return -EINVAL;
32 } 38 }
39 *uni = (wchar_t) u;
33 return n; 40 return n;
34} 41}
35 42