diff options
| author | Alan Stern <stern@rowland.harvard.edu> | 2009-04-30 10:08:18 -0400 |
|---|---|---|
| committer | Greg Kroah-Hartman <gregkh@suse.de> | 2009-06-16 00:44:43 -0400 |
| commit | 74675a58507e769beee7d949dbed788af3c4139d (patch) | |
| tree | d4ae3cc06dbfadecf1eaf6ed0aef249fc87b07e6 /fs/nls | |
| parent | a853a3d4eb2edb066248a39f0634f6f5858816a0 (diff) | |
NLS: update handling of Unicode
This patch (as1239) updates the kernel's treatment of Unicode. The
character-set conversion routines are well behind the current state of
the Unicode specification: They don't recognize the existence of code
points beyond plane 0 or of surrogate pairs in the UTF-16 encoding.
The old wchar_t 16-bit type is retained because it's still used in
lots of places. This shouldn't cause any new problems; if a
conversion now results in an invalid 16-bit code, then previously it
must have yielded an undefined code.
Difficult-to-read names like "utf_mbstowcs" are replaced with more
transparent names like "utf8s_to_utf16s" and the ordering of the
parameters is rationalized (buffer lengths come immediately after the
pointers they refer to, and the inputs precede the outputs).
Fortunately the low-level conversion routines are used in only a few
places; the interfaces to the higher-level uni2char and char2uni
methods have been left unchanged.
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Clemens Ladisch <clemens@ladisch.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Diffstat (limited to 'fs/nls')
| -rw-r--r-- | fs/nls/nls_base.c | 164 | ||||
| -rw-r--r-- | fs/nls/nls_utf8.c | 13 |
2 files changed, 116 insertions, 61 deletions
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 750abf211e2..477d37d83b3 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/errno.h> | 15 | #include <linux/errno.h> |
| 16 | #include <linux/kmod.h> | 16 | #include <linux/kmod.h> |
| 17 | #include <linux/spinlock.h> | 17 | #include <linux/spinlock.h> |
| 18 | #include <asm/byteorder.h> | ||
| 18 | 19 | ||
| 19 | static struct nls_table default_table; | 20 | static struct nls_table default_table; |
| 20 | static struct nls_table *tables = &default_table; | 21 | static struct nls_table *tables = &default_table; |
| @@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] = | |||
| 43 | {0, /* end of table */} | 44 | {0, /* end of table */} |
| 44 | }; | 45 | }; |
| 45 | 46 | ||
| 46 | int | 47 | #define UNICODE_MAX 0x0010ffff |
| 47 | utf8_mbtowc(wchar_t *p, const __u8 *s, int n) | 48 | #define PLANE_SIZE 0x00010000 |
| 49 | |||
| 50 | #define SURROGATE_MASK 0xfffff800 | ||
| 51 | #define SURROGATE_PAIR 0x0000d800 | ||
| 52 | #define SURROGATE_LOW 0x00000400 | ||
| 53 | #define SURROGATE_BITS 0x000003ff | ||
| 54 | |||
| 55 | int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) | ||
| 48 | { | 56 | { |
| 49 | long l; | 57 | unsigned long l; |
| 50 | int c0, c, nc; | 58 | int c0, c, nc; |
| 51 | const struct utf8_table *t; | 59 | const struct utf8_table *t; |
| 52 | 60 | ||
| @@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) | |||
| 57 | nc++; | 65 | nc++; |
| 58 | if ((c0 & t->cmask) == t->cval) { | 66 | if ((c0 & t->cmask) == t->cval) { |
| 59 | l &= t->lmask; | 67 | l &= t->lmask; |
| 60 | if (l < t->lval) | 68 | if (l < t->lval || l > UNICODE_MAX || |
| 69 | (l & SURROGATE_MASK) == SURROGATE_PAIR) | ||
| 61 | return -1; | 70 | return -1; |
| 62 | *p = l; | 71 | *pu = (unicode_t) l; |
| 63 | return nc; | 72 | return nc; |
| 64 | } | 73 | } |
| 65 | if (n <= nc) | 74 | if (len <= nc) |
| 66 | return -1; | 75 | return -1; |
| 67 | s++; | 76 | s++; |
| 68 | c = (*s ^ 0x80) & 0xFF; | 77 | c = (*s ^ 0x80) & 0xFF; |
| @@ -72,76 +81,119 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) | |||
| 72 | } | 81 | } |
| 73 | return -1; | 82 | return -1; |
| 74 | } | 83 | } |
| 84 | EXPORT_SYMBOL(utf8_to_utf32); | ||
| 75 | 85 | ||
| 76 | int | 86 | int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) |
| 77 | utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n) | ||
| 78 | { | 87 | { |
| 79 | __u16 *op; | 88 | unsigned long l; |
| 80 | const __u8 *ip; | ||
| 81 | int size; | ||
| 82 | |||
| 83 | op = pwcs; | ||
| 84 | ip = s; | ||
| 85 | while (*ip && n > 0) { | ||
| 86 | if (*ip & 0x80) { | ||
| 87 | size = utf8_mbtowc(op, ip, n); | ||
| 88 | if (size == -1) { | ||
| 89 | /* Ignore character and move on */ | ||
| 90 | ip++; | ||
| 91 | n--; | ||
| 92 | } else { | ||
| 93 | op++; | ||
| 94 | ip += size; | ||
| 95 | n -= size; | ||
| 96 | } | ||
| 97 | } else { | ||
| 98 | *op++ = *ip++; | ||
| 99 | n--; | ||
| 100 | } | ||
| 101 | } | ||
| 102 | return (op - pwcs); | ||
| 103 | } | ||
| 104 | |||
| 105 | int | ||
| 106 | utf8_wctomb(__u8 *s, wchar_t wc, int maxlen) | ||
| 107 | { | ||
| 108 | long l; | ||
| 109 | int c, nc; | 89 | int c, nc; |
| 110 | const struct utf8_table *t; | 90 | const struct utf8_table *t; |
| 111 | 91 | ||
| 112 | if (!s) | 92 | if (!s) |
| 113 | return 0; | 93 | return 0; |
| 114 | 94 | ||
| 115 | l = wc; | 95 | l = u; |
| 96 | if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) | ||
| 97 | return -1; | ||
| 98 | |||
| 116 | nc = 0; | 99 | nc = 0; |
| 117 | for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { | 100 | for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { |
| 118 | nc++; | 101 | nc++; |
| 119 | if (l <= t->lmask) { | 102 | if (l <= t->lmask) { |
| 120 | c = t->shift; | 103 | c = t->shift; |
| 121 | *s = t->cval | (l >> c); | 104 | *s = (u8) (t->cval | (l >> c)); |
| 122 | while (c > 0) { | 105 | while (c > 0) { |
| 123 | c -= 6; | 106 | c -= 6; |
| 124 | s++; | 107 | s++; |
| 125 | *s = 0x80 | ((l >> c) & 0x3F); | 108 | *s = (u8) (0x80 | ((l >> c) & 0x3F)); |
| 126 | } | 109 | } |
| 127 | return nc; | 110 | return nc; |
| 128 | } | 111 | } |
| 129 | } | 112 | } |
| 130 | return -1; | 113 | return -1; |
| 131 | } | 114 | } |
| 115 | EXPORT_SYMBOL(utf32_to_utf8); | ||
| 132 | 116 | ||
| 133 | int | 117 | int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) |
| 134 | utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) | ||
| 135 | { | 118 | { |
| 136 | const __u16 *ip; | 119 | u16 *op; |
| 137 | __u8 *op; | ||
| 138 | int size; | 120 | int size; |
| 121 | unicode_t u; | ||
| 122 | |||
| 123 | op = pwcs; | ||
| 124 | while (*s && len > 0) { | ||
| 125 | if (*s & 0x80) { | ||
| 126 | size = utf8_to_utf32(s, len, &u); | ||
| 127 | if (size < 0) { | ||
| 128 | /* Ignore character and move on */ | ||
| 129 | size = 1; | ||
| 130 | } else if (u >= PLANE_SIZE) { | ||
| 131 | u -= PLANE_SIZE; | ||
| 132 | *op++ = (wchar_t) (SURROGATE_PAIR | | ||
| 133 | ((u >> 10) & SURROGATE_BITS)); | ||
| 134 | *op++ = (wchar_t) (SURROGATE_PAIR | | ||
| 135 | SURROGATE_LOW | | ||
| 136 | (u & SURROGATE_BITS)); | ||
| 137 | } else { | ||
| 138 | *op++ = (wchar_t) u; | ||
| 139 | } | ||
| 140 | s += size; | ||
| 141 | len -= size; | ||
| 142 | } else { | ||
| 143 | *op++ = *s++; | ||
| 144 | len--; | ||
| 145 | } | ||
| 146 | } | ||
| 147 | return op - pwcs; | ||
| 148 | } | ||
| 149 | EXPORT_SYMBOL(utf8s_to_utf16s); | ||
| 150 | |||
| 151 | static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) | ||
| 152 | { | ||
| 153 | switch (endian) { | ||
| 154 | default: | ||
| 155 | return c; | ||
| 156 | case UTF16_LITTLE_ENDIAN: | ||
| 157 | return __le16_to_cpu(c); | ||
| 158 | case UTF16_BIG_ENDIAN: | ||
| 159 | return __be16_to_cpu(c); | ||
| 160 | } | ||
| 161 | } | ||
| 162 | |||
| 163 | int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, | ||
| 164 | u8 *s, int maxlen) | ||
| 165 | { | ||
| 166 | u8 *op; | ||
| 167 | int size; | ||
| 168 | unsigned long u, v; | ||
| 139 | 169 | ||
| 140 | op = s; | 170 | op = s; |
| 141 | ip = pwcs; | 171 | while (len > 0 && maxlen > 0) { |
| 142 | while (*ip && maxlen > 0) { | 172 | u = get_utf16(*pwcs, endian); |
| 143 | if (*ip > 0x7f) { | 173 | if (!u) |
| 144 | size = utf8_wctomb(op, *ip, maxlen); | 174 | break; |
| 175 | pwcs++; | ||
| 176 | len--; | ||
| 177 | if (u > 0x7f) { | ||
| 178 | if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { | ||
| 179 | if (u & SURROGATE_LOW) { | ||
| 180 | /* Ignore character and move on */ | ||
| 181 | continue; | ||
| 182 | } | ||
| 183 | if (len <= 0) | ||
| 184 | break; | ||
| 185 | v = get_utf16(*pwcs, endian); | ||
| 186 | if ((v & SURROGATE_MASK) != SURROGATE_PAIR || | ||
| 187 | !(v & SURROGATE_LOW)) { | ||
| 188 | /* Ignore character and move on */ | ||
| 189 | continue; | ||
| 190 | } | ||
| 191 | u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) | ||
| 192 | + (v & SURROGATE_BITS); | ||
| 193 | pwcs++; | ||
| 194 | len--; | ||
| 195 | } | ||
| 196 | size = utf32_to_utf8(u, op, maxlen); | ||
| 145 | if (size == -1) { | 197 | if (size == -1) { |
| 146 | /* Ignore character and move on */ | 198 | /* Ignore character and move on */ |
| 147 | } else { | 199 | } else { |
| @@ -149,13 +201,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) | |||
| 149 | maxlen -= size; | 201 | maxlen -= size; |
| 150 | } | 202 | } |
| 151 | } else { | 203 | } else { |
| 152 | *op++ = (__u8) *ip; | 204 | *op++ = (u8) u; |
| 153 | maxlen--; | 205 | maxlen--; |
| 154 | } | 206 | } |
| 155 | ip++; | ||
| 156 | } | 207 | } |
| 157 | return (op - s); | 208 | return op - s; |
| 158 | } | 209 | } |
| 210 | EXPORT_SYMBOL(utf16s_to_utf8s); | ||
| 159 | 211 | ||
| 160 | int register_nls(struct nls_table * nls) | 212 | int register_nls(struct nls_table * nls) |
| 161 | { | 213 | { |
| @@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls); | |||
| 467 | EXPORT_SYMBOL(unload_nls); | 519 | EXPORT_SYMBOL(unload_nls); |
| 468 | EXPORT_SYMBOL(load_nls); | 520 | EXPORT_SYMBOL(load_nls); |
| 469 | EXPORT_SYMBOL(load_nls_default); | 521 | EXPORT_SYMBOL(load_nls_default); |
| 470 | EXPORT_SYMBOL(utf8_mbtowc); | ||
| 471 | EXPORT_SYMBOL(utf8_mbstowcs); | ||
| 472 | EXPORT_SYMBOL(utf8_wctomb); | ||
| 473 | EXPORT_SYMBOL(utf8_wcstombs); | ||
| 474 | 522 | ||
| 475 | MODULE_LICENSE("Dual BSD/GPL"); | 523 | MODULE_LICENSE("Dual BSD/GPL"); |
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index aa2c42fdd97..0d60a44acac 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c | |||
| @@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) | |||
| 15 | { | 15 | { |
| 16 | int n; | 16 | int n; |
| 17 | 17 | ||
| 18 | if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) { | 18 | if (boundlen <= 0) |
| 19 | return -ENAMETOOLONG; | ||
| 20 | |||
| 21 | n = utf32_to_utf8(uni, out, boundlen); | ||
| 22 | if (n < 0) { | ||
| 19 | *out = '?'; | 23 | *out = '?'; |
| 20 | return -EINVAL; | 24 | return -EINVAL; |
| 21 | } | 25 | } |
| @@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) | |||
| 25 | static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) | 29 | static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) |
| 26 | { | 30 | { |
| 27 | int n; | 31 | int n; |
| 32 | unicode_t u; | ||
| 28 | 33 | ||
| 29 | if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) { | 34 | n = utf8_to_utf32(rawstring, boundlen, &u); |
| 35 | if (n < 0 || u > MAX_WCHAR_T) { | ||
| 30 | *uni = 0x003f; /* ? */ | 36 | *uni = 0x003f; /* ? */ |
| 31 | n = -EINVAL; | 37 | return -EINVAL; |
| 32 | } | 38 | } |
| 39 | *uni = (wchar_t) u; | ||
| 33 | return n; | 40 | return n; |
| 34 | } | 41 | } |
| 35 | 42 | ||
