diff options
author | Alan Stern <stern@rowland.harvard.edu> | 2009-04-30 10:08:18 -0400 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@suse.de> | 2009-06-16 00:44:43 -0400 |
commit | 74675a58507e769beee7d949dbed788af3c4139d (patch) | |
tree | d4ae3cc06dbfadecf1eaf6ed0aef249fc87b07e6 /include | |
parent | a853a3d4eb2edb066248a39f0634f6f5858816a0 (diff) |
NLS: update handling of Unicode
This patch (as1239) updates the kernel's treatment of Unicode. The
character-set conversion routines are well behind the current state of
the Unicode specification: They don't recognize the existence of code
points beyond plane 0 or of surrogate pairs in the UTF-16 encoding.
The old wchar_t 16-bit type is retained because it's still used in
lots of places. This shouldn't cause any new problems; if a
conversion now results in an invalid 16-bit code, then previously it
must have yielded an undefined code.
Difficult-to-read names like "utf_mbstowcs" are replaced with more
transparent names like "utf8s_to_utf16s" and the ordering of the
parameters is rationalized (buffer lengths come immediately after the
pointers they refer to, and the inputs precede the outputs).
Fortunately the low-level conversion routines are used in only a few
places; the interfaces to the higher-level uni2char and char2uni
methods have been left unchanged.
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Clemens Ladisch <clemens@ladisch.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/nls.h | 35 |
1 files changed, 29 insertions, 6 deletions
diff --git a/include/linux/nls.h b/include/linux/nls.h index 52b1a76c1b43..d47beef08dfd 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h | |||
@@ -3,8 +3,23 @@ | |||
3 | 3 | ||
4 | #include <linux/init.h> | 4 | #include <linux/init.h> |
5 | 5 | ||
6 | /* unicode character */ | 6 | /* Unicode has changed over the years. Unicode code points no longer |
7 | typedef __u16 wchar_t; | 7 | * fit into 16 bits; as of Unicode 5 valid code points range from 0 |
8 | * to 0x10ffff (17 planes, where each plane holds 65536 code points). | ||
9 | * | ||
10 | * The original decision to represent Unicode characters as 16-bit | ||
11 | * wchar_t values is now outdated. But plane 0 still includes the | ||
12 | * most commonly used characters, so we will retain it. The newer | ||
13 | * 32-bit unicode_t type can be used when it is necessary to | ||
14 | * represent the full Unicode character set. | ||
15 | */ | ||
16 | |||
17 | /* Plane-0 Unicode character */ | ||
18 | typedef u16 wchar_t; | ||
19 | #define MAX_WCHAR_T 0xffff | ||
20 | |||
21 | /* Arbitrary Unicode character */ | ||
22 | typedef u32 unicode_t; | ||
8 | 23 | ||
9 | struct nls_table { | 24 | struct nls_table { |
10 | const char *charset; | 25 | const char *charset; |
@@ -21,6 +36,13 @@ struct nls_table { | |||
21 | /* this value hold the maximum octet of charset */ | 36 | /* this value hold the maximum octet of charset */ |
22 | #define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */ | 37 | #define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */ |
23 | 38 | ||
39 | /* Byte order for UTF-16 strings */ | ||
40 | enum utf16_endian { | ||
41 | UTF16_HOST_ENDIAN, | ||
42 | UTF16_LITTLE_ENDIAN, | ||
43 | UTF16_BIG_ENDIAN | ||
44 | }; | ||
45 | |||
24 | /* nls.c */ | 46 | /* nls.c */ |
25 | extern int register_nls(struct nls_table *); | 47 | extern int register_nls(struct nls_table *); |
26 | extern int unregister_nls(struct nls_table *); | 48 | extern int unregister_nls(struct nls_table *); |
@@ -28,10 +50,11 @@ extern struct nls_table *load_nls(char *); | |||
28 | extern void unload_nls(struct nls_table *); | 50 | extern void unload_nls(struct nls_table *); |
29 | extern struct nls_table *load_nls_default(void); | 51 | extern struct nls_table *load_nls_default(void); |
30 | 52 | ||
31 | extern int utf8_mbtowc(wchar_t *, const __u8 *, int); | 53 | extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu); |
32 | extern int utf8_mbstowcs(wchar_t *, const __u8 *, int); | 54 | extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen); |
33 | extern int utf8_wctomb(__u8 *, wchar_t, int); | 55 | extern int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs); |
34 | extern int utf8_wcstombs(__u8 *, const wchar_t *, int); | 56 | extern int utf16s_to_utf8s(const wchar_t *pwcs, int len, |
57 | enum utf16_endian endian, u8 *s, int maxlen); | ||
35 | 58 | ||
36 | static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c) | 59 | static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c) |
37 | { | 60 | { |