diff options
author | Jan Kara <jack@suse.cz> | 2018-04-16 11:30:14 -0400 |
---|---|---|
committer | Jan Kara <jack@suse.cz> | 2018-04-19 10:00:48 -0400 |
commit | ef2e18f1fa3958f8ee1a38acaebb6991d49dce18 (patch) | |
tree | 01f660cf8b584674b178200f89e4eb529306e2a0 | |
parent | d504adc29142755edda4ef0f24ec81b7088564a4 (diff) |
udf: Add support for encoding UTF-16 characters
Add support for storing characters outside the Basic Multilingual Plane
(encoded as UTF-16 surrogate pairs) in the CS0 encoding of UDF.
Signed-off-by: Jan Kara <jack@suse.cz>
-rw-r--r-- | fs/udf/unicode.c | 79 |
1 files changed, 43 insertions, 36 deletions
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 329be783f98a..616ffee441c5 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c | |||
@@ -28,9 +28,13 @@ | |||
28 | 28 | ||
29 | #include "udf_sb.h" | 29 | #include "udf_sb.h" |
30 | 30 | ||
31 | #define PLANE_SIZE 0x10000 | ||
31 | #define UNICODE_MAX 0x10ffff | 32 | #define UNICODE_MAX 0x10ffff |
32 | #define SURROGATE_MASK 0xfffff800 | 33 | #define SURROGATE_MASK 0xfffff800 |
33 | #define SURROGATE_PAIR 0x0000d800 | 34 | #define SURROGATE_PAIR 0x0000d800 |
35 | #define SURROGATE_LOW 0x00000400 | ||
36 | #define SURROGATE_CHAR_BITS 10 | ||
37 | #define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1) | ||
34 | 38 | ||
35 | static int udf_uni2char_utf8(wchar_t uni, | 39 | static int udf_uni2char_utf8(wchar_t uni, |
36 | unsigned char *out, | 40 | unsigned char *out, |
@@ -51,26 +55,6 @@ static int udf_uni2char_utf8(wchar_t uni, | |||
51 | return u_len; | 55 | return u_len; |
52 | } | 56 | } |
53 | 57 | ||
54 | static int udf_char2uni_utf8(const unsigned char *in, | ||
55 | int boundlen, | ||
56 | wchar_t *uni) | ||
57 | { | ||
58 | int u_len; | ||
59 | unicode_t c; | ||
60 | |||
61 | u_len = utf8_to_utf32(in, boundlen, &c); | ||
62 | if (u_len < 0) { | ||
63 | *uni = '?'; | ||
64 | return -EINVAL; | ||
65 | } | ||
66 | |||
67 | if (c > MAX_WCHAR_T) | ||
68 | *uni = '?'; | ||
69 | else | ||
70 | *uni = c; | ||
71 | return u_len; | ||
72 | } | ||
73 | |||
74 | #define ILLEGAL_CHAR_MARK '_' | 58 | #define ILLEGAL_CHAR_MARK '_' |
75 | #define EXT_MARK '.' | 59 | #define EXT_MARK '.' |
76 | #define CRC_MARK '#' | 60 | #define CRC_MARK '#' |
@@ -261,19 +245,17 @@ static int udf_name_to_CS0(struct super_block *sb, | |||
261 | { | 245 | { |
262 | int i, len; | 246 | int i, len; |
263 | unsigned int max_val; | 247 | unsigned int max_val; |
264 | wchar_t uni_char; | ||
265 | int u_len, u_ch; | 248 | int u_len, u_ch; |
249 | unicode_t uni_char; | ||
266 | int (*conv_f)(const unsigned char *, int, wchar_t *); | 250 | int (*conv_f)(const unsigned char *, int, wchar_t *); |
267 | 251 | ||
268 | if (ocu_max_len <= 0) | 252 | if (ocu_max_len <= 0) |
269 | return 0; | 253 | return 0; |
270 | 254 | ||
271 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { | 255 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) |
272 | conv_f = udf_char2uni_utf8; | ||
273 | } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { | ||
274 | conv_f = UDF_SB(sb)->s_nls_map->char2uni; | 256 | conv_f = UDF_SB(sb)->s_nls_map->char2uni; |
275 | } else | 257 | else |
276 | BUG(); | 258 | conv_f = NULL; |
277 | 259 | ||
278 | memset(ocu, 0, ocu_max_len); | 260 | memset(ocu, 0, ocu_max_len); |
279 | ocu[0] = 8; | 261 | ocu[0] = 8; |
@@ -282,30 +264,55 @@ static int udf_name_to_CS0(struct super_block *sb, | |||
282 | 264 | ||
283 | try_again: | 265 | try_again: |
284 | u_len = 1; | 266 | u_len = 1; |
285 | for (i = 0; i < str_len; i++) { | 267 | for (i = 0; i < str_len; i += len) { |
286 | /* Name didn't fit? */ | 268 | /* Name didn't fit? */ |
287 | if (u_len + u_ch > ocu_max_len) | 269 | if (u_len + u_ch > ocu_max_len) |
288 | return 0; | 270 | return 0; |
289 | len = conv_f(&str_i[i], str_len - i, &uni_char); | 271 | if (conv_f) { |
290 | if (!len) | 272 | wchar_t wchar; |
291 | continue; | 273 | |
274 | len = conv_f(&str_i[i], str_len - i, &wchar); | ||
275 | if (len > 0) | ||
276 | uni_char = wchar; | ||
277 | } else { | ||
278 | len = utf8_to_utf32(&str_i[i], str_len - i, | ||
279 | &uni_char); | ||
280 | } | ||
292 | /* Invalid character, deal with it */ | 281 | /* Invalid character, deal with it */ |
293 | if (len < 0) { | 282 | if (len <= 0 || uni_char > UNICODE_MAX) { |
294 | len = 1; | 283 | len = 1; |
295 | uni_char = '?'; | 284 | uni_char = '?'; |
296 | } | 285 | } |
297 | 286 | ||
298 | if (uni_char > max_val) { | 287 | if (uni_char > max_val) { |
299 | max_val = 0xffff; | 288 | unicode_t c; |
300 | ocu[0] = 0x10; | 289 | |
301 | u_ch = 2; | 290 | if (max_val == 0xff) { |
302 | goto try_again; | 291 | max_val = 0xffff; |
292 | ocu[0] = 0x10; | ||
293 | u_ch = 2; | ||
294 | goto try_again; | ||
295 | } | ||
296 | /* | ||
297 | * Use UTF-16 surrogate pairs for chars that we | ||
298 | * cannot encode directly. | ||
299 | */ | ||
300 | if (u_len + 2 * u_ch > ocu_max_len) | ||
301 | return 0; | ||
302 | |||
303 | uni_char -= PLANE_SIZE; | ||
304 | c = SURROGATE_PAIR | | ||
305 | ((uni_char >> SURROGATE_CHAR_BITS) & | ||
306 | SURROGATE_CHAR_MASK); | ||
307 | ocu[u_len++] = (uint8_t)(c >> 8); | ||
308 | ocu[u_len++] = (uint8_t)(c & 0xff); | ||
309 | uni_char = SURROGATE_PAIR | SURROGATE_LOW | | ||
310 | (uni_char & SURROGATE_CHAR_MASK); | ||
303 | } | 311 | } |
304 | 312 | ||
305 | if (max_val == 0xffff) | 313 | if (max_val == 0xffff) |
306 | ocu[u_len++] = (uint8_t)(uni_char >> 8); | 314 | ocu[u_len++] = (uint8_t)(uni_char >> 8); |
307 | ocu[u_len++] = (uint8_t)(uni_char & 0xff); | 315 | ocu[u_len++] = (uint8_t)(uni_char & 0xff); |
308 | i += len - 1; | ||
309 | } | 316 | } |
310 | 317 | ||
311 | return u_len; | 318 | return u_len; |