summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jan Kara <jack@suse.cz> 2018-04-16 11:30:14 -0400
committer: Jan Kara <jack@suse.cz> 2018-04-19 10:00:48 -0400
commit: ef2e18f1fa3958f8ee1a38acaebb6991d49dce18 (patch)
tree: 01f660cf8b584674b178200f89e4eb529306e2a0
parent: d504adc29142755edda4ef0f24ec81b7088564a4 (diff)
udf: Add support for encoding UTF-16 characters
Add support to store characters outside of Base Multilingual Plane of UTF-16 in CS0 encoding of UDF.

Signed-off-by: Jan Kara <jack@suse.cz>
-rw-r--r-- fs/udf/unicode.c 79
1 files changed, 43 insertions, 36 deletions
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 329be783f98a..616ffee441c5 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,9 +28,13 @@
28 28
29#include "udf_sb.h" 29#include "udf_sb.h"
30 30
31#define PLANE_SIZE 0x10000
31#define UNICODE_MAX 0x10ffff 32#define UNICODE_MAX 0x10ffff
32#define SURROGATE_MASK 0xfffff800 33#define SURROGATE_MASK 0xfffff800
33#define SURROGATE_PAIR 0x0000d800 34#define SURROGATE_PAIR 0x0000d800
35#define SURROGATE_LOW 0x00000400
36#define SURROGATE_CHAR_BITS 10
37#define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1)
34 38
35static int udf_uni2char_utf8(wchar_t uni, 39static int udf_uni2char_utf8(wchar_t uni,
36 unsigned char *out, 40 unsigned char *out,
@@ -51,26 +55,6 @@ static int udf_uni2char_utf8(wchar_t uni,
51 return u_len; 55 return u_len;
52} 56}
53 57
54static int udf_char2uni_utf8(const unsigned char *in,
55 int boundlen,
56 wchar_t *uni)
57{
58 int u_len;
59 unicode_t c;
60
61 u_len = utf8_to_utf32(in, boundlen, &c);
62 if (u_len < 0) {
63 *uni = '?';
64 return -EINVAL;
65 }
66
67 if (c > MAX_WCHAR_T)
68 *uni = '?';
69 else
70 *uni = c;
71 return u_len;
72}
73
74#define ILLEGAL_CHAR_MARK '_' 58#define ILLEGAL_CHAR_MARK '_'
75#define EXT_MARK '.' 59#define EXT_MARK '.'
76#define CRC_MARK '#' 60#define CRC_MARK '#'
@@ -261,19 +245,17 @@ static int udf_name_to_CS0(struct super_block *sb,
261{ 245{
262 int i, len; 246 int i, len;
263 unsigned int max_val; 247 unsigned int max_val;
264 wchar_t uni_char;
265 int u_len, u_ch; 248 int u_len, u_ch;
249 unicode_t uni_char;
266 int (*conv_f)(const unsigned char *, int, wchar_t *); 250 int (*conv_f)(const unsigned char *, int, wchar_t *);
267 251
268 if (ocu_max_len <= 0) 252 if (ocu_max_len <= 0)
269 return 0; 253 return 0;
270 254
271 if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { 255 if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
272 conv_f = udf_char2uni_utf8;
273 } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
274 conv_f = UDF_SB(sb)->s_nls_map->char2uni; 256 conv_f = UDF_SB(sb)->s_nls_map->char2uni;
275 } else 257 else
276 BUG(); 258 conv_f = NULL;
277 259
278 memset(ocu, 0, ocu_max_len); 260 memset(ocu, 0, ocu_max_len);
279 ocu[0] = 8; 261 ocu[0] = 8;
@@ -282,30 +264,55 @@ static int udf_name_to_CS0(struct super_block *sb,
282 264
283try_again: 265try_again:
284 u_len = 1; 266 u_len = 1;
285 for (i = 0; i < str_len; i++) { 267 for (i = 0; i < str_len; i += len) {
286 /* Name didn't fit? */ 268 /* Name didn't fit? */
287 if (u_len + u_ch > ocu_max_len) 269 if (u_len + u_ch > ocu_max_len)
288 return 0; 270 return 0;
289 len = conv_f(&str_i[i], str_len - i, &uni_char); 271 if (conv_f) {
290 if (!len) 272 wchar_t wchar;
291 continue; 273
274 len = conv_f(&str_i[i], str_len - i, &wchar);
275 if (len > 0)
276 uni_char = wchar;
277 } else {
278 len = utf8_to_utf32(&str_i[i], str_len - i,
279 &uni_char);
280 }
292 /* Invalid character, deal with it */ 281 /* Invalid character, deal with it */
293 if (len < 0) { 282 if (len <= 0 || uni_char > UNICODE_MAX) {
294 len = 1; 283 len = 1;
295 uni_char = '?'; 284 uni_char = '?';
296 } 285 }
297 286
298 if (uni_char > max_val) { 287 if (uni_char > max_val) {
299 max_val = 0xffff; 288 unicode_t c;
300 ocu[0] = 0x10; 289
301 u_ch = 2; 290 if (max_val == 0xff) {
302 goto try_again; 291 max_val = 0xffff;
292 ocu[0] = 0x10;
293 u_ch = 2;
294 goto try_again;
295 }
296 /*
297 * Use UTF-16 encoding for chars outside we
298 * cannot encode directly.
299 */
300 if (u_len + 2 * u_ch > ocu_max_len)
301 return 0;
302
303 uni_char -= PLANE_SIZE;
304 c = SURROGATE_PAIR |
305 ((uni_char >> SURROGATE_CHAR_BITS) &
306 SURROGATE_CHAR_MASK);
307 ocu[u_len++] = (uint8_t)(c >> 8);
308 ocu[u_len++] = (uint8_t)(c & 0xff);
309 uni_char = SURROGATE_PAIR | SURROGATE_LOW |
310 (uni_char & SURROGATE_CHAR_MASK);
303 } 311 }
304 312
305 if (max_val == 0xffff) 313 if (max_val == 0xffff)
306 ocu[u_len++] = (uint8_t)(uni_char >> 8); 314 ocu[u_len++] = (uint8_t)(uni_char >> 8);
307 ocu[u_len++] = (uint8_t)(uni_char & 0xff); 315 ocu[u_len++] = (uint8_t)(uni_char & 0xff);
308 i += len - 1;
309 } 316 }
310 317
311 return u_len; 318 return u_len;