diff options
Diffstat (limited to 'fs/udf/unicode.c')
-rw-r--r-- | fs/udf/unicode.c | 516 |
1 files changed, 516 insertions, 0 deletions
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c new file mode 100644 index 000000000000..5a80efd8debc --- /dev/null +++ b/fs/udf/unicode.c | |||
@@ -0,0 +1,516 @@ | |||
1 | /* | ||
2 | * unicode.c | ||
3 | * | ||
4 | * PURPOSE | ||
5 | * Routines for converting between UTF-8 and OSTA Compressed Unicode. | ||
6 | * Also handles filename mangling | ||
7 | * | ||
8 | * DESCRIPTION | ||
9 | * OSTA Compressed Unicode is explained in the OSTA UDF specification. | ||
10 | * http://www.osta.org/ | ||
11 | * UTF-8 is explained in IETF RFC 3629. | ||
12 | * https://www.rfc-editor.org/rfc/rfc3629.txt | ||
13 | * | ||
14 | * CONTACTS | ||
15 | * E-mail regarding any portion of the Linux UDF file system should be | ||
16 | * directed to the development team's mailing list (run by majordomo): | ||
17 | * linux_udf@hpesjro.fc.hp.com | ||
18 | * | ||
19 | * COPYRIGHT | ||
20 | * This file is distributed under the terms of the GNU General Public | ||
21 | * License (GPL). Copies of the GPL can be obtained from: | ||
22 | * ftp://prep.ai.mit.edu/pub/gnu/GPL | ||
23 | * Each contributing author retains all rights to their own work. | ||
24 | */ | ||
25 | |||
26 | #include "udfdecl.h" | ||
27 | |||
28 | #include <linux/kernel.h> | ||
29 | #include <linux/string.h> /* for memset */ | ||
30 | #include <linux/nls.h> | ||
31 | #include <linux/udf_fs.h> | ||
32 | |||
33 | #include "udf_sb.h" | ||
34 | |||
35 | static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int); | ||
36 | |||
37 | static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) | ||
38 | { | ||
39 | if ( (!dest) || (!src) || (!strlen) || (strlen > UDF_NAME_LEN-2) ) | ||
40 | return 0; | ||
41 | memset(dest, 0, sizeof(struct ustr)); | ||
42 | memcpy(dest->u_name, src, strlen); | ||
43 | dest->u_cmpID = 0x08; | ||
44 | dest->u_len = strlen; | ||
45 | return strlen; | ||
46 | } | ||
47 | |||
48 | /* | ||
49 | * udf_build_ustr | ||
50 | */ | ||
51 | int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) | ||
52 | { | ||
53 | int usesize; | ||
54 | |||
55 | if ( (!dest) || (!ptr) || (!size) ) | ||
56 | return -1; | ||
57 | |||
58 | memset(dest, 0, sizeof(struct ustr)); | ||
59 | usesize= (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size; | ||
60 | dest->u_cmpID=ptr[0]; | ||
61 | dest->u_len=ptr[size-1]; | ||
62 | memcpy(dest->u_name, ptr+1, usesize-1); | ||
63 | return 0; | ||
64 | } | ||
65 | |||
66 | /* | ||
67 | * udf_build_ustr_exact | ||
68 | */ | ||
69 | static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) | ||
70 | { | ||
71 | if ( (!dest) || (!ptr) || (!exactsize) ) | ||
72 | return -1; | ||
73 | |||
74 | memset(dest, 0, sizeof(struct ustr)); | ||
75 | dest->u_cmpID=ptr[0]; | ||
76 | dest->u_len=exactsize-1; | ||
77 | memcpy(dest->u_name, ptr+1, exactsize-1); | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * udf_ocu_to_utf8 | ||
83 | * | ||
84 | * PURPOSE | ||
85 | * Convert OSTA Compressed Unicode to the UTF-8 equivalent. | ||
86 | * | ||
87 | * DESCRIPTION | ||
88 | * This routine is only called by udf_filldir(). | ||
89 | * | ||
90 | * PRE-CONDITIONS | ||
91 | * utf Pointer to UTF-8 output buffer. | ||
92 | * ocu Pointer to OSTA Compressed Unicode input buffer | ||
93 | * of size UDF_NAME_LEN bytes. | ||
94 | * both of type "struct ustr *" | ||
95 | * | ||
96 | * POST-CONDITIONS | ||
97 | * <return> Zero on success. | ||
98 | * | ||
99 | * HISTORY | ||
100 | * November 12, 1997 - Andrew E. Mileski | ||
101 | * Written, tested, and released. | ||
102 | */ | ||
103 | int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i) | ||
104 | { | ||
105 | uint8_t *ocu; | ||
106 | uint32_t c; | ||
107 | uint8_t cmp_id, ocu_len; | ||
108 | int i; | ||
109 | |||
110 | ocu = ocu_i->u_name; | ||
111 | |||
112 | ocu_len = ocu_i->u_len; | ||
113 | cmp_id = ocu_i->u_cmpID; | ||
114 | utf_o->u_len = 0; | ||
115 | |||
116 | if (ocu_len == 0) | ||
117 | { | ||
118 | memset(utf_o, 0, sizeof(struct ustr)); | ||
119 | utf_o->u_cmpID = 0; | ||
120 | utf_o->u_len = 0; | ||
121 | return 0; | ||
122 | } | ||
123 | |||
124 | if ((cmp_id != 8) && (cmp_id != 16)) | ||
125 | { | ||
126 | printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;) | ||
131 | { | ||
132 | |||
133 | /* Expand OSTA compressed Unicode to Unicode */ | ||
134 | c = ocu[i++]; | ||
135 | if (cmp_id == 16) | ||
136 | c = (c << 8) | ocu[i++]; | ||
137 | |||
138 | /* Compress Unicode to UTF-8 */ | ||
139 | if (c < 0x80U) | ||
140 | utf_o->u_name[utf_o->u_len++] = (uint8_t)c; | ||
141 | else if (c < 0x800U) | ||
142 | { | ||
143 | utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xc0 | (c >> 6)); | ||
144 | utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); | ||
145 | } | ||
146 | else | ||
147 | { | ||
148 | utf_o->u_name[utf_o->u_len++] = (uint8_t)(0xe0 | (c >> 12)); | ||
149 | utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | ((c >> 6) & 0x3f)); | ||
150 | utf_o->u_name[utf_o->u_len++] = (uint8_t)(0x80 | (c & 0x3f)); | ||
151 | } | ||
152 | } | ||
153 | utf_o->u_cmpID=8; | ||
154 | |||
155 | return utf_o->u_len; | ||
156 | } | ||
157 | |||
158 | /* | ||
159 | * | ||
160 | * udf_utf8_to_ocu | ||
161 | * | ||
162 | * PURPOSE | ||
163 | * Convert UTF-8 to the OSTA Compressed Unicode equivalent. | ||
164 | * | ||
165 | * DESCRIPTION | ||
166 | * This routine is only called by udf_lookup(). | ||
167 | * | ||
168 | * PRE-CONDITIONS | ||
169 | * ocu Pointer to OSTA Compressed Unicode output | ||
170 | * buffer of size UDF_NAME_LEN bytes. | ||
171 | * utf Pointer to UTF-8 input buffer. | ||
172 | * utf_len Length of UTF-8 input buffer in bytes. | ||
173 | * | ||
174 | * POST-CONDITIONS | ||
175 | * <return> Zero on success. | ||
176 | * | ||
177 | * HISTORY | ||
178 | * November 12, 1997 - Andrew E. Mileski | ||
179 | * Written, tested, and released. | ||
180 | */ | ||
181 | static int udf_UTF8toCS0(dstring *ocu, struct ustr *utf, int length) | ||
182 | { | ||
183 | unsigned c, i, max_val, utf_char; | ||
184 | int utf_cnt, u_len; | ||
185 | |||
186 | memset(ocu, 0, sizeof(dstring) * length); | ||
187 | ocu[0] = 8; | ||
188 | max_val = 0xffU; | ||
189 | |||
190 | try_again: | ||
191 | u_len = 0U; | ||
192 | utf_char = 0U; | ||
193 | utf_cnt = 0U; | ||
194 | for (i = 0U; i < utf->u_len; i++) | ||
195 | { | ||
196 | c = (uint8_t)utf->u_name[i]; | ||
197 | |||
198 | /* Complete a multi-byte UTF-8 character */ | ||
199 | if (utf_cnt) | ||
200 | { | ||
201 | utf_char = (utf_char << 6) | (c & 0x3fU); | ||
202 | if (--utf_cnt) | ||
203 | continue; | ||
204 | } | ||
205 | else | ||
206 | { | ||
207 | /* Check for a multi-byte UTF-8 character */ | ||
208 | if (c & 0x80U) | ||
209 | { | ||
210 | /* Start a multi-byte UTF-8 character */ | ||
211 | if ((c & 0xe0U) == 0xc0U) | ||
212 | { | ||
213 | utf_char = c & 0x1fU; | ||
214 | utf_cnt = 1; | ||
215 | } | ||
216 | else if ((c & 0xf0U) == 0xe0U) | ||
217 | { | ||
218 | utf_char = c & 0x0fU; | ||
219 | utf_cnt = 2; | ||
220 | } | ||
221 | else if ((c & 0xf8U) == 0xf0U) | ||
222 | { | ||
223 | utf_char = c & 0x07U; | ||
224 | utf_cnt = 3; | ||
225 | } | ||
226 | else if ((c & 0xfcU) == 0xf8U) | ||
227 | { | ||
228 | utf_char = c & 0x03U; | ||
229 | utf_cnt = 4; | ||
230 | } | ||
231 | else if ((c & 0xfeU) == 0xfcU) | ||
232 | { | ||
233 | utf_char = c & 0x01U; | ||
234 | utf_cnt = 5; | ||
235 | } | ||
236 | else | ||
237 | goto error_out; | ||
238 | continue; | ||
239 | } else | ||
240 | /* Single byte UTF-8 character (most common) */ | ||
241 | utf_char = c; | ||
242 | } | ||
243 | |||
244 | /* Choose no compression if necessary */ | ||
245 | if (utf_char > max_val) | ||
246 | { | ||
247 | if ( 0xffU == max_val ) | ||
248 | { | ||
249 | max_val = 0xffffU; | ||
250 | ocu[0] = (uint8_t)0x10U; | ||
251 | goto try_again; | ||
252 | } | ||
253 | goto error_out; | ||
254 | } | ||
255 | |||
256 | if (max_val == 0xffffU) | ||
257 | { | ||
258 | ocu[++u_len] = (uint8_t)(utf_char >> 8); | ||
259 | } | ||
260 | ocu[++u_len] = (uint8_t)(utf_char & 0xffU); | ||
261 | } | ||
262 | |||
263 | |||
264 | if (utf_cnt) | ||
265 | { | ||
266 | error_out: | ||
267 | ocu[++u_len] = '?'; | ||
268 | printk(KERN_DEBUG "udf: bad UTF-8 character\n"); | ||
269 | } | ||
270 | |||
271 | ocu[length - 1] = (uint8_t)u_len + 1; | ||
272 | return u_len + 1; | ||
273 | } | ||
274 | |||
275 | static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, struct ustr *ocu_i) | ||
276 | { | ||
277 | uint8_t *ocu; | ||
278 | uint32_t c; | ||
279 | uint8_t cmp_id, ocu_len; | ||
280 | int i; | ||
281 | |||
282 | ocu = ocu_i->u_name; | ||
283 | |||
284 | ocu_len = ocu_i->u_len; | ||
285 | cmp_id = ocu_i->u_cmpID; | ||
286 | utf_o->u_len = 0; | ||
287 | |||
288 | if (ocu_len == 0) | ||
289 | { | ||
290 | memset(utf_o, 0, sizeof(struct ustr)); | ||
291 | utf_o->u_cmpID = 0; | ||
292 | utf_o->u_len = 0; | ||
293 | return 0; | ||
294 | } | ||
295 | |||
296 | if ((cmp_id != 8) && (cmp_id != 16)) | ||
297 | { | ||
298 | printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); | ||
299 | return 0; | ||
300 | } | ||
301 | |||
302 | for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN-3)) ;) | ||
303 | { | ||
304 | /* Expand OSTA compressed Unicode to Unicode */ | ||
305 | c = ocu[i++]; | ||
306 | if (cmp_id == 16) | ||
307 | c = (c << 8) | ocu[i++]; | ||
308 | |||
309 | utf_o->u_len += nls->uni2char(c, &utf_o->u_name[utf_o->u_len], | ||
310 | UDF_NAME_LEN - utf_o->u_len); | ||
311 | } | ||
312 | utf_o->u_cmpID=8; | ||
313 | |||
314 | return utf_o->u_len; | ||
315 | } | ||
316 | |||
317 | static int udf_NLStoCS0(struct nls_table *nls, dstring *ocu, struct ustr *uni, int length) | ||
318 | { | ||
319 | unsigned len, i, max_val; | ||
320 | uint16_t uni_char; | ||
321 | int u_len; | ||
322 | |||
323 | memset(ocu, 0, sizeof(dstring) * length); | ||
324 | ocu[0] = 8; | ||
325 | max_val = 0xffU; | ||
326 | |||
327 | try_again: | ||
328 | u_len = 0U; | ||
329 | for (i = 0U; i < uni->u_len; i++) | ||
330 | { | ||
331 | len = nls->char2uni(&uni->u_name[i], uni->u_len-i, &uni_char); | ||
332 | if (len <= 0) | ||
333 | continue; | ||
334 | |||
335 | if (uni_char > max_val) | ||
336 | { | ||
337 | max_val = 0xffffU; | ||
338 | ocu[0] = (uint8_t)0x10U; | ||
339 | goto try_again; | ||
340 | } | ||
341 | |||
342 | if (max_val == 0xffffU) | ||
343 | ocu[++u_len] = (uint8_t)(uni_char >> 8); | ||
344 | ocu[++u_len] = (uint8_t)(uni_char & 0xffU); | ||
345 | i += len - 1; | ||
346 | } | ||
347 | |||
348 | ocu[length - 1] = (uint8_t)u_len + 1; | ||
349 | return u_len + 1; | ||
350 | } | ||
351 | |||
352 | int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, int flen) | ||
353 | { | ||
354 | struct ustr filename, unifilename; | ||
355 | int len; | ||
356 | |||
357 | if (udf_build_ustr_exact(&unifilename, sname, flen)) | ||
358 | { | ||
359 | return 0; | ||
360 | } | ||
361 | |||
362 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) | ||
363 | { | ||
364 | if (!udf_CS0toUTF8(&filename, &unifilename) ) | ||
365 | { | ||
366 | udf_debug("Failed in udf_get_filename: sname = %s\n", sname); | ||
367 | return 0; | ||
368 | } | ||
369 | } | ||
370 | else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) | ||
371 | { | ||
372 | if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, &filename, &unifilename) ) | ||
373 | { | ||
374 | udf_debug("Failed in udf_get_filename: sname = %s\n", sname); | ||
375 | return 0; | ||
376 | } | ||
377 | } | ||
378 | else | ||
379 | return 0; | ||
380 | |||
381 | if ((len = udf_translate_to_linux(dname, filename.u_name, filename.u_len, | ||
382 | unifilename.u_name, unifilename.u_len))) | ||
383 | { | ||
384 | return len; | ||
385 | } | ||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | int udf_put_filename(struct super_block *sb, const uint8_t *sname, uint8_t *dname, int flen) | ||
390 | { | ||
391 | struct ustr unifilename; | ||
392 | int namelen; | ||
393 | |||
394 | if ( !(udf_char_to_ustr(&unifilename, sname, flen)) ) | ||
395 | { | ||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) | ||
400 | { | ||
401 | if ( !(namelen = udf_UTF8toCS0(dname, &unifilename, UDF_NAME_LEN)) ) | ||
402 | { | ||
403 | return 0; | ||
404 | } | ||
405 | } | ||
406 | else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) | ||
407 | { | ||
408 | if ( !(namelen = udf_NLStoCS0(UDF_SB(sb)->s_nls_map, dname, &unifilename, UDF_NAME_LEN)) ) | ||
409 | { | ||
410 | return 0; | ||
411 | } | ||
412 | } | ||
413 | else | ||
414 | return 0; | ||
415 | |||
416 | return namelen; | ||
417 | } | ||
418 | |||
#define ILLEGAL_CHAR_MARK	'_'
#define EXT_MARK		'.'
#define CRC_MARK		'#'
#define EXT_SIZE		5

/* True for the two byte values replaced by ILLEGAL_CHAR_MARK. */
static int udf_is_illegal_char(uint8_t ch)
{
	return ch == '/' || ch == 0;
}

/*
 * udf_translate_to_linux
 *
 * PURPOSE
 *	Copy a converted name into newName, mangling it where needed.
 *
 * DESCRIPTION
 *	Runs of '/' and NUL bytes collapse into one ILLEGAL_CHAR_MARK.  The
 *	names "." and ".." are copied as they are but always mangled.  A
 *	mangled name gets CRC_MARK plus four upper-case hex digits of
 *	udf_crc(fidName, fidNameLen, 0), inserted before the extension
 *	when the name has one.  An extension is the text after the last
 *	EXT_MARK, provided it is 1..EXT_SIZE bytes long.
 *
 * PRE-CONDITIONS
 *	newName		Output buffer (not NUL-terminated by this routine).
 *	udfName		Converted name, udfLen bytes.
 *	udfLen		Length of udfName.
 *	fidName		Bytes the CRC is computed over.
 *	fidNameLen	Length of fidName.
 *
 * POST-CONDITIONS
 *	<return>	Number of bytes written to newName.
 */
static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, int udfLen, uint8_t *fidName, int fidNameLen)
{
	static const uint8_t hexDigits[] = "0123456789ABCDEF";
	int src, outLen = 0, mangle = 0;
	int extIndex = 0, newExtIndex = 0, hasExt = 0;
	uint8_t ch;

	if (udfName[0] == '.' &&
	    (udfLen == 1 || (udfLen == 2 && udfName[1] == '.')))
	{
		/* "." and ".." are kept verbatim but always get a CRC suffix. */
		mangle = 1;
		outLen = udfLen;
		memcpy(newName, udfName, udfLen);
	}
	else
	{
		src = 0;
		while (src < udfLen)
		{
			ch = udfName[src];
			if (udf_is_illegal_char(ch))
			{
				mangle = 1;
				ch = ILLEGAL_CHAR_MARK;
				/* Swallow the rest of a run of illegal bytes. */
				while (src + 1 < udfLen &&
				       udf_is_illegal_char(udfName[src + 1]))
					src++;
			}

			/* A dot close enough to the end starts an extension. */
			if (ch == EXT_MARK && (udfLen - src - 1) <= EXT_SIZE)
			{
				hasExt = (udfLen != src + 1);
				if (hasExt)
				{
					extIndex = src;
					newExtIndex = outLen;
				}
			}

			if (outLen < 256)
				newName[outLen++] = ch;
			else
				mangle = 1;
			src++;
		}
	}

	if (!mangle)
		return outLen;

	{
		uint8_t ext[EXT_SIZE];
		int extLen = 0;
		int shift;
		unsigned short valueCRC;

		if (hasExt)
		{
			const uint8_t *extSrc = udfName + extIndex + 1;
			int extAvail = udfLen - extIndex - 1;
			int maxFilenameLen;
			int k;

			/* Re-read the extension from the source, cleaning it up. */
			for (k = 0; k < EXT_SIZE && k < extAvail; k++)
			{
				ch = extSrc[k];
				if (udf_is_illegal_char(ch))
				{
					ch = ILLEGAL_CHAR_MARK;
					while (k + 1 < extAvail && k + 1 < EXT_SIZE &&
					       udf_is_illegal_char(extSrc[k + 1]))
						k++;
				}
				ext[extLen++] = ch;
			}

			/* Cut the base name at the dot, or earlier if too long. */
			maxFilenameLen = 250 - extLen;
			if (outLen > maxFilenameLen)
				outLen = maxFilenameLen;
			else
				outLen = newExtIndex;
		}
		else if (outLen > 250)
			outLen = 250;

		newName[outLen++] = CRC_MARK;
		valueCRC = udf_crc(fidName, fidNameLen, 0);
		/* Four hex digits, most significant nibble first. */
		for (shift = 12; shift >= 0; shift -= 4)
			newName[outLen++] = hexDigits[(valueCRC >> shift) & 0xf];

		if (hasExt)
		{
			newName[outLen++] = EXT_MARK;
			memcpy(newName + outLen, ext, extLen);
			outLen += extLen;
		}
	}
	return outLen;
}