diff options
| author | Gabriel Krisman Bertazi <krisman@collabora.co.uk> | 2019-04-25 13:51:22 -0400 |
|---|---|---|
| committer | Theodore Ts'o <tytso@mit.edu> | 2019-04-25 13:51:22 -0400 |
| commit | 9d53690f0d4e5686e80f034ea584b7a822b356d3 (patch) | |
| tree | c123dcc3f2e193f584acc2560a1358d546e66810 | |
| parent | a8384c68797ee022f5fd7bcef5f4cc57863d4042 (diff) | |
unicode: implement higher level API for string handling
This patch integrates the utf8n patches with some higher level API to
perform UTF-8 string comparison, normalization and casefolding
operations. Implemented is a variation of NFD, and casefold is
performed by doing full casefold on top of NFD. These algorithms are
based on the core implemented by Olaf Weber from SGI.
Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
| -rw-r--r-- | fs/unicode/Makefile | 4 | ||||
| -rw-r--r-- | fs/unicode/utf8-core.c | 187 | ||||
| -rw-r--r-- | fs/unicode/utf8-norm.c | 6 | ||||
| -rw-r--r-- | fs/unicode/utf8n.h | 1 | ||||
| -rw-r--r-- | include/linux/unicode.h | 30 |
5 files changed, 227 insertions, 1 deletions
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index 16d43d180416..bfb0360687df 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile | |||
| @@ -1,6 +1,8 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0 | 1 | # SPDX-License-Identifier: GPL-2.0 |
| 2 | 2 | ||
| 3 | obj-$(CONFIG_UNICODE) += utf8-norm.o | 3 | obj-$(CONFIG_UNICODE) += unicode.o |
| 4 | |||
| 5 | unicode-y := utf8-norm.o utf8-core.o | ||
| 4 | 6 | ||
| 5 | # This rule is not invoked during the kernel compilation. It is used to | 7 | # This rule is not invoked during the kernel compilation. It is used to |
| 6 | # regenerate the utf8data.h header file. | 8 | # regenerate the utf8data.h header file. |
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c new file mode 100644 index 000000000000..6afab4fdce90 --- /dev/null +++ b/fs/unicode/utf8-core.c | |||
| @@ -0,0 +1,187 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #include <linux/module.h> | ||
| 3 | #include <linux/kernel.h> | ||
| 4 | #include <linux/string.h> | ||
| 5 | #include <linux/slab.h> | ||
| 6 | #include <linux/parser.h> | ||
| 7 | #include <linux/errno.h> | ||
| 8 | #include <linux/unicode.h> | ||
| 9 | |||
| 10 | #include "utf8n.h" | ||
| 11 | |||
| 12 | int utf8_validate(const struct unicode_map *um, const struct qstr *str) | ||
| 13 | { | ||
| 14 | const struct utf8data *data = utf8nfdi(um->version); | ||
| 15 | |||
| 16 | if (utf8nlen(data, str->name, str->len) < 0) | ||
| 17 | return -1; | ||
| 18 | return 0; | ||
| 19 | } | ||
| 20 | EXPORT_SYMBOL(utf8_validate); | ||
| 21 | |||
| 22 | int utf8_strncmp(const struct unicode_map *um, | ||
| 23 | const struct qstr *s1, const struct qstr *s2) | ||
| 24 | { | ||
| 25 | const struct utf8data *data = utf8nfdi(um->version); | ||
| 26 | struct utf8cursor cur1, cur2; | ||
| 27 | int c1, c2; | ||
| 28 | |||
| 29 | if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) | ||
| 30 | return -EINVAL; | ||
| 31 | |||
| 32 | if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) | ||
| 33 | return -EINVAL; | ||
| 34 | |||
| 35 | do { | ||
| 36 | c1 = utf8byte(&cur1); | ||
| 37 | c2 = utf8byte(&cur2); | ||
| 38 | |||
| 39 | if (c1 < 0 || c2 < 0) | ||
| 40 | return -EINVAL; | ||
| 41 | if (c1 != c2) | ||
| 42 | return 1; | ||
| 43 | } while (c1); | ||
| 44 | |||
| 45 | return 0; | ||
| 46 | } | ||
| 47 | EXPORT_SYMBOL(utf8_strncmp); | ||
| 48 | |||
| 49 | int utf8_strncasecmp(const struct unicode_map *um, | ||
| 50 | const struct qstr *s1, const struct qstr *s2) | ||
| 51 | { | ||
| 52 | const struct utf8data *data = utf8nfdicf(um->version); | ||
| 53 | struct utf8cursor cur1, cur2; | ||
| 54 | int c1, c2; | ||
| 55 | |||
| 56 | if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0) | ||
| 57 | return -EINVAL; | ||
| 58 | |||
| 59 | if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0) | ||
| 60 | return -EINVAL; | ||
| 61 | |||
| 62 | do { | ||
| 63 | c1 = utf8byte(&cur1); | ||
| 64 | c2 = utf8byte(&cur2); | ||
| 65 | |||
| 66 | if (c1 < 0 || c2 < 0) | ||
| 67 | return -EINVAL; | ||
| 68 | if (c1 != c2) | ||
| 69 | return 1; | ||
| 70 | } while (c1); | ||
| 71 | |||
| 72 | return 0; | ||
| 73 | } | ||
| 74 | EXPORT_SYMBOL(utf8_strncasecmp); | ||
| 75 | |||
| 76 | int utf8_casefold(const struct unicode_map *um, const struct qstr *str, | ||
| 77 | unsigned char *dest, size_t dlen) | ||
| 78 | { | ||
| 79 | const struct utf8data *data = utf8nfdicf(um->version); | ||
| 80 | struct utf8cursor cur; | ||
| 81 | size_t nlen = 0; | ||
| 82 | |||
| 83 | if (utf8ncursor(&cur, data, str->name, str->len) < 0) | ||
| 84 | return -EINVAL; | ||
| 85 | |||
| 86 | for (nlen = 0; nlen < dlen; nlen++) { | ||
| 87 | int c = utf8byte(&cur); | ||
| 88 | |||
| 89 | dest[nlen] = c; | ||
| 90 | if (!c) | ||
| 91 | return nlen; | ||
| 92 | if (c == -1) | ||
| 93 | break; | ||
| 94 | } | ||
| 95 | return -EINVAL; | ||
| 96 | } | ||
| 97 | |||
| 98 | EXPORT_SYMBOL(utf8_casefold); | ||
| 99 | |||
| 100 | int utf8_normalize(const struct unicode_map *um, const struct qstr *str, | ||
| 101 | unsigned char *dest, size_t dlen) | ||
| 102 | { | ||
| 103 | const struct utf8data *data = utf8nfdi(um->version); | ||
| 104 | struct utf8cursor cur; | ||
| 105 | ssize_t nlen = 0; | ||
| 106 | |||
| 107 | if (utf8ncursor(&cur, data, str->name, str->len) < 0) | ||
| 108 | return -EINVAL; | ||
| 109 | |||
| 110 | for (nlen = 0; nlen < dlen; nlen++) { | ||
| 111 | int c = utf8byte(&cur); | ||
| 112 | |||
| 113 | dest[nlen] = c; | ||
| 114 | if (!c) | ||
| 115 | return nlen; | ||
| 116 | if (c == -1) | ||
| 117 | break; | ||
| 118 | } | ||
| 119 | return -EINVAL; | ||
| 120 | } | ||
| 121 | |||
| 122 | EXPORT_SYMBOL(utf8_normalize); | ||
| 123 | |||
| 124 | static int utf8_parse_version(const char *version, unsigned int *maj, | ||
| 125 | unsigned int *min, unsigned int *rev) | ||
| 126 | { | ||
| 127 | substring_t args[3]; | ||
| 128 | char version_string[12]; | ||
| 129 | const struct match_token token[] = { | ||
| 130 | {1, "%d.%d.%d"}, | ||
| 131 | {0, NULL} | ||
| 132 | }; | ||
| 133 | |||
| 134 | strncpy(version_string, version, sizeof(version_string)); | ||
| 135 | |||
| 136 | if (match_token(version_string, token, args) != 1) | ||
| 137 | return -EINVAL; | ||
| 138 | |||
| 139 | if (match_int(&args[0], maj) || match_int(&args[1], min) || | ||
| 140 | match_int(&args[2], rev)) | ||
| 141 | return -EINVAL; | ||
| 142 | |||
| 143 | return 0; | ||
| 144 | } | ||
| 145 | |||
| 146 | struct unicode_map *utf8_load(const char *version) | ||
| 147 | { | ||
| 148 | struct unicode_map *um = NULL; | ||
| 149 | int unicode_version; | ||
| 150 | |||
| 151 | if (version) { | ||
| 152 | unsigned int maj, min, rev; | ||
| 153 | |||
| 154 | if (utf8_parse_version(version, &maj, &min, &rev) < 0) | ||
| 155 | return ERR_PTR(-EINVAL); | ||
| 156 | |||
| 157 | if (!utf8version_is_supported(maj, min, rev)) | ||
| 158 | return ERR_PTR(-EINVAL); | ||
| 159 | |||
| 160 | unicode_version = UNICODE_AGE(maj, min, rev); | ||
| 161 | } else { | ||
| 162 | unicode_version = utf8version_latest(); | ||
| 163 | printk(KERN_WARNING"UTF-8 version not specified. " | ||
| 164 | "Assuming latest supported version (%d.%d.%d).", | ||
| 165 | (unicode_version >> 16) & 0xff, | ||
| 166 | (unicode_version >> 8) & 0xff, | ||
| 167 | (unicode_version & 0xff)); | ||
| 168 | } | ||
| 169 | |||
| 170 | um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL); | ||
| 171 | if (!um) | ||
| 172 | return ERR_PTR(-ENOMEM); | ||
| 173 | |||
| 174 | um->charset = "UTF-8"; | ||
| 175 | um->version = unicode_version; | ||
| 176 | |||
| 177 | return um; | ||
| 178 | } | ||
| 179 | EXPORT_SYMBOL(utf8_load); | ||
| 180 | |||
/**
 * utf8_unload - release a unicode map
 * @um: map to free; the pointer is handed straight to kfree()
 */
void utf8_unload(struct unicode_map *um)
{
	kfree(um);
}
EXPORT_SYMBOL(utf8_unload);
| 186 | |||
| 187 | MODULE_LICENSE("GPL v2"); | ||
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c index 848b93e97f50..20d440c3f2db 100644 --- a/fs/unicode/utf8-norm.c +++ b/fs/unicode/utf8-norm.c | |||
| @@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev) | |||
| 38 | } | 38 | } |
| 39 | EXPORT_SYMBOL(utf8version_is_supported); | 39 | EXPORT_SYMBOL(utf8version_is_supported); |
| 40 | 40 | ||
| 41 | int utf8version_latest(void) | ||
| 42 | { | ||
| 43 | return utf8vers; | ||
| 44 | } | ||
| 45 | EXPORT_SYMBOL(utf8version_latest); | ||
| 46 | |||
| 41 | /* | 47 | /* |
| 42 | * UTF-8 valid ranges. | 48 | * UTF-8 valid ranges. |
| 43 | * | 49 | * |
diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h index b63a9091dc39..a120638014c1 100644 --- a/fs/unicode/utf8n.h +++ b/fs/unicode/utf8n.h | |||
| @@ -32,6 +32,7 @@ | |||
| 32 | 32 | ||
| 33 | /* Highest unicode version supported by the data tables. */ | 33 | /* Highest unicode version supported by the data tables. */ |
| 34 | extern int utf8version_is_supported(u8 maj, u8 min, u8 rev); | 34 | extern int utf8version_is_supported(u8 maj, u8 min, u8 rev); |
| 35 | extern int utf8version_latest(void); | ||
| 35 | 36 | ||
| 36 | /* | 37 | /* |
| 37 | * Look for the correct const struct utf8data for a unicode version. | 38 | * Look for the correct const struct utf8data for a unicode version. |
diff --git a/include/linux/unicode.h b/include/linux/unicode.h new file mode 100644 index 000000000000..aec2c6d800aa --- /dev/null +++ b/include/linux/unicode.h | |||
| @@ -0,0 +1,30 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
| 2 | #ifndef _LINUX_UNICODE_H | ||
| 3 | #define _LINUX_UNICODE_H | ||
| 4 | |||
| 5 | #include <linux/init.h> | ||
| 6 | #include <linux/dcache.h> | ||
| 7 | |||
/* Describes the encoding and Unicode version used by the utf8_* helpers. */
struct unicode_map {
	/* Encoding name; utf8_load() points this at "UTF-8". */
	const char *charset;
	/* Unicode version code; the helpers use it to select their table. */
	int version;
};
| 12 | |||
| 13 | int utf8_validate(const struct unicode_map *um, const struct qstr *str); | ||
| 14 | |||
| 15 | int utf8_strncmp(const struct unicode_map *um, | ||
| 16 | const struct qstr *s1, const struct qstr *s2); | ||
| 17 | |||
| 18 | int utf8_strncasecmp(const struct unicode_map *um, | ||
| 19 | const struct qstr *s1, const struct qstr *s2); | ||
| 20 | |||
| 21 | int utf8_normalize(const struct unicode_map *um, const struct qstr *str, | ||
| 22 | unsigned char *dest, size_t dlen); | ||
| 23 | |||
| 24 | int utf8_casefold(const struct unicode_map *um, const struct qstr *str, | ||
| 25 | unsigned char *dest, size_t dlen); | ||
| 26 | |||
| 27 | struct unicode_map *utf8_load(const char *version); | ||
| 28 | void utf8_unload(struct unicode_map *um); | ||
| 29 | |||
| 30 | #endif /* _LINUX_UNICODE_H */ | ||
