aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGabriel Krisman Bertazi <krisman@collabora.co.uk>2019-04-25 13:51:22 -0400
committerTheodore Ts'o <tytso@mit.edu>2019-04-25 13:51:22 -0400
commit9d53690f0d4e5686e80f034ea584b7a822b356d3 (patch)
treec123dcc3f2e193f584acc2560a1358d546e66810
parenta8384c68797ee022f5fd7bcef5f4cc57863d4042 (diff)
unicode: implement higher level API for string handling
This patch builds a higher-level API on top of the utf8n patches to perform UTF-8 string comparison, normalization and casefolding operations. The normalization implemented is a variation of NFD, and casefolding is performed by applying full casefold on top of NFD. These algorithms are based on the core implemented by Olaf Weber from SGI. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
-rw-r--r--fs/unicode/Makefile4
-rw-r--r--fs/unicode/utf8-core.c187
-rw-r--r--fs/unicode/utf8-norm.c6
-rw-r--r--fs/unicode/utf8n.h1
-rw-r--r--include/linux/unicode.h30
5 files changed, 227 insertions, 1 deletions
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index 16d43d180416..bfb0360687df 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,6 +1,8 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2 2
3obj-$(CONFIG_UNICODE) += utf8-norm.o 3obj-$(CONFIG_UNICODE) += unicode.o
4
5unicode-y := utf8-norm.o utf8-core.o
4 6
5# This rule is not invoked during the kernel compilation. It is used to 7# This rule is not invoked during the kernel compilation. It is used to
6# regenerate the utf8data.h header file. 8# regenerate the utf8data.h header file.
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
new file mode 100644
index 000000000000..6afab4fdce90
--- /dev/null
+++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,187 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <linux/module.h>
3#include <linux/kernel.h>
4#include <linux/string.h>
5#include <linux/slab.h>
6#include <linux/parser.h>
7#include <linux/errno.h>
8#include <linux/unicode.h>
9
10#include "utf8n.h"
11
12int utf8_validate(const struct unicode_map *um, const struct qstr *str)
13{
14 const struct utf8data *data = utf8nfdi(um->version);
15
16 if (utf8nlen(data, str->name, str->len) < 0)
17 return -1;
18 return 0;
19}
20EXPORT_SYMBOL(utf8_validate);
21
22int utf8_strncmp(const struct unicode_map *um,
23 const struct qstr *s1, const struct qstr *s2)
24{
25 const struct utf8data *data = utf8nfdi(um->version);
26 struct utf8cursor cur1, cur2;
27 int c1, c2;
28
29 if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
30 return -EINVAL;
31
32 if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
33 return -EINVAL;
34
35 do {
36 c1 = utf8byte(&cur1);
37 c2 = utf8byte(&cur2);
38
39 if (c1 < 0 || c2 < 0)
40 return -EINVAL;
41 if (c1 != c2)
42 return 1;
43 } while (c1);
44
45 return 0;
46}
47EXPORT_SYMBOL(utf8_strncmp);
48
49int utf8_strncasecmp(const struct unicode_map *um,
50 const struct qstr *s1, const struct qstr *s2)
51{
52 const struct utf8data *data = utf8nfdicf(um->version);
53 struct utf8cursor cur1, cur2;
54 int c1, c2;
55
56 if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
57 return -EINVAL;
58
59 if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
60 return -EINVAL;
61
62 do {
63 c1 = utf8byte(&cur1);
64 c2 = utf8byte(&cur2);
65
66 if (c1 < 0 || c2 < 0)
67 return -EINVAL;
68 if (c1 != c2)
69 return 1;
70 } while (c1);
71
72 return 0;
73}
74EXPORT_SYMBOL(utf8_strncasecmp);
75
76int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
77 unsigned char *dest, size_t dlen)
78{
79 const struct utf8data *data = utf8nfdicf(um->version);
80 struct utf8cursor cur;
81 size_t nlen = 0;
82
83 if (utf8ncursor(&cur, data, str->name, str->len) < 0)
84 return -EINVAL;
85
86 for (nlen = 0; nlen < dlen; nlen++) {
87 int c = utf8byte(&cur);
88
89 dest[nlen] = c;
90 if (!c)
91 return nlen;
92 if (c == -1)
93 break;
94 }
95 return -EINVAL;
96}
97
98EXPORT_SYMBOL(utf8_casefold);
99
100int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
101 unsigned char *dest, size_t dlen)
102{
103 const struct utf8data *data = utf8nfdi(um->version);
104 struct utf8cursor cur;
105 ssize_t nlen = 0;
106
107 if (utf8ncursor(&cur, data, str->name, str->len) < 0)
108 return -EINVAL;
109
110 for (nlen = 0; nlen < dlen; nlen++) {
111 int c = utf8byte(&cur);
112
113 dest[nlen] = c;
114 if (!c)
115 return nlen;
116 if (c == -1)
117 break;
118 }
119 return -EINVAL;
120}
121
122EXPORT_SYMBOL(utf8_normalize);
123
124static int utf8_parse_version(const char *version, unsigned int *maj,
125 unsigned int *min, unsigned int *rev)
126{
127 substring_t args[3];
128 char version_string[12];
129 const struct match_token token[] = {
130 {1, "%d.%d.%d"},
131 {0, NULL}
132 };
133
134 strncpy(version_string, version, sizeof(version_string));
135
136 if (match_token(version_string, token, args) != 1)
137 return -EINVAL;
138
139 if (match_int(&args[0], maj) || match_int(&args[1], min) ||
140 match_int(&args[2], rev))
141 return -EINVAL;
142
143 return 0;
144}
145
146struct unicode_map *utf8_load(const char *version)
147{
148 struct unicode_map *um = NULL;
149 int unicode_version;
150
151 if (version) {
152 unsigned int maj, min, rev;
153
154 if (utf8_parse_version(version, &maj, &min, &rev) < 0)
155 return ERR_PTR(-EINVAL);
156
157 if (!utf8version_is_supported(maj, min, rev))
158 return ERR_PTR(-EINVAL);
159
160 unicode_version = UNICODE_AGE(maj, min, rev);
161 } else {
162 unicode_version = utf8version_latest();
163 printk(KERN_WARNING"UTF-8 version not specified. "
164 "Assuming latest supported version (%d.%d.%d).",
165 (unicode_version >> 16) & 0xff,
166 (unicode_version >> 8) & 0xff,
167 (unicode_version & 0xff));
168 }
169
170 um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
171 if (!um)
172 return ERR_PTR(-ENOMEM);
173
174 um->charset = "UTF-8";
175 um->version = unicode_version;
176
177 return um;
178}
179EXPORT_SYMBOL(utf8_load);
180
181void utf8_unload(struct unicode_map *um)
182{
183 kfree(um);
184}
185EXPORT_SYMBOL(utf8_unload);
186
187MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
index 848b93e97f50..20d440c3f2db 100644
--- a/fs/unicode/utf8-norm.c
+++ b/fs/unicode/utf8-norm.c
@@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
38} 38}
39EXPORT_SYMBOL(utf8version_is_supported); 39EXPORT_SYMBOL(utf8version_is_supported);
40 40
41int utf8version_latest(void)
42{
43 return utf8vers;
44}
45EXPORT_SYMBOL(utf8version_latest);
46
41/* 47/*
42 * UTF-8 valid ranges. 48 * UTF-8 valid ranges.
43 * 49 *
diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
index b63a9091dc39..a120638014c1 100644
--- a/fs/unicode/utf8n.h
+++ b/fs/unicode/utf8n.h
@@ -32,6 +32,7 @@
32 32
33/* Highest unicode version supported by the data tables. */ 33/* Highest unicode version supported by the data tables. */
34extern int utf8version_is_supported(u8 maj, u8 min, u8 rev); 34extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
35extern int utf8version_latest(void);
35 36
36/* 37/*
37 * Look for the correct const struct utf8data for a unicode version. 38 * Look for the correct const struct utf8data for a unicode version.
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
new file mode 100644
index 000000000000..aec2c6d800aa
--- /dev/null
+++ b/include/linux/unicode.h
@@ -0,0 +1,30 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_UNICODE_H
3#define _LINUX_UNICODE_H
4
5#include <linux/init.h>
6#include <linux/dcache.h>
7
/**
 * struct unicode_map - handle returned by utf8_load()
 * @charset: name of the encoding; utf8_load() sets it to "UTF-8"
 * @version: Unicode version used to select the data tables; utf8_load()
 *           stores either UNICODE_AGE(maj, min, rev) or the value of
 *           utf8version_latest()
 */
struct unicode_map {
	const char *charset;
	int version;
};
12
/* Returns 0 if @str is accepted for @um's Unicode version, -1 otherwise. */
int utf8_validate(const struct unicode_map *um, const struct qstr *str);

/*
 * Compare @s1 and @s2 after normalization.
 * Returns 0 if equal, 1 if different, -EINVAL on invalid input.
 */
int utf8_strncmp(const struct unicode_map *um,
		 const struct qstr *s1, const struct qstr *s2);

/*
 * Compare @s1 and @s2 after casefolding.
 * Returns 0 if equal, 1 if different, -EINVAL on invalid input.
 */
int utf8_strncasecmp(const struct unicode_map *um,
		     const struct qstr *s1, const struct qstr *s2);

/*
 * Write the normalized form of @str, including the terminating 0, into
 * @dest (at most @dlen bytes).  Returns the number of bytes stored before
 * the terminating 0, or -EINVAL on invalid input or if @dest is too small.
 */
int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
		   unsigned char *dest, size_t dlen);

/*
 * Write the casefolded form of @str, including the terminating 0, into
 * @dest (at most @dlen bytes).  Returns the number of bytes stored before
 * the terminating 0, or -EINVAL on invalid input or if @dest is too small.
 */
int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
		  unsigned char *dest, size_t dlen);

/*
 * Allocate a map for the "maj.min.rev" string @version, or for the latest
 * supported version when @version is NULL.  Returns an ERR_PTR() on failure.
 */
struct unicode_map *utf8_load(const char *version);

/* Free a map obtained from utf8_load(). */
void utf8_unload(struct unicode_map *um);
29
30#endif /* _LINUX_UNICODE_H */