unicode: implement higher level API for string handling

This patch integrates the utf8n patches with some higher level API to perform UTF-8 string comparison, normalization and casefolding operations. Implemented is a variation of NFD, and casefold is performed by doing full casefold on top of NFD. These algorithms are based on the core implemented by Olaf Weber from SGI. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
author: Gabriel Krisman Bertazi <krisman@collabora.co.uk> 2019-04-25 13:51:22 -0400
committer: Theodore Ts'o <tytso@mit.edu> 2019-04-25 13:51:22 -0400
commit: 9d53690f0d4e5686e80f034ea584b7a822b356d3 (patch)
tree: c123dcc3f2e193f584acc2560a1358d546e66810
parent: a8384c68797ee022f5fd7bcef5f4cc57863d4042 (diff)
5 files changed, 227 insertions, 1 deletions
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index 16d43d180416..bfb0360687df 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_UNICODE) += utf8-norm.o
+obj-$(CONFIG_UNICODE) += unicode.o
+unicode-y := utf8-norm.o utf8-core.o
 # This rule is not invoked during the kernel compilation.  It is used to
 # regenerate the utf8data.h header file.
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
new file mode 100644
index 000000000000..6afab4fdce90
--- /dev/null
+++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+#include "utf8n.h"
+/*
+ * utf8_validate - check that a string is accepted by utf8nlen()
+ * @um:  unicode map; only um->version is read, to select the data table
+ * @str: string to check (str->len bytes starting at str->name)
+ *
+ * Returns 0 when utf8nlen() reports a non-negative length for @str and
+ * -1 when it reports an error.
+ */
+int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+{
+        const struct utf8data *table = utf8nfdi(um->version);
+
+        return utf8nlen(table, str->name, str->len) < 0 ? -1 : 0;
+}
+EXPORT_SYMBOL(utf8_validate);
+/*
+ * utf8_strncmp - compare two strings through the utf8nfdi() data table
+ * @um: unicode map; um->version selects the data table
+ * @s1: first string
+ * @s2: second string
+ *
+ * Both strings are read one byte at a time through utf8byte() and the
+ * two byte streams are compared.
+ *
+ * Returns 0 when the streams match up to and including the terminating
+ * 0 byte, 1 as soon as they differ (no ordering is implied), and -EINVAL
+ * when a cursor cannot be set up or utf8byte() reports an error.
+ */
+int utf8_strncmp(const struct unicode_map *um,
+                 const struct qstr *s1, const struct qstr *s2)
+{
+        const struct utf8data *table = utf8nfdi(um->version);
+        struct utf8cursor lhs, rhs;
+
+        if (utf8ncursor(&lhs, table, s1->name, s1->len) < 0)
+                return -EINVAL;
+        if (utf8ncursor(&rhs, table, s2->name, s2->len) < 0)
+                return -EINVAL;
+
+        for (;;) {
+                int a = utf8byte(&lhs);
+                int b = utf8byte(&rhs);
+
+                if (a < 0 || b < 0)
+                        return -EINVAL;
+                if (a != b)
+                        return 1;
+                /* Equal and zero: both streams ended together. */
+                if (!a)
+                        return 0;
+        }
+}
+EXPORT_SYMBOL(utf8_strncmp);
+/*
+ * utf8_strncasecmp - compare two strings through the utf8nfdicf() data table
+ * @um: unicode map; um->version selects the data table
+ * @s1: first string
+ * @s2: second string
+ *
+ * Same byte-stream comparison as utf8_strncmp(), but the cursors are set
+ * up with the table returned by utf8nfdicf() (the casefolding table,
+ * going by the commit description).
+ *
+ * Returns 0 when the streams match up to and including the terminating
+ * 0 byte, 1 as soon as they differ (no ordering is implied), and -EINVAL
+ * when a cursor cannot be set up or utf8byte() reports an error.
+ */
+int utf8_strncasecmp(const struct unicode_map *um,
+                     const struct qstr *s1, const struct qstr *s2)
+{
+        const struct utf8data *table = utf8nfdicf(um->version);
+        struct utf8cursor lhs, rhs;
+
+        if (utf8ncursor(&lhs, table, s1->name, s1->len) < 0)
+                return -EINVAL;
+        if (utf8ncursor(&rhs, table, s2->name, s2->len) < 0)
+                return -EINVAL;
+
+        for (;;) {
+                int a = utf8byte(&lhs);
+                int b = utf8byte(&rhs);
+
+                if (a < 0 || b < 0)
+                        return -EINVAL;
+                if (a != b)
+                        return 1;
+                /* Equal and zero: both streams ended together. */
+                if (!a)
+                        return 0;
+        }
+}
+EXPORT_SYMBOL(utf8_strncasecmp);
+/*
+ * utf8_casefold - write the utf8nfdicf() form of a string into a buffer
+ * @um:   unicode map; um->version selects the data table
+ * @str:  input string
+ * @dest: output buffer
+ * @dlen: size of @dest in bytes
+ *
+ * Bytes produced by utf8byte() are copied to @dest, including the
+ * terminating 0 byte.
+ *
+ * Returns the number of bytes written before the terminating 0 byte, or
+ * -EINVAL when the cursor cannot be set up, utf8byte() reports an error,
+ * or @dest is too small to hold the result and its terminator.
+ */
+int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+                  unsigned char *dest, size_t dlen)
+{
+        const struct utf8data *data = utf8nfdicf(um->version);
+        struct utf8cursor cur;
+        size_t nlen;
+
+        if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+                return -EINVAL;
+
+        for (nlen = 0; nlen < dlen; nlen++) {
+                int c = utf8byte(&cur);
+
+                /* Check first: an error code must not be stored as output. */
+                if (c < 0)
+                        return -EINVAL;
+                dest[nlen] = c;
+                if (!c)
+                        return nlen;
+        }
+        /* @dest filled up before the terminating 0 byte was produced. */
+        return -EINVAL;
+}
+EXPORT_SYMBOL(utf8_casefold);
+/*
+ * utf8_normalize - write the utf8nfdi() form of a string into a buffer
+ * @um:   unicode map; um->version selects the data table
+ * @str:  input string
+ * @dest: output buffer
+ * @dlen: size of @dest in bytes
+ *
+ * Bytes produced by utf8byte() are copied to @dest, including the
+ * terminating 0 byte.
+ *
+ * Returns the number of bytes written before the terminating 0 byte, or
+ * -EINVAL when the cursor cannot be set up, utf8byte() reports an error,
+ * or @dest is too small to hold the result and its terminator.
+ */
+int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+                   unsigned char *dest, size_t dlen)
+{
+        const struct utf8data *data = utf8nfdi(um->version);
+        struct utf8cursor cur;
+        size_t nlen;    /* same type as @dlen, so the loop bound is not mixed-sign */
+
+        if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+                return -EINVAL;
+
+        for (nlen = 0; nlen < dlen; nlen++) {
+                int c = utf8byte(&cur);
+
+                /* Check first: an error code must not be stored as output. */
+                if (c < 0)
+                        return -EINVAL;
+                dest[nlen] = c;
+                if (!c)
+                        return nlen;
+        }
+        /* @dest filled up before the terminating 0 byte was produced. */
+        return -EINVAL;
+}
+EXPORT_SYMBOL(utf8_normalize);
+/*
+ * utf8_parse_version - split a "maj.min.rev" string into its components
+ * @version: NUL-terminated version string
+ * @maj:     receives the first component
+ * @min:     receives the second component
+ * @rev:     receives the third component
+ *
+ * The outputs are written only when the whole string parses.
+ *
+ * Returns 0 on success, or -EINVAL when @version does not fit the local
+ * buffer, does not match "%d.%d.%d", or has a component that is negative
+ * or cannot be converted.
+ */
+static int utf8_parse_version(const char *version, unsigned int *maj,
+                              unsigned int *min, unsigned int *rev)
+{
+        substring_t args[3];
+        char version_string[12];
+        int parsed[3];
+        int i;
+        static const struct match_token token[] = {
+                {1, "%d.%d.%d"},
+                {0, NULL}
+        };
+
+        /*
+         * strscpy() always NUL-terminates and reports truncation, unlike
+         * strncpy(), which leaves the buffer unterminated for long input.
+         */
+        if (strscpy(version_string, version, sizeof(version_string)) < 0)
+                return -EINVAL;
+
+        if (match_token(version_string, token, args) != 1)
+                return -EINVAL;
+
+        /* match_int() stores through an int *, so parse into ints first. */
+        for (i = 0; i < 3; i++) {
+                if (match_int(&args[i], &parsed[i]) || parsed[i] < 0)
+                        return -EINVAL;
+        }
+
+        *maj = parsed[0];
+        *min = parsed[1];
+        *rev = parsed[2];
+        return 0;
+}
+/*
+ * utf8_load - allocate a unicode_map for a given Unicode version
+ * @version: "maj.min.rev" string, or NULL to use utf8version_latest()
+ *
+ * When @version is NULL a warning is logged and the value returned by
+ * utf8version_latest() is used.
+ *
+ * Returns a kzalloc()ed map with charset "UTF-8" and the selected
+ * version (release it with utf8_unload()), ERR_PTR(-EINVAL) when
+ * @version cannot be parsed or is not supported, or ERR_PTR(-ENOMEM)
+ * when the allocation fails.
+ */
+struct unicode_map *utf8_load(const char *version)
+{
+        struct unicode_map *um;
+        int unicode_version;
+
+        if (version) {
+                unsigned int maj, min, rev;
+
+                if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+                        return ERR_PTR(-EINVAL);
+
+                /*
+                 * utf8version_is_supported() takes u8 arguments; reject
+                 * components that would be silently truncated.
+                 */
+                if (maj > 0xff || min > 0xff || rev > 0xff)
+                        return ERR_PTR(-EINVAL);
+
+                if (!utf8version_is_supported(maj, min, rev))
+                        return ERR_PTR(-EINVAL);
+
+                unicode_version = UNICODE_AGE(maj, min, rev);
+        } else {
+                unicode_version = utf8version_latest();
+                /* The trailing newline terminates the log record. */
+                printk(KERN_WARNING "UTF-8 version not specified. "
+                       "Assuming latest supported version (%d.%d.%d).\n",
+                       (unicode_version >> 16) & 0xff,
+                       (unicode_version >> 8) & 0xff,
+                       (unicode_version & 0xff));
+        }
+
+        um = kzalloc(sizeof(*um), GFP_KERNEL);
+        if (!um)
+                return ERR_PTR(-ENOMEM);
+
+        um->charset = "UTF-8";
+        um->version = unicode_version;
+        return um;
+}
+EXPORT_SYMBOL(utf8_load);
+/*
+ * utf8_unload - release a map obtained from utf8_load()
+ * @um: map to free; passed straight to kfree()
+ */
+void utf8_unload(struct unicode_map *um)
+{
+        kfree(um);
+}
+EXPORT_SYMBOL(utf8_unload);
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
index 848b93e97f50..20d440c3f2db 100644
--- a/fs/unicode/utf8-norm.c
+++ b/fs/unicode/utf8-norm.c
@@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
 }
 EXPORT_SYMBOL(utf8version_is_supported);
+/*
+ * utf8version_latest - return the value of utf8vers
+ *
+ * utf8vers is defined outside this hunk; presumably it is the highest
+ * Unicode version covered by the data tables - confirm at its definition.
+ */
+int utf8version_latest(void)
+{
+        return utf8vers;
+}
+EXPORT_SYMBOL(utf8version_latest);
 /*
 * UTF-8 valid ranges.
 *
diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h
index b63a9091dc39..a120638014c1 100644
--- a/fs/unicode/utf8n.h
+++ b/fs/unicode/utf8n.h
@@ -32,6 +32,7 @@
 /* Highest unicode version supported by the data tables. */
 extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
+/* Version used by utf8_load() when the caller does not name one. */
+extern int utf8version_latest(void);
 /*
 * Look for the correct const struct utf8data for a unicode version.
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
new file mode 100644
index 000000000000..aec2c6d800aa
--- /dev/null
+++ b/include/linux/unicode.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_UNICODE_H
+#define _LINUX_UNICODE_H
+#include <linux/init.h>
+#include <linux/dcache.h>
+/*
+ * Handle returned by utf8_load() and passed to the utf8_* helpers, which
+ * read ->version to select their data table.
+ */
+struct unicode_map {
+        const char *charset;    /* utf8_load() sets this to "UTF-8" */
+        int version;            /* UNICODE_AGE(maj, min, rev) or utf8version_latest() */
+};
+/* Returns 0 when @str is accepted, -1 otherwise. */
+int utf8_validate(const struct unicode_map *um, const struct qstr *str);
+/* Returns 0 on match, 1 on mismatch (no ordering), -EINVAL on error. */
+int utf8_strncmp(const struct unicode_map *um,
+                 const struct qstr *s1, const struct qstr *s2);
+/* As utf8_strncmp(), but using the utf8nfdicf() data table. */
+int utf8_strncasecmp(const struct unicode_map *um,
+                 const struct qstr *s1, const struct qstr *s2);
+/* Returns the number of bytes written to @dest, or -EINVAL. */
+int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+                   unsigned char *dest, size_t dlen);
+/* Returns the number of bytes written to @dest, or -EINVAL. */
+int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+                  unsigned char *dest, size_t dlen);
+/* Returns a new map or an ERR_PTR(); release it with utf8_unload(). */
+struct unicode_map *utf8_load(const char *version);
+void utf8_unload(struct unicode_map *um);
+#endif /* _LINUX_UNICODE_H */
author	Gabriel Krisman Bertazi <krisman@collabora.co.uk>	2019-04-25 13:51:22 -0400
committer	Theodore Ts'o <tytso@mit.edu>	2019-04-25 13:51:22 -0400
commit	9d53690f0d4e5686e80f034ea584b7a822b356d3 (patch)
tree	c123dcc3f2e193f584acc2560a1358d546e66810
parent	a8384c68797ee022f5fd7bcef5f4cc57863d4042 (diff)

diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index 16d43d180416..bfb0360687df 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile
@@ -1,6 +1,8 @@
1	# SPDX-License-Identifier: GPL-2.0	1	# SPDX-License-Identifier: GPL-2.0
2		2
3	obj-$(CONFIG_UNICODE) += utf8-norm.o	3	obj-$(CONFIG_UNICODE) += unicode.o
		4
		5	unicode-y := utf8-norm.o utf8-core.o
4		6
5	# This rule is not invoked during the kernel compilation. It is used to	7	# This rule is not invoked during the kernel compilation. It is used to
6	# regenerate the utf8data.h header file.	8	# regenerate the utf8data.h header file.


diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c new file mode 100644 index 000000000000..6afab4fdce90 --- /dev/null +++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,187 @@
		1	/* SPDX-License-Identifier: GPL-2.0 */
		2	#include <linux/module.h>
		3	#include <linux/kernel.h>
		4	#include <linux/string.h>
		5	#include <linux/slab.h>
		6	#include <linux/parser.h>
		7	#include <linux/errno.h>
		8	#include <linux/unicode.h>
		9
		10	#include "utf8n.h"
		11
		12	int utf8_validate(const struct unicode_map *um, const struct qstr *str)
		13	{
		14	const struct utf8data *data = utf8nfdi(um->version);
		15
		16	if (utf8nlen(data, str->name, str->len) < 0)
		17	return -1;
		18	return 0;
		19	}
		20	EXPORT_SYMBOL(utf8_validate);
		21
		22	int utf8_strncmp(const struct unicode_map *um,
		23	const struct qstr *s1, const struct qstr *s2)
		24	{
		25	const struct utf8data *data = utf8nfdi(um->version);
		26	struct utf8cursor cur1, cur2;
		27	int c1, c2;
		28
		29	if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
		30	return -EINVAL;
		31
		32	if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
		33	return -EINVAL;
		34
		35	do {
		36	c1 = utf8byte(&cur1);
		37	c2 = utf8byte(&cur2);
		38
		39	if (c1 < 0 || c2 < 0)
		40	return -EINVAL;
		41	if (c1 != c2)
		42	return 1;
		43	} while (c1);
		44
		45	return 0;
		46	}
		47	EXPORT_SYMBOL(utf8_strncmp);
		48
		49	int utf8_strncasecmp(const struct unicode_map *um,
		50	const struct qstr *s1, const struct qstr *s2)
		51	{
		52	const struct utf8data *data = utf8nfdicf(um->version);
		53	struct utf8cursor cur1, cur2;
		54	int c1, c2;
		55
		56	if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
		57	return -EINVAL;
		58
		59	if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
		60	return -EINVAL;
		61
		62	do {
		63	c1 = utf8byte(&cur1);
		64	c2 = utf8byte(&cur2);
		65
		66	if (c1 < 0 || c2 < 0)
		67	return -EINVAL;
		68	if (c1 != c2)
		69	return 1;
		70	} while (c1);
		71
		72	return 0;
		73	}
		74	EXPORT_SYMBOL(utf8_strncasecmp);
		75
		76	int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
		77	unsigned char *dest, size_t dlen)
		78	{
		79	const struct utf8data *data = utf8nfdicf(um->version);
		80	struct utf8cursor cur;
		81	size_t nlen = 0;
		82
		83	if (utf8ncursor(&cur, data, str->name, str->len) < 0)
		84	return -EINVAL;
		85
		86	for (nlen = 0; nlen < dlen; nlen++) {
		87	int c = utf8byte(&cur);
		88
		89	dest[nlen] = c;
		90	if (!c)
		91	return nlen;
		92	if (c == -1)
		93	break;
		94	}
		95	return -EINVAL;
		96	}
		97
		98	EXPORT_SYMBOL(utf8_casefold);
		99
		100	int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
		101	unsigned char *dest, size_t dlen)
		102	{
		103	const struct utf8data *data = utf8nfdi(um->version);
		104	struct utf8cursor cur;
		105	ssize_t nlen = 0;
		106
		107	if (utf8ncursor(&cur, data, str->name, str->len) < 0)
		108	return -EINVAL;
		109
		110	for (nlen = 0; nlen < dlen; nlen++) {
		111	int c = utf8byte(&cur);
		112
		113	dest[nlen] = c;
		114	if (!c)
		115	return nlen;
		116	if (c == -1)
		117	break;
		118	}
		119	return -EINVAL;
		120	}
		121
		122	EXPORT_SYMBOL(utf8_normalize);
		123
		124	static int utf8_parse_version(const char *version, unsigned int *maj,
		125	unsigned int *min, unsigned int *rev)
		126	{
		127	substring_t args[3];
		128	char version_string[12];
		129	const struct match_token token[] = {
		130	{1, "%d.%d.%d"},
		131	{0, NULL}
		132	};
		133
		134	strncpy(version_string, version, sizeof(version_string));
		135
		136	if (match_token(version_string, token, args) != 1)
		137	return -EINVAL;
		138
		139	if (match_int(&args[0], maj) || match_int(&args[1], min) ||
		140	match_int(&args[2], rev))
		141	return -EINVAL;
		142
		143	return 0;
		144	}
		145
		146	struct unicode_map *utf8_load(const char *version)
		147	{
		148	struct unicode_map *um = NULL;
		149	int unicode_version;
		150
		151	if (version) {
		152	unsigned int maj, min, rev;
		153
		154	if (utf8_parse_version(version, &maj, &min, &rev) < 0)
		155	return ERR_PTR(-EINVAL);
		156
		157	if (!utf8version_is_supported(maj, min, rev))
		158	return ERR_PTR(-EINVAL);
		159
		160	unicode_version = UNICODE_AGE(maj, min, rev);
		161	} else {
		162	unicode_version = utf8version_latest();
		163	printk(KERN_WARNING"UTF-8 version not specified. "
		164	"Assuming latest supported version (%d.%d.%d).",
		165	(unicode_version >> 16) & 0xff,
		166	(unicode_version >> 8) & 0xff,
		167	(unicode_version & 0xff));
		168	}
		169
		170	um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
		171	if (!um)
		172	return ERR_PTR(-ENOMEM);
		173
		174	um->charset = "UTF-8";
		175	um->version = unicode_version;
		176
		177	return um;
		178	}
		179	EXPORT_SYMBOL(utf8_load);
		180
		181	void utf8_unload(struct unicode_map *um)
		182	{
		183	kfree(um);
		184	}
		185	EXPORT_SYMBOL(utf8_unload);
		186
		187	MODULE_LICENSE("GPL v2");


diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c index 848b93e97f50..20d440c3f2db 100644 --- a/fs/unicode/utf8-norm.c +++ b/fs/unicode/utf8-norm.c
@@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
38	}	38	}
39	EXPORT_SYMBOL(utf8version_is_supported);	39	EXPORT_SYMBOL(utf8version_is_supported);
40		40
		41	int utf8version_latest(void)
		42	{
		43	return utf8vers;
		44	}
		45	EXPORT_SYMBOL(utf8version_latest);
		46
41	/*	47	/*
42	* UTF-8 valid ranges.	48	* UTF-8 valid ranges.
43	*	49	*


diff --git a/fs/unicode/utf8n.h b/fs/unicode/utf8n.h index b63a9091dc39..a120638014c1 100644 --- a/fs/unicode/utf8n.h +++ b/fs/unicode/utf8n.h
@@ -32,6 +32,7 @@
32		32
33	/* Highest unicode version supported by the data tables. */	33	/* Highest unicode version supported by the data tables. */
34	extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);	34	extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
		35	extern int utf8version_latest(void);
35		36
36	/*	37	/*
37	* Look for the correct const struct utf8data for a unicode version.	38	* Look for the correct const struct utf8data for a unicode version.


diff --git a/include/linux/unicode.h b/include/linux/unicode.h new file mode 100644 index 000000000000..aec2c6d800aa --- /dev/null +++ b/include/linux/unicode.h
@@ -0,0 +1,30 @@
		1	/* SPDX-License-Identifier: GPL-2.0 */
		2	#ifndef _LINUX_UNICODE_H
		3	#define _LINUX_UNICODE_H
		4
		5	#include <linux/init.h>
		6	#include <linux/dcache.h>
		7
		8	struct unicode_map {
		9	const char *charset;
		10	int version;
		11	};
		12
		13	int utf8_validate(const struct unicode_map *um, const struct qstr *str);
		14
		15	int utf8_strncmp(const struct unicode_map *um,
		16	const struct qstr *s1, const struct qstr *s2);
		17
		18	int utf8_strncasecmp(const struct unicode_map *um,
		19	const struct qstr *s1, const struct qstr *s2);
		20
		21	int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
		22	unsigned char *dest, size_t dlen);
		23
		24	int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
		25	unsigned char *dest, size_t dlen);
		26
		27	struct unicode_map *utf8_load(const char *version);
		28	void utf8_unload(struct unicode_map *um);
		29
		30	#endif /* _LINUX_UNICODE_H */