author    Linus Torvalds <torvalds@linux-foundation.org>  2012-03-19 19:37:28 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-03-19 19:37:28 -0400
commit    b0e37d7ac6ba937c3776ff5111ff6a7fa832fb4f (patch)
tree      fdb86783c464825a77223e49cc24f632e319d2df /fs
parent    6d7d1a0dc735ea8412769edae7154885021107a9 (diff)
parent    bfcfaa77bdf0f775263e906015982a608df01c76 (diff)
Merge branch 'dcache-word-accesses'
* branch 'dcache-word-accesses':
  vfs: use 'unsigned long' accesses for dcache name comparison and hashing

This does the name hashing and lookup using word-sized accesses when
that is efficient, namely on x86 (although any little-endian machine
with good unaligned accesses would do).

It does very much depend on little-endian logic, but it's a very hot
couple of functions under some real loads, and this patch improves the
performance of __d_lookup_rcu() and link_path_walk() by up to about
30%, giving a 10% improvement on some very pathname-heavy benchmarks.

Because we do make unaligned accesses past the end of the filename, the
optimization is disabled when CONFIG_DEBUG_PAGEALLOC is active, and we
effectively depend on the fact that on x86 we don't really ever have
the last page of usable RAM followed immediately by any IO memory (due
to ACPI tables, BIOS buffer areas etc).

Some of the bit operations we do are a bit "subtle". It's commented,
but you do need to really think about the code. Or just consider it
black magic.

Thanks to people on G+ for some of the optimized bit tricks.
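The "black magic" is mostly the classic zero-byte detector: subtract 0x01
from every byte of a word and mask with the complement of the original,
and an 0x80 marker survives in each byte that was zero (spurious markers
can appear above the first zero byte, which is why only the lowest marker
is trusted). A minimal userspace sketch of that trick, assuming a 64-bit
little-endian host; the harness below is illustrative, not part of the patch:

    #include <stdio.h>

    #define ONEBYTES  0x0101010101010101ul
    #define HIGHBITS  0x8080808080808080ul

    /* 0x80 marker in each byte of 'a' that is zero (lowest marker is
     * the authoritative one) */
    static unsigned long has_zero(unsigned long a)
    {
            return ((a - ONEBYTES) & ~a) & HIGHBITS;
    }

    int main(void)
    {
            /* Little-endian load of "bin\0\0\0\0\0": 'b','i','n' in the
             * low three bytes, NUL in bytes 3..7 */
            unsigned long word = 0x006e6962;

            /* Markers appear in every NUL byte: prints 8080808080000000 */
            printf("%016lx\n", has_zero(word));
            return 0;
    }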
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig     4
-rw-r--r--  fs/dcache.c   23
-rw-r--r--  fs/namei.c   122
3 files changed, 149 insertions(+), 0 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index d621f02a3f9e..aa195265362f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -4,6 +4,10 @@
 
 menu "File systems"
 
+# Use unaligned word dcache accesses
+config DCACHE_WORD_ACCESS
+	bool
+
 if BLOCK
 
 source "fs/ext2/Kconfig"
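
DCACHE_WORD_ACCESS is a hidden bool with no prompt, so an architecture
opts in by selecting it. The diffstat above is limited to 'fs', so the
arch side is not shown here; per the commit message the optimization is
off under CONFIG_DEBUG_PAGEALLOC, so the x86 opt-in presumably looks
roughly like this (a sketch, not part of the diff shown):

    # arch/x86/Kconfig (sketch)
    config X86
            ...
            select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC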
diff --git a/fs/dcache.c b/fs/dcache.c
index 5f00a6f63c9e..11828de68dce 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -144,6 +144,28 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 static inline int dentry_cmp(const unsigned char *cs, size_t scount,
 				const unsigned char *ct, size_t tcount)
 {
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+	unsigned long a,b,mask;
+
+	if (unlikely(scount != tcount))
+		return 1;
+
+	for (;;) {
+		a = *(unsigned long *)cs;
+		b = *(unsigned long *)ct;
+		if (tcount < sizeof(unsigned long))
+			break;
+		if (unlikely(a != b))
+			return 1;
+		cs += sizeof(unsigned long);
+		ct += sizeof(unsigned long);
+		tcount -= sizeof(unsigned long);
+		if (!tcount)
+			return 0;
+	}
+	mask = ~(~0ul << tcount*8);
+	return unlikely(!!((a ^ b) & mask));
+#else
 	if (scount != tcount)
 		return 1;
 
@@ -155,6 +177,7 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount,
 		tcount--;
 	} while (tcount);
 	return 0;
+#endif
 }
 
 static void __d_free(struct rcu_head *head)
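
The tail comparison above deliberately reads whole words past the end of
the name and then masks the overshoot away: for tcount remaining bytes,
~(~0ul << tcount*8) keeps only the low tcount bytes, which on a
little-endian machine are exactly the first tcount characters in memory
order. A userspace sketch of that masked tail (memcpy stands in for the
kernel's raw word load, and the padded buffers make the over-read safe):

    #include <assert.h>
    #include <string.h>

    int main(void)
    {
            /* Two equal 3-byte names; simulate junk in memory past one. */
            char cs[8] = "foo", ct[8] = "foo";
            unsigned long a, b, mask;

            cs[5] = 'X';                    /* bytes beyond the name must not matter */
            memcpy(&a, cs, sizeof(a));
            memcpy(&b, ct, sizeof(b));

            mask = ~(~0ul << 3*8);          /* tcount = 3 -> 0x0000000000ffffff */
            assert((a ^ b) & ~mask);        /* the words differ, but only past the name */
            assert(!((a ^ b) & mask));      /* masked compare: the names are equal */
            return 0;
    }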
diff --git a/fs/namei.c b/fs/namei.c
index 46ea9cc16647..fa96a26d3291 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1374,6 +1374,126 @@ static inline int can_lookup(struct inode *inode)
 	return 1;
 }
 
+/*
+ * We can do the critical dentry name comparison and hashing
+ * operations one word at a time, but we are limited to:
+ *
+ * - Architectures with fast unaligned word accesses. We could
+ *   do a "get_unaligned()" if this helps and is sufficiently
+ *   fast.
+ *
+ * - Little-endian machines (so that we can generate the mask
+ *   of low bytes efficiently). Again, we *could* do a byte
+ *   swapping load on big-endian architectures if that is not
+ *   expensive enough to make the optimization worthless.
+ *
+ * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
+ *   do not trap on the (extremely unlikely) case of a
+ *   page-crossing operation).
+ *
+ * - Furthermore, we need an efficient 64-bit compile for the
+ *   64-bit case in order to generate the "number of bytes in
+ *   the final mask". Again, that could be replaced with an
+ *   efficient population count instruction or similar.
+ */
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+
+#ifdef CONFIG_64BIT
+
+/*
+ * Jan Achrenius on G+: microoptimized version of
+ * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
+ * that works for the bytemasks without having to
+ * mask them first.
+ */
+static inline long count_masked_bytes(unsigned long mask)
+{
+	return mask*0x0001020304050608 >> 56;
+}
+
+static inline unsigned int fold_hash(unsigned long hash)
+{
+	hash += hash >> (8*sizeof(int));
+	return hash;
+}
+
+#else	/* 32-bit case */
+
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
+static inline long count_masked_bytes(long mask)
+{
+	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+	long a = (0x0ff0001+mask) >> 23;
+	/* Fix the 1 for 00 case */
+	return a & mask;
+}
+
+#define fold_hash(x) (x)
+
+#endif
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+	unsigned long a, mask;
+	unsigned long hash = 0;
+
+	for (;;) {
+		a = *(unsigned long *)name;
+		hash *= 9;
+		if (len < sizeof(unsigned long))
+			break;
+		hash += a;
+		name += sizeof(unsigned long);
+		len -= sizeof(unsigned long);
+		if (!len)
+			goto done;
+	}
+	mask = ~(~0ul << len*8);
+	hash += mask & a;
+done:
+	return fold_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+#define ONEBYTES	0x0101010101010101ul
+#define SLASHBYTES	0x2f2f2f2f2f2f2f2ful
+#define HIGHBITS	0x8080808080808080ul
+
+/* Return the high bit set in the first byte that is a zero */
+static inline unsigned long has_zero(unsigned long a)
+{
+	return ((a - ONEBYTES) & ~a) & HIGHBITS;
+}
+
+/*
+ * Calculate the length and hash of the path component, and
+ * return the length of the component.
+ */
+static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+{
+	unsigned long a, mask, hash, len;
+
+	hash = a = 0;
+	len = -sizeof(unsigned long);
+	do {
+		hash = (hash + a) * 9;
+		len += sizeof(unsigned long);
+		a = *(unsigned long *)(name+len);
+		/* Do we have any NUL or '/' bytes in this word? */
+		mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
+	} while (!mask);
+
+	/* The mask *below* the first high bit set */
+	mask = (mask - 1) & ~mask;
+	mask >>= 7;
+	hash += a & mask;
+	*hashp = fold_hash(hash);
+
+	return len + count_masked_bytes(mask);
+}
+
+#else
+
 unsigned int full_name_hash(const unsigned char *name, unsigned int len)
 {
 	unsigned long hash = init_name_hash();
@@ -1402,6 +1522,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 	return len;
 }
 
+#endif
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
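
To see the namei.c pieces interact, here is a hedged userspace sketch of
hash_name() and the mask-to-length step, assuming a 64-bit little-endian
host. The helpers are copied from the patch; memcpy stands in for the raw
unaligned word load, and the padded buffer in main() keeps the over-read
safe outside the kernel:

    #include <stdio.h>
    #include <string.h>

    #define ONEBYTES	0x0101010101010101ul
    #define SLASHBYTES	0x2f2f2f2f2f2f2f2ful
    #define HIGHBITS	0x8080808080808080ul

    static unsigned long has_zero(unsigned long a)
    {
            return ((a - ONEBYTES) & ~a) & HIGHBITS;
    }

    /* 64-bit variant from the patch: maps a low-byte mask of N 0xff
     * bytes (N = 0..7) to N */
    static long count_masked_bytes(unsigned long mask)
    {
            return mask*0x0001020304050608 >> 56;
    }

    static unsigned int fold_hash(unsigned long hash)
    {
            hash += hash >> (8*sizeof(int));
            return hash;
    }

    /* hash_name() as in the patch, one word per iteration; the caller
     * must guarantee readable padding past the string */
    static unsigned long hash_name(const char *name, unsigned int *hashp)
    {
            unsigned long a, mask, hash, len;

            hash = a = 0;
            len = -sizeof(unsigned long);
            do {
                    hash = (hash + a) * 9;
                    len += sizeof(unsigned long);
                    memcpy(&a, name+len, sizeof(a));
                    /* any NUL or '/' bytes in this word? */
                    mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
            } while (!mask);

            mask = (mask - 1) & ~mask;      /* bits below the first 0x80 marker */
            mask >>= 7;                     /* -> 0xff per preceding byte */
            hash += a & mask;
            *hashp = fold_hash(hash);

            return len + count_masked_bytes(mask);
    }

    int main(void)
    {
            char path[16] = "usr/bin";      /* zero padding keeps the over-read safe */
            unsigned int hash;

            /* First component is "usr", so this prints len=3 */
            printf("len=%lu hash=%08x\n", hash_name(path, &hash), hash);
            return 0;
    }

For "usr/bin" the '/' in byte 3 produces the lowest 0x80 marker, the
derived byte mask is 0x0000000000ffffff, and count_masked_bytes() turns
that mask into the component length 3.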