aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Hiro Yoshioka <hyoshiok@miraclelinux.com> 2006-06-23 05:04:16 -0400
committer Linus Torvalds <torvalds@g5.osdl.org> 2006-06-23 10:42:56 -0400
commit c22ce143d15eb288543fe9873e1c5ac1c01b69a1 (patch)
tree dc7d457b8952fc50dfc90df659b35de4117c61fc
parent 7dbdf43cfa635ddc3701cc8d1eab07597cd731c0 (diff)
[PATCH] x86: cache pollution aware __copy_from_user_ll()
Use the x86 cache-bypassing copy instructions for copy_from_user().

Some performance data are

Total of GLOBAL_POWER_EVENTS (CPU cycle samples)

2.6.12.4.orig  1921587
2.6.12.4.nt    1599424
1599424/1921587=83.23% (16.77% reduction)

BSQ_CACHE_REFERENCE (L3 cache miss)

2.6.12.4.orig  57427
2.6.12.4.nt    20858
20858/57427=36.32% (63.7% reduction)

L3 cache miss reduction of __copy_from_user_ll

samples  %        app name  symbol name
37408    65.1412  vmlinux   __copy_from_user_ll
23        0.1103  vmlinux   __copy_user_zeroing_intel_nocache
23/37408=0.061% (99.94% reduction)

Top 5 of 2.6.12.4.nt

Counted GLOBAL_POWER_EVENTS events (time during which processor is not stopped) with a unit mask of 0x01 (mandatory) count 100000
samples  %        app name  symbol name
128392   8.0274   vmlinux   __copy_user_zeroing_intel_nocache
64206    4.0143   vmlinux   journal_add_journal_head
59746    3.7355   vmlinux   do_get_write_access
47674    2.9807   vmlinux   journal_put_journal_head
46021    2.8774   vmlinux   journal_dirty_metadata
pattern9-0-cpu4-0-09011728/summary.out

Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x3f (multiple flags) count 3000
samples  %        app name  symbol name
69755    4.2861   vmlinux   __copy_user_zeroing_intel_nocache
55685    3.4215   vmlinux   journal_add_journal_head
52371    3.2179   vmlinux   __find_get_block
45504    2.7960   vmlinux   journal_put_journal_head
36005    2.2123   vmlinux   journal_stop
pattern9-0-cpu4-0-09011744/summary.out

Counted BSQ_CACHE_REFERENCE events (cache references seen by the bus unit) with a unit mask of 0x200 (read 3rd level cache miss) count 3000
samples  %        app name  symbol name
1147     5.4994   vmlinux   journal_add_journal_head
881      4.2240   vmlinux   journal_dirty_data
872      4.1809   vmlinux   blk_rq_map_sg
734      3.5192   vmlinux   journal_commit_transaction
617      2.9582   vmlinux   radix_tree_delete
pattern9-0-cpu4-0-09011731/summary.out

iozone results are

original 2.6.12.4 CPU time = 207.768 sec
cache aware       CPU time = 184.783 sec
(three times run)
184.783/207.768=88.94% (11.06% reduction)

original:
pattern9-0-cpu4-0-08191720/iozone.out: CPU Utilization: Wall time 45.997 CPU time 64.527 CPU utilization 140.28 %
pattern9-0-cpu4-0-08191741/iozone.out: CPU Utilization: Wall time 46.878 CPU time 71.933 CPU utilization 153.45 %
pattern9-0-cpu4-0-08191743/iozone.out: CPU Utilization: Wall time 45.152 CPU time 71.308 CPU utilization 157.93 %

cache aware:
pattern9-0-cpu4-0-09011728/iozone.out: CPU Utilization: Wall time 44.842 CPU time 62.465 CPU utilization 139.30 %
pattern9-0-cpu4-0-09011731/iozone.out: CPU Utilization: Wall time 44.718 CPU time 59.273 CPU utilization 132.55 %
pattern9-0-cpu4-0-09011744/iozone.out: CPU Utilization: Wall time 44.367 CPU time 63.045 CPU utilization 142.10 %

Signed-off-by: Hiro Yoshioka <hyoshiok@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- arch/i386/lib/usercopy.c 137
-rw-r--r-- include/asm-i386/uaccess.h 33
-rw-r--r-- include/linux/uaccess.h 22
-rw-r--r-- mm/filemap.c 4
-rw-r--r-- mm/filemap.h 6
5 files changed, 189 insertions, 13 deletions
diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
index 4cf981d70f45..6979297ce278 100644
--- a/arch/i386/lib/usercopy.c
+++ b/arch/i386/lib/usercopy.c
@@ -425,15 +425,121 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
425 : "eax", "edx", "memory"); 425 : "eax", "edx", "memory");
426 return size; 426 return size;
427} 427}
428
429/*
430 * Non Temporal Hint version of __copy_user_zeroing_intel. It is cache aware.
431 * hyoshiok@miraclelinux.com
432 */
433
434static unsigned long __copy_user_zeroing_intel_nocache(void *to,
435 const void __user *from, unsigned long size)
436{
437 int d0, d1;
438
439 __asm__ __volatile__(
440 " .align 2,0x90\n"
441 "0: movl 32(%4), %%eax\n"
442 " cmpl $67, %0\n"
443 " jbe 2f\n"
444 "1: movl 64(%4), %%eax\n"
445 " .align 2,0x90\n"
446 "2: movl 0(%4), %%eax\n"
447 "21: movl 4(%4), %%edx\n"
448 " movnti %%eax, 0(%3)\n"
449 " movnti %%edx, 4(%3)\n"
450 "3: movl 8(%4), %%eax\n"
451 "31: movl 12(%4),%%edx\n"
452 " movnti %%eax, 8(%3)\n"
453 " movnti %%edx, 12(%3)\n"
454 "4: movl 16(%4), %%eax\n"
455 "41: movl 20(%4), %%edx\n"
456 " movnti %%eax, 16(%3)\n"
457 " movnti %%edx, 20(%3)\n"
458 "10: movl 24(%4), %%eax\n"
459 "51: movl 28(%4), %%edx\n"
460 " movnti %%eax, 24(%3)\n"
461 " movnti %%edx, 28(%3)\n"
462 "11: movl 32(%4), %%eax\n"
463 "61: movl 36(%4), %%edx\n"
464 " movnti %%eax, 32(%3)\n"
465 " movnti %%edx, 36(%3)\n"
466 "12: movl 40(%4), %%eax\n"
467 "71: movl 44(%4), %%edx\n"
468 " movnti %%eax, 40(%3)\n"
469 " movnti %%edx, 44(%3)\n"
470 "13: movl 48(%4), %%eax\n"
471 "81: movl 52(%4), %%edx\n"
472 " movnti %%eax, 48(%3)\n"
473 " movnti %%edx, 52(%3)\n"
474 "14: movl 56(%4), %%eax\n"
475 "91: movl 60(%4), %%edx\n"
476 " movnti %%eax, 56(%3)\n"
477 " movnti %%edx, 60(%3)\n"
478 " addl $-64, %0\n"
479 " addl $64, %4\n"
480 " addl $64, %3\n"
481 " cmpl $63, %0\n"
482 " ja 0b\n"
483 " sfence \n"
484 "5: movl %0, %%eax\n"
485 " shrl $2, %0\n"
486 " andl $3, %%eax\n"
487 " cld\n"
488 "6: rep; movsl\n"
489 " movl %%eax,%0\n"
490 "7: rep; movsb\n"
491 "8:\n"
492 ".section .fixup,\"ax\"\n"
493 "9: lea 0(%%eax,%0,4),%0\n"
494 "16: pushl %0\n"
495 " pushl %%eax\n"
496 " xorl %%eax,%%eax\n"
497 " rep; stosb\n"
498 " popl %%eax\n"
499 " popl %0\n"
500 " jmp 8b\n"
501 ".previous\n"
502 ".section __ex_table,\"a\"\n"
503 " .align 4\n"
504 " .long 0b,16b\n"
505 " .long 1b,16b\n"
506 " .long 2b,16b\n"
507 " .long 21b,16b\n"
508 " .long 3b,16b\n"
509 " .long 31b,16b\n"
510 " .long 4b,16b\n"
511 " .long 41b,16b\n"
512 " .long 10b,16b\n"
513 " .long 51b,16b\n"
514 " .long 11b,16b\n"
515 " .long 61b,16b\n"
516 " .long 12b,16b\n"
517 " .long 71b,16b\n"
518 " .long 13b,16b\n"
519 " .long 81b,16b\n"
520 " .long 14b,16b\n"
521 " .long 91b,16b\n"
522 " .long 6b,9b\n"
523 " .long 7b,16b\n"
524 ".previous"
525 : "=&c"(size), "=&D" (d0), "=&S" (d1)
526 : "1"(to), "2"(from), "0"(size)
527 : "eax", "edx", "memory");
528 return size;
529}
530
428#else 531#else
532
429/* 533/*
430 * Leave these declared but undefined. They should not be any references to 534 * Leave these declared but undefined. They should not be any references to
431 * them 535 * them
432 */ 536 */
433unsigned long 537unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
434__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size); 538 unsigned long size);
435unsigned long 539unsigned long __copy_user_intel(void __user *to, const void *from,
436__copy_user_intel(void __user *to, const void *from, unsigned long size); 540 unsigned long size);
541unsigned long __copy_user_zeroing_intel_nocache(void *to,
542 const void __user *from, unsigned long size);
437#endif /* CONFIG_X86_INTEL_USERCOPY */ 543#endif /* CONFIG_X86_INTEL_USERCOPY */
438 544
439/* Generic arbitrary sized copy. */ 545/* Generic arbitrary sized copy. */
@@ -515,8 +621,8 @@ do { \
515 : "memory"); \ 621 : "memory"); \
516} while (0) 622} while (0)
517 623
518 624unsigned long __copy_to_user_ll(void __user *to, const void *from,
519unsigned long __copy_to_user_ll(void __user *to, const void *from, unsigned long n) 625 unsigned long n)
520{ 626{
521 BUG_ON((long) n < 0); 627 BUG_ON((long) n < 0);
522#ifndef CONFIG_X86_WP_WORKS_OK 628#ifndef CONFIG_X86_WP_WORKS_OK
@@ -576,8 +682,8 @@ survive:
576} 682}
577EXPORT_SYMBOL(__copy_to_user_ll); 683EXPORT_SYMBOL(__copy_to_user_ll);
578 684
579unsigned long 685unsigned long __copy_from_user_ll(void *to, const void __user *from,
580__copy_from_user_ll(void *to, const void __user *from, unsigned long n) 686 unsigned long n)
581{ 687{
582 BUG_ON((long)n < 0); 688 BUG_ON((long)n < 0);
583 if (movsl_is_ok(to, from, n)) 689 if (movsl_is_ok(to, from, n))
@@ -588,6 +694,21 @@ __copy_from_user_ll(void *to, const void __user *from, unsigned long n)
588} 694}
589EXPORT_SYMBOL(__copy_from_user_ll); 695EXPORT_SYMBOL(__copy_from_user_ll);
590 696
697unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
698 unsigned long n)
699{
700 BUG_ON((long)n < 0);
701#ifdef CONFIG_X86_INTEL_USERCOPY
702 if ( n > 64 && cpu_has_xmm2)
703 n = __copy_user_zeroing_intel_nocache(to, from, n);
704 else
705 __copy_user_zeroing(to, from, n);
706#else
707 __copy_user_zeroing(to, from, n);
708#endif
709 return n;
710}
711
591/** 712/**
592 * copy_to_user: - Copy a block of data into user space. 713 * copy_to_user: - Copy a block of data into user space.
593 * @to: Destination address, in user space. 714 * @to: Destination address, in user space.
diff --git a/include/asm-i386/uaccess.h b/include/asm-i386/uaccess.h
index 1ec65523ea5e..82af28a943ab 100644
--- a/include/asm-i386/uaccess.h
+++ b/include/asm-i386/uaccess.h
@@ -390,6 +390,8 @@ unsigned long __must_check __copy_to_user_ll(void __user *to,
390 const void *from, unsigned long n); 390 const void *from, unsigned long n);
391unsigned long __must_check __copy_from_user_ll(void *to, 391unsigned long __must_check __copy_from_user_ll(void *to,
392 const void __user *from, unsigned long n); 392 const void __user *from, unsigned long n);
393unsigned long __must_check __copy_from_user_ll_nocache(void *to,
394 const void __user *from, unsigned long n);
393 395
394/* 396/*
395 * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault 397 * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
@@ -478,12 +480,43 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
478 return __copy_from_user_ll(to, from, n); 480 return __copy_from_user_ll(to, from, n);
479} 481}
480 482
483#define ARCH_HAS_NOCACHE_UACCESS
484
485static __always_inline unsigned long __copy_from_user_inatomic_nocache(void *to,
486 const void __user *from, unsigned long n)
487{
488 if (__builtin_constant_p(n)) {
489 unsigned long ret;
490
491 switch (n) {
492 case 1:
493 __get_user_size(*(u8 *)to, from, 1, ret, 1);
494 return ret;
495 case 2:
496 __get_user_size(*(u16 *)to, from, 2, ret, 2);
497 return ret;
498 case 4:
499 __get_user_size(*(u32 *)to, from, 4, ret, 4);
500 return ret;
501 }
502 }
503 return __copy_from_user_ll_nocache(to, from, n);
504}
505
481static __always_inline unsigned long 506static __always_inline unsigned long
482__copy_from_user(void *to, const void __user *from, unsigned long n) 507__copy_from_user(void *to, const void __user *from, unsigned long n)
483{ 508{
484 might_sleep(); 509 might_sleep();
485 return __copy_from_user_inatomic(to, from, n); 510 return __copy_from_user_inatomic(to, from, n);
486} 511}
512
513static __always_inline unsigned long
514__copy_from_user_nocache(void *to, const void __user *from, unsigned long n)
515{
516 might_sleep();
517 return __copy_from_user_inatomic_nocache(to, from, n);
518}
519
487unsigned long __must_check copy_to_user(void __user *to, 520unsigned long __must_check copy_to_user(void __user *to,
488 const void *from, unsigned long n); 521 const void *from, unsigned long n);
489unsigned long __must_check copy_from_user(void *to, 522unsigned long __must_check copy_from_user(void *to,
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
new file mode 100644
index 000000000000..391e7ed1eb3f
--- /dev/null
+++ b/include/linux/uaccess.h
@@ -0,0 +1,22 @@
1#ifndef __LINUX_UACCESS_H__
2#define __LINUX_UACCESS_H__
3
4#include <asm/uaccess.h>
5
6#ifndef ARCH_HAS_NOCACHE_UACCESS
7
8static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
9 const void __user *from, unsigned long n)
10{
11 return __copy_from_user_inatomic(to, from, n);
12}
13
14static inline unsigned long __copy_from_user_nocache(void *to,
15 const void __user *from, unsigned long n)
16{
17 return __copy_from_user(to, from, n);
18}
19
20#endif /* ARCH_HAS_NOCACHE_UACCESS */
21
22#endif /* __LINUX_UACCESS_H__ */
diff --git a/mm/filemap.c b/mm/filemap.c
index 368678c2d531..807a463fd5ed 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/uaccess.h>
17#include <linux/aio.h> 18#include <linux/aio.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
19#include <linux/kernel_stat.h> 20#include <linux/kernel_stat.h>
@@ -38,7 +39,6 @@
38 */ 39 */
39#include <linux/buffer_head.h> /* for generic_osync_inode */ 40#include <linux/buffer_head.h> /* for generic_osync_inode */
40 41
41#include <asm/uaccess.h>
42#include <asm/mman.h> 42#include <asm/mman.h>
43 43
44static ssize_t 44static ssize_t
@@ -1902,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
1902 int copy = min(bytes, iov->iov_len - base); 1902 int copy = min(bytes, iov->iov_len - base);
1903 1903
1904 base = 0; 1904 base = 0;
1905 left = __copy_from_user_inatomic(vaddr, buf, copy); 1905 left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
1906 copied += copy; 1906 copied += copy;
1907 bytes -= copy; 1907 bytes -= copy;
1908 vaddr += copy; 1908 vaddr += copy;
diff --git a/mm/filemap.h b/mm/filemap.h
index 13793ba0ce17..5683cde22055 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -13,7 +13,7 @@
13#include <linux/highmem.h> 13#include <linux/highmem.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/config.h> 15#include <linux/config.h>
16#include <asm/uaccess.h> 16#include <linux/uaccess.h>
17 17
18size_t 18size_t
19__filemap_copy_from_user_iovec(char *vaddr, 19__filemap_copy_from_user_iovec(char *vaddr,
@@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
34 int left; 34 int left;
35 35
36 kaddr = kmap_atomic(page, KM_USER0); 36 kaddr = kmap_atomic(page, KM_USER0);
37 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); 37 left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
38 kunmap_atomic(kaddr, KM_USER0); 38 kunmap_atomic(kaddr, KM_USER0);
39 39
40 if (left != 0) { 40 if (left != 0) {
41 /* Do it the slow way */ 41 /* Do it the slow way */
42 kaddr = kmap(page); 42 kaddr = kmap(page);
43 left = __copy_from_user(kaddr + offset, buf, bytes); 43 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
44 kunmap(page); 44 kunmap(page);
45 } 45 }
46 return bytes - left; 46 return bytes - left;