diff options
author | Fenghua Yu <fenghua.yu@intel.com> | 2007-02-12 19:27:10 -0500 |
---|---|---|
committer | Tony Luck <tony.luck@intel.com> | 2007-03-07 19:27:09 -0500 |
commit | 3bc207d2b72ea0e6927cccc653c2dc8be593f89f (patch) | |
tree | 6227ca004edf20809668ce0899e1835e075842bc /arch/ia64 | |
parent | ddbad076303dfc0ed4fcba53907dc175bb6d67b2 (diff) |
[IA64] fsys_getcpu for IA64
On 1.6GHz Montectio Tiger4, the following performance data is measured with
kernel built with defconfig which has NUMA configured:
Fastest sys_getcpu: 502 itc counts.
Fastest fsys_getcpu: 28 itc counts.
fsys_getcpu performance is largly impacted by whether data (node_to_cpu_map
etc) is in cache. It can take fsys_getcpu up to ~150 itc counts in cold
cache case.
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch/ia64')
-rw-r--r-- | arch/ia64/kernel/asm-offsets.c | 1 | ||||
-rw-r--r-- | arch/ia64/kernel/fsys.S | 105 |
2 files changed, 106 insertions, 0 deletions
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index 75a2a2c12258..2236fabbb3c6 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c | |||
@@ -35,6 +35,7 @@ void foo(void) | |||
35 | BLANK(); | 35 | BLANK(); |
36 | 36 | ||
37 | DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); | 37 | DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); |
38 | DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); | ||
38 | DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); | 39 | DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); |
39 | 40 | ||
40 | BLANK(); | 41 | BLANK(); |
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 7a05b1cb2ad5..8589e84a27c6 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S | |||
@@ -10,6 +10,8 @@ | |||
10 | * probably broke it along the way... ;-) | 10 | * probably broke it along the way... ;-) |
11 | * 13-Jul-04 clameter Implement fsys_clock_gettime and revise fsys_gettimeofday to make | 11 | * 13-Jul-04 clameter Implement fsys_clock_gettime and revise fsys_gettimeofday to make |
12 | * it capable of using memory based clocks without falling back to C code. | 12 | * it capable of using memory based clocks without falling back to C code. |
13 | * 08-Feb-07 Fenghua Yu Implement fsys_getcpu. | ||
14 | * | ||
13 | */ | 15 | */ |
14 | 16 | ||
15 | #include <asm/asmmacro.h> | 17 | #include <asm/asmmacro.h> |
@@ -505,6 +507,59 @@ EX(.fail_efault, (p15) st8 [r34]=r3) | |||
505 | #endif | 507 | #endif |
506 | END(fsys_rt_sigprocmask) | 508 | END(fsys_rt_sigprocmask) |
507 | 509 | ||
510 | /* | ||
511 | * fsys_getcpu doesn't use the third parameter in this implementation. It reads | ||
512 | * current_thread_info()->cpu and corresponding node in cpu_to_node_map. | ||
513 | */ | ||
514 | ENTRY(fsys_getcpu) | ||
515 | .prologue | ||
516 | .altrp b6 | ||
517 | .body | ||
518 | ;; | ||
519 | add r2=TI_FLAGS+IA64_TASK_SIZE,r16 | ||
520 | tnat.nz p6,p0 = r32 // guard against NaT argument | ||
521 | add r3=TI_CPU+IA64_TASK_SIZE,r16 | ||
522 | ;; | ||
523 | ld4 r3=[r3] // M r3 = thread_info->cpu | ||
524 | ld4 r2=[r2] // M r2 = thread_info->flags | ||
525 | (p6) br.cond.spnt.few .fail_einval // B | ||
526 | ;; | ||
527 | tnat.nz p7,p0 = r33 // I guard against NaT argument | ||
528 | (p7) br.cond.spnt.few .fail_einval // B | ||
529 | #ifdef CONFIG_NUMA | ||
530 | movl r17=cpu_to_node_map | ||
531 | ;; | ||
532 | EX(.fail_efault, probe.w.fault r32, 3) // M This takes 5 cycles | ||
533 | EX(.fail_efault, probe.w.fault r33, 3) // M This takes 5 cycles | ||
534 | shladd r18=r3,1,r17 | ||
535 | ;; | ||
536 | ld2 r20=[r18] // r20 = cpu_to_node_map[cpu] | ||
537 | and r2 = TIF_ALLWORK_MASK,r2 | ||
538 | ;; | ||
539 | cmp.ne p8,p0=0,r2 | ||
540 | (p8) br.spnt.many fsys_fallback_syscall | ||
541 | ;; | ||
542 | ;; | ||
543 | EX(.fail_efault, st4 [r32] = r3) | ||
544 | EX(.fail_efault, st2 [r33] = r20) | ||
545 | mov r8=0 | ||
546 | ;; | ||
547 | #else | ||
548 | EX(.fail_efault, probe.w.fault r32, 3) // M This takes 5 cycles | ||
549 | EX(.fail_efault, probe.w.fault r33, 3) // M This takes 5 cycles | ||
550 | and r2 = TIF_ALLWORK_MASK,r2 | ||
551 | ;; | ||
552 | cmp.ne p8,p0=0,r2 | ||
553 | (p8) br.spnt.many fsys_fallback_syscall | ||
554 | ;; | ||
555 | EX(.fail_efault, st4 [r32] = r3) | ||
556 | EX(.fail_efault, st2 [r33] = r0) | ||
557 | mov r8=0 | ||
558 | ;; | ||
559 | #endif | ||
560 | FSYS_RETURN | ||
561 | END(fsys_getcpu) | ||
562 | |||
508 | ENTRY(fsys_fallback_syscall) | 563 | ENTRY(fsys_fallback_syscall) |
509 | .prologue | 564 | .prologue |
510 | .altrp b6 | 565 | .altrp b6 |
@@ -878,6 +933,56 @@ fsyscall_table: | |||
878 | data8 0 // timer_delete | 933 | data8 0 // timer_delete |
879 | data8 0 // clock_settime | 934 | data8 0 // clock_settime |
880 | data8 fsys_clock_gettime // clock_gettime | 935 | data8 fsys_clock_gettime // clock_gettime |
936 | data8 0 // clock_getres // 1255 | ||
937 | data8 0 // clock_nanosleep | ||
938 | data8 0 // fstatfs64 | ||
939 | data8 0 // statfs64 | ||
940 | data8 0 // mbind | ||
941 | data8 0 // get_mempolicy // 1260 | ||
942 | data8 0 // set_mempolicy | ||
943 | data8 0 // mq_open | ||
944 | data8 0 // mq_unlink | ||
945 | data8 0 // mq_timedsend | ||
946 | data8 0 // mq_timedreceive // 1265 | ||
947 | data8 0 // mq_notify | ||
948 | data8 0 // mq_getsetattr | ||
949 | data8 0 // kexec_load | ||
950 | data8 0 // vserver | ||
951 | data8 0 // waitid // 1270 | ||
952 | data8 0 // add_key | ||
953 | data8 0 // request_key | ||
954 | data8 0 // keyctl | ||
955 | data8 0 // ioprio_set | ||
956 | data8 0 // ioprio_get // 1275 | ||
957 | data8 0 // move_pages | ||
958 | data8 0 // inotify_init | ||
959 | data8 0 // inotify_add_watch | ||
960 | data8 0 // inotify_rm_watch | ||
961 | data8 0 // migrate_pages // 1280 | ||
962 | data8 0 // openat | ||
963 | data8 0 // mkdirat | ||
964 | data8 0 // mknodat | ||
965 | data8 0 // fchownat | ||
966 | data8 0 // futimesat // 1285 | ||
967 | data8 0 // newfstatat | ||
968 | data8 0 // unlinkat | ||
969 | data8 0 // renameat | ||
970 | data8 0 // linkat | ||
971 | data8 0 // symlinkat // 1290 | ||
972 | data8 0 // readlinkat | ||
973 | data8 0 // fchmodat | ||
974 | data8 0 // faccessat | ||
975 | data8 0 | ||
976 | data8 0 // 1295 | ||
977 | data8 0 // unshare | ||
978 | data8 0 // splice | ||
979 | data8 0 // set_robust_list | ||
980 | data8 0 // get_robust_list | ||
981 | data8 0 // sync_file_range // 1300 | ||
982 | data8 0 // tee | ||
983 | data8 0 // vmsplice | ||
984 | data8 0 | ||
985 | data8 fsys_getcpu // getcpu // 1304 | ||
881 | 986 | ||
882 | // fill in zeros for the remaining entries | 987 | // fill in zeros for the remaining entries |
883 | .zero: | 988 | .zero: |