aboutsummaryrefslogtreecommitdiffstats
path: root/arch/ia64/kernel
diff options
context:
space:
mode:
authorFenghua Yu <fenghua.yu@intel.com>2007-02-12 19:27:10 -0500
committerTony Luck <tony.luck@intel.com>2007-03-07 19:27:09 -0500
commit3bc207d2b72ea0e6927cccc653c2dc8be593f89f (patch)
tree6227ca004edf20809668ce0899e1835e075842bc /arch/ia64/kernel
parentddbad076303dfc0ed4fcba53907dc175bb6d67b2 (diff)
[IA64] fsys_getcpu for IA64
On 1.6GHz Montecito Tiger4, the following performance data is measured with the kernel built with defconfig, which has NUMA configured: Fastest sys_getcpu: 502 itc counts. Fastest fsys_getcpu: 28 itc counts. fsys_getcpu performance is largely impacted by whether data (node_to_cpu_map etc) is in cache. It can take fsys_getcpu up to ~150 itc counts in the cold-cache case. Signed-off-by: Fenghua Yu <fenghua.yu@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch/ia64/kernel')
-rw-r--r--arch/ia64/kernel/asm-offsets.c1
-rw-r--r--arch/ia64/kernel/fsys.S105
2 files changed, 106 insertions, 0 deletions
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index 75a2a2c12258..2236fabbb3c6 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -35,6 +35,7 @@ void foo(void)
35 BLANK(); 35 BLANK();
36 36
37 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); 37 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
38 DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
38 DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); 39 DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count));
39 40
40 BLANK(); 41 BLANK();
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index 7a05b1cb2ad5..8589e84a27c6 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -10,6 +10,8 @@
10 * probably broke it along the way... ;-) 10 * probably broke it along the way... ;-)
11 * 13-Jul-04 clameter Implement fsys_clock_gettime and revise fsys_gettimeofday to make 11 * 13-Jul-04 clameter Implement fsys_clock_gettime and revise fsys_gettimeofday to make
12 * it capable of using memory based clocks without falling back to C code. 12 * it capable of using memory based clocks without falling back to C code.
13 * 08-Feb-07 Fenghua Yu Implement fsys_getcpu.
14 *
13 */ 15 */
14 16
15#include <asm/asmmacro.h> 17#include <asm/asmmacro.h>
@@ -505,6 +507,59 @@ EX(.fail_efault, (p15) st8 [r34]=r3)
505#endif 507#endif
506END(fsys_rt_sigprocmask) 508END(fsys_rt_sigprocmask)
507 509
510/*
511 * fsys_getcpu doesn't use the third parameter in this implementation. It reads
512 * current_thread_info()->cpu and corresponding node in cpu_to_node_map.
513 */
514ENTRY(fsys_getcpu) // light-weight getcpu: writes the cpu number to [r32] and the node to [r33], then returns 0 in r8
515 .prologue
516 .altrp b6
517 .body
518 ;;
519 add r2=TI_FLAGS+IA64_TASK_SIZE,r16 // r2 = address of thread_info->flags, at offset IA64_TASK_SIZE+TI_FLAGS from r16 (presumably the current task pointer -- confirm)
520 tnat.nz p6,p0 = r32 // guard against NaT argument
521 add r3=TI_CPU+IA64_TASK_SIZE,r16 // r3 = address of thread_info->cpu, computed the same way
522 ;;
523 ld4 r3=[r3] // M r3 = thread_info->cpu
524 ld4 r2=[r2] // M r2 = thread_info->flags
525(p6) br.cond.spnt.few .fail_einval // B first argument (r32) is NaT -> .fail_einval
526 ;;
527 tnat.nz p7,p0 = r33 // I guard against NaT argument
528(p7) br.cond.spnt.few .fail_einval // B second argument (r33) is NaT -> .fail_einval
529#ifdef CONFIG_NUMA
530 movl r17=cpu_to_node_map // r17 = base address of cpu_to_node_map
531 ;;
532EX(.fail_efault, probe.w.fault r32, 3) // M This takes 5 cycles; write-probe of [r32], presumably EX() routes a fault to .fail_efault -- confirm
533EX(.fail_efault, probe.w.fault r33, 3) // M This takes 5 cycles; same write-probe for [r33]
534 shladd r18=r3,1,r17 // r18 = r17 + (cpu << 1): the table is indexed with 2-byte entries
535 ;;
536 ld2 r20=[r18] // r20 = cpu_to_node_map[cpu]
537 and r2 = TIF_ALLWORK_MASK,r2 // keep only the TIF_ALLWORK_MASK bits of the flags
538 ;;
539 cmp.ne p8,p0=0,r2 // p8 = at least one of those flag bits is set
540(p8) br.spnt.many fsys_fallback_syscall // flagged work present: leave the fast path before anything is stored
541 ;;
542 ;; // NOTE(review): second consecutive stop looks redundant -- confirm it is not needed for bundling
543EX(.fail_efault, st4 [r32] = r3) // 4-byte store of the cpu number through the first argument
544EX(.fail_efault, st2 [r33] = r20) // 2-byte store of the node through the second argument; NOTE(review): if the user-side node is a 32-bit unsigned, its upper 16 bits are left unwritten -- confirm
545 mov r8=0 // r8 = 0, presumably the success return value -- confirm against FSYS_RETURN
546 ;;
547#else
548EX(.fail_efault, probe.w.fault r32, 3) // M This takes 5 cycles; write-probe of [r32]
549EX(.fail_efault, probe.w.fault r33, 3) // M This takes 5 cycles; write-probe of [r33]
550 and r2 = TIF_ALLWORK_MASK,r2 // keep only the TIF_ALLWORK_MASK bits of the flags
551 ;;
552 cmp.ne p8,p0=0,r2 // p8 = at least one of those flag bits is set
553(p8) br.spnt.many fsys_fallback_syscall // flagged work present: leave the fast path before anything is stored
554 ;;
555EX(.fail_efault, st4 [r32] = r3) // 4-byte store of the cpu number through the first argument
556EX(.fail_efault, st2 [r33] = r0) // without CONFIG_NUMA the node written is r0 (the IA-64 always-zero register), i.e. node 0
557 mov r8=0 // r8 = 0, presumably the success return value -- confirm against FSYS_RETURN
558 ;;
559#endif
560 FSYS_RETURN
561END(fsys_getcpu)
562
508ENTRY(fsys_fallback_syscall) 563ENTRY(fsys_fallback_syscall)
509 .prologue 564 .prologue
510 .altrp b6 565 .altrp b6
@@ -878,6 +933,56 @@ fsyscall_table:
878 data8 0 // timer_delete 933 data8 0 // timer_delete
879 data8 0 // clock_settime 934 data8 0 // clock_settime
880 data8 fsys_clock_gettime // clock_gettime 935 data8 fsys_clock_gettime // clock_gettime
936 data8 0 // clock_getres // 1255
937 data8 0 // clock_nanosleep
938 data8 0 // fstatfs64
939 data8 0 // statfs64
940 data8 0 // mbind
941 data8 0 // get_mempolicy // 1260
942 data8 0 // set_mempolicy
943 data8 0 // mq_open
944 data8 0 // mq_unlink
945 data8 0 // mq_timedsend
946 data8 0 // mq_timedreceive // 1265
947 data8 0 // mq_notify
948 data8 0 // mq_getsetattr
949 data8 0 // kexec_load
950 data8 0 // vserver
951 data8 0 // waitid // 1270
952 data8 0 // add_key
953 data8 0 // request_key
954 data8 0 // keyctl
955 data8 0 // ioprio_set
956 data8 0 // ioprio_get // 1275
957 data8 0 // move_pages
958 data8 0 // inotify_init
959 data8 0 // inotify_add_watch
960 data8 0 // inotify_rm_watch
961 data8 0 // migrate_pages // 1280
962 data8 0 // openat
963 data8 0 // mkdirat
964 data8 0 // mknodat
965 data8 0 // fchownat
966 data8 0 // futimesat // 1285
967 data8 0 // newfstatat
968 data8 0 // unlinkat
969 data8 0 // renameat
970 data8 0 // linkat
971 data8 0 // symlinkat // 1290
972 data8 0 // readlinkat
973 data8 0 // fchmodat
974 data8 0 // faccessat
975 data8 0
976 data8 0 // 1295
977 data8 0 // unshare
978 data8 0 // splice
979 data8 0 // set_robust_list
980 data8 0 // get_robust_list
981 data8 0 // sync_file_range // 1300
982 data8 0 // tee
983 data8 0 // vmsplice
984 data8 0
985 data8 fsys_getcpu // getcpu // 1304
881 986
882 // fill in zeros for the remaining entries 987 // fill in zeros for the remaining entries
883 .zero: 988 .zero: