Diffstat (limited to 'kernel')
92 files changed, 14786 insertions, 3322 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f6ef00f4f90f..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -8,19 +8,30 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | |||
| 8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
| 9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
| 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
| 11 | hrtimer.o | 11 | hrtimer.o rwsem.o |
| 12 | 12 | ||
| 13 | obj-$(CONFIG_STACKTRACE) += stacktrace.o | ||
| 14 | obj-y += time/ | ||
| 13 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o | 15 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o |
| 16 | obj-$(CONFIG_LOCKDEP) += lockdep.o | ||
| 17 | ifeq ($(CONFIG_PROC_FS),y) | ||
| 18 | obj-$(CONFIG_LOCKDEP) += lockdep_proc.o | ||
| 19 | endif | ||
| 14 | obj-$(CONFIG_FUTEX) += futex.o | 20 | obj-$(CONFIG_FUTEX) += futex.o |
| 15 | ifeq ($(CONFIG_COMPAT),y) | 21 | ifeq ($(CONFIG_COMPAT),y) |
| 16 | obj-$(CONFIG_FUTEX) += futex_compat.o | 22 | obj-$(CONFIG_FUTEX) += futex_compat.o |
| 17 | endif | 23 | endif |
| 24 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | ||
| 25 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | ||
| 26 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | ||
| 18 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 27 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
| 19 | obj-$(CONFIG_SMP) += cpu.o spinlock.o | 28 | obj-$(CONFIG_SMP) += cpu.o spinlock.o |
| 20 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | 29 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o |
| 30 | obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | ||
| 21 | obj-$(CONFIG_UID16) += uid16.o | 31 | obj-$(CONFIG_UID16) += uid16.o |
| 22 | obj-$(CONFIG_MODULES) += module.o | 32 | obj-$(CONFIG_MODULES) += module.o |
| 23 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 33 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
| 34 | obj-$(CONFIG_STACK_UNWIND) += unwind.o | ||
| 24 | obj-$(CONFIG_PM) += power/ | 35 | obj-$(CONFIG_PM) += power/ |
| 25 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 36 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
| 26 | obj-$(CONFIG_KEXEC) += kexec.o | 37 | obj-$(CONFIG_KEXEC) += kexec.o |
| @@ -37,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | |||
| 37 | obj-$(CONFIG_SECCOMP) += seccomp.o | 48 | obj-$(CONFIG_SECCOMP) += seccomp.o |
| 38 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 49 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
| 39 | obj-$(CONFIG_RELAY) += relay.o | 50 | obj-$(CONFIG_RELAY) += relay.o |
| 51 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | ||
| 52 | obj-$(CONFIG_TASKSTATS) += taskstats.o | ||
| 40 | 53 | ||
| 41 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 54 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
| 42 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 55 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/acct.c b/kernel/acct.c
index 6802020e0ceb..f4330acead46 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
| @@ -43,7 +43,6 @@ | |||
| 43 | * a struct file opened for write. Fixed. 2/6/2000, AV. | 43 | * a struct file opened for write. Fixed. 2/6/2000, AV. |
| 44 | */ | 44 | */ |
| 45 | 45 | ||
| 46 | #include <linux/config.h> | ||
| 47 | #include <linux/mm.h> | 46 | #include <linux/mm.h> |
| 48 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
| 49 | #include <linux/acct.h> | 48 | #include <linux/acct.h> |
| @@ -75,7 +74,7 @@ int acct_parm[3] = {4, 2, 30}; | |||
| 75 | /* | 74 | /* |
| 76 | * External references and all of the globals. | 75 | * External references and all of the globals. |
| 77 | */ | 76 | */ |
| 78 | static void do_acct_process(long, struct file *); | 77 | static void do_acct_process(struct file *); |
| 79 | 78 | ||
| 80 | /* | 79 | /* |
| 81 | * This structure is used so that all the data protected by lock | 80 | * This structure is used so that all the data protected by lock |
| @@ -196,7 +195,7 @@ static void acct_file_reopen(struct file *file) | |||
| 196 | if (old_acct) { | 195 | if (old_acct) { |
| 197 | mnt_unpin(old_acct->f_vfsmnt); | 196 | mnt_unpin(old_acct->f_vfsmnt); |
| 198 | spin_unlock(&acct_globals.lock); | 197 | spin_unlock(&acct_globals.lock); |
| 199 | do_acct_process(0, old_acct); | 198 | do_acct_process(old_acct); |
| 200 | filp_close(old_acct, NULL); | 199 | filp_close(old_acct, NULL); |
| 201 | spin_lock(&acct_globals.lock); | 200 | spin_lock(&acct_globals.lock); |
| 202 | } | 201 | } |
| @@ -419,16 +418,15 @@ static u32 encode_float(u64 value) | |||
| 419 | /* | 418 | /* |
| 420 | * do_acct_process does all actual work. Caller holds the reference to file. | 419 | * do_acct_process does all actual work. Caller holds the reference to file. |
| 421 | */ | 420 | */ |
| 422 | static void do_acct_process(long exitcode, struct file *file) | 421 | static void do_acct_process(struct file *file) |
| 423 | { | 422 | { |
| 423 | struct pacct_struct *pacct = &current->signal->pacct; | ||
| 424 | acct_t ac; | 424 | acct_t ac; |
| 425 | mm_segment_t fs; | 425 | mm_segment_t fs; |
| 426 | unsigned long vsize; | ||
| 427 | unsigned long flim; | 426 | unsigned long flim; |
| 428 | u64 elapsed; | 427 | u64 elapsed; |
| 429 | u64 run_time; | 428 | u64 run_time; |
| 430 | struct timespec uptime; | 429 | struct timespec uptime; |
| 431 | unsigned long jiffies; | ||
| 432 | 430 | ||
| 433 | /* | 431 | /* |
| 434 | * First check to see if there is enough free_space to continue | 432 | * First check to see if there is enough free_space to continue |
| @@ -469,12 +467,6 @@ static void do_acct_process(long exitcode, struct file *file) | |||
| 469 | #endif | 467 | #endif |
| 470 | do_div(elapsed, AHZ); | 468 | do_div(elapsed, AHZ); |
| 471 | ac.ac_btime = xtime.tv_sec - elapsed; | 469 | ac.ac_btime = xtime.tv_sec - elapsed; |
| 472 | jiffies = cputime_to_jiffies(cputime_add(current->utime, | ||
| 473 | current->signal->utime)); | ||
| 474 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); | ||
| 475 | jiffies = cputime_to_jiffies(cputime_add(current->stime, | ||
| 476 | current->signal->stime)); | ||
| 477 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); | ||
| 478 | /* we really need to bite the bullet and change layout */ | 470 | /* we really need to bite the bullet and change layout */ |
| 479 | ac.ac_uid = current->uid; | 471 | ac.ac_uid = current->uid; |
| 480 | ac.ac_gid = current->gid; | 472 | ac.ac_gid = current->gid; |
| @@ -491,42 +483,27 @@ static void do_acct_process(long exitcode, struct file *file) | |||
| 491 | ac.ac_ppid = current->parent->tgid; | 483 | ac.ac_ppid = current->parent->tgid; |
| 492 | #endif | 484 | #endif |
| 493 | 485 | ||
| 494 | read_lock(&tasklist_lock); /* pin current->signal */ | 486 | mutex_lock(&tty_mutex); |
| 487 | /* FIXME: Whoever is responsible for current->signal locking needs | ||
| 488 | to use the same locking all over the kernel and document it */ | ||
| 489 | read_lock(&tasklist_lock); | ||
| 495 | ac.ac_tty = current->signal->tty ? | 490 | ac.ac_tty = current->signal->tty ? |
| 496 | old_encode_dev(tty_devnum(current->signal->tty)) : 0; | 491 | old_encode_dev(tty_devnum(current->signal->tty)) : 0; |
| 497 | read_unlock(&tasklist_lock); | 492 | read_unlock(&tasklist_lock); |
| 498 | 493 | mutex_unlock(&tty_mutex); | |
| 499 | ac.ac_flag = 0; | 494 | |
| 500 | if (current->flags & PF_FORKNOEXEC) | 495 | spin_lock_irq(&current->sighand->siglock); |
| 501 | ac.ac_flag |= AFORK; | 496 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); |
| 502 | if (current->flags & PF_SUPERPRIV) | 497 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); |
| 503 | ac.ac_flag |= ASU; | 498 | ac.ac_flag = pacct->ac_flag; |
| 504 | if (current->flags & PF_DUMPCORE) | 499 | ac.ac_mem = encode_comp_t(pacct->ac_mem); |
| 505 | ac.ac_flag |= ACORE; | 500 | ac.ac_minflt = encode_comp_t(pacct->ac_minflt); |
| 506 | if (current->flags & PF_SIGNALED) | 501 | ac.ac_majflt = encode_comp_t(pacct->ac_majflt); |
| 507 | ac.ac_flag |= AXSIG; | 502 | ac.ac_exitcode = pacct->ac_exitcode; |
| 508 | 503 | spin_unlock_irq(&current->sighand->siglock); |
| 509 | vsize = 0; | ||
| 510 | if (current->mm) { | ||
| 511 | struct vm_area_struct *vma; | ||
| 512 | down_read(&current->mm->mmap_sem); | ||
| 513 | vma = current->mm->mmap; | ||
| 514 | while (vma) { | ||
| 515 | vsize += vma->vm_end - vma->vm_start; | ||
| 516 | vma = vma->vm_next; | ||
| 517 | } | ||
| 518 | up_read(&current->mm->mmap_sem); | ||
| 519 | } | ||
| 520 | vsize = vsize / 1024; | ||
| 521 | ac.ac_mem = encode_comp_t(vsize); | ||
| 522 | ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ | 504 | ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ |
| 523 | ac.ac_rw = encode_comp_t(ac.ac_io / 1024); | 505 | ac.ac_rw = encode_comp_t(ac.ac_io / 1024); |
| 524 | ac.ac_minflt = encode_comp_t(current->signal->min_flt + | ||
| 525 | current->min_flt); | ||
| 526 | ac.ac_majflt = encode_comp_t(current->signal->maj_flt + | ||
| 527 | current->maj_flt); | ||
| 528 | ac.ac_swaps = encode_comp_t(0); | 506 | ac.ac_swaps = encode_comp_t(0); |
| 529 | ac.ac_exitcode = exitcode; | ||
| 530 | 507 | ||
| 531 | /* | 508 | /* |
| 532 | * Kernel segment override to datasegment and write it | 509 | * Kernel segment override to datasegment and write it |
| @@ -546,12 +523,64 @@ static void do_acct_process(long exitcode, struct file *file) | |||
| 546 | } | 523 | } |
| 547 | 524 | ||
| 548 | /** | 525 | /** |
| 526 | * acct_init_pacct - initialize a new pacct_struct | ||
| 527 | * @pacct: per-process accounting info struct to initialize | ||
| 528 | */ | ||
| 529 | void acct_init_pacct(struct pacct_struct *pacct) | ||
| 530 | { | ||
| 531 | memset(pacct, 0, sizeof(struct pacct_struct)); | ||
| 532 | pacct->ac_utime = pacct->ac_stime = cputime_zero; | ||
| 533 | } | ||
| 534 | |||
| 535 | /** | ||
| 536 | * acct_collect - collect accounting information into pacct_struct | ||
| 537 | * @exitcode: task exit code | ||
| 538 | * @group_dead: not 0, if this thread is the last one in the process. | ||
| 539 | */ | ||
| 540 | void acct_collect(long exitcode, int group_dead) | ||
| 541 | { | ||
| 542 | struct pacct_struct *pacct = &current->signal->pacct; | ||
| 543 | unsigned long vsize = 0; | ||
| 544 | |||
| 545 | if (group_dead && current->mm) { | ||
| 546 | struct vm_area_struct *vma; | ||
| 547 | down_read(&current->mm->mmap_sem); | ||
| 548 | vma = current->mm->mmap; | ||
| 549 | while (vma) { | ||
| 550 | vsize += vma->vm_end - vma->vm_start; | ||
| 551 | vma = vma->vm_next; | ||
| 552 | } | ||
| 553 | up_read(&current->mm->mmap_sem); | ||
| 554 | } | ||
| 555 | |||
| 556 | spin_lock_irq(&current->sighand->siglock); | ||
| 557 | if (group_dead) | ||
| 558 | pacct->ac_mem = vsize / 1024; | ||
| 559 | if (thread_group_leader(current)) { | ||
| 560 | pacct->ac_exitcode = exitcode; | ||
| 561 | if (current->flags & PF_FORKNOEXEC) | ||
| 562 | pacct->ac_flag |= AFORK; | ||
| 563 | } | ||
| 564 | if (current->flags & PF_SUPERPRIV) | ||
| 565 | pacct->ac_flag |= ASU; | ||
| 566 | if (current->flags & PF_DUMPCORE) | ||
| 567 | pacct->ac_flag |= ACORE; | ||
| 568 | if (current->flags & PF_SIGNALED) | ||
| 569 | pacct->ac_flag |= AXSIG; | ||
| 570 | pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); | ||
| 571 | pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); | ||
| 572 | pacct->ac_minflt += current->min_flt; | ||
| 573 | pacct->ac_majflt += current->maj_flt; | ||
| 574 | spin_unlock_irq(&current->sighand->siglock); | ||
| 575 | } | ||
| 576 | |||
| 577 | /** | ||
| 549 | * acct_process - now just a wrapper around do_acct_process | 578 | * acct_process - now just a wrapper around do_acct_process |
| 550 | * @exitcode: task exit code | 579 | * @exitcode: task exit code |
| 551 | * | 580 | * |
| 552 | * handles process accounting for an exiting task | 581 | * handles process accounting for an exiting task |
| 553 | */ | 582 | */ |
| 554 | void acct_process(long exitcode) | 583 | void acct_process(void) |
| 555 | { | 584 | { |
| 556 | struct file *file = NULL; | 585 | struct file *file = NULL; |
| 557 | 586 | ||
| @@ -570,7 +599,7 @@ void acct_process(long exitcode) | |||
| 570 | get_file(file); | 599 | get_file(file); |
| 571 | spin_unlock(&acct_globals.lock); | 600 | spin_unlock(&acct_globals.lock); |
| 572 | 601 | ||
| 573 | do_acct_process(exitcode, file); | 602 | do_acct_process(file); |
| 574 | fput(file); | 603 | fput(file); |
| 575 | } | 604 | } |
| 576 | 605 | ||
| @@ -599,9 +628,7 @@ void acct_update_integrals(struct task_struct *tsk) | |||
| 599 | */ | 628 | */ |
| 600 | void acct_clear_integrals(struct task_struct *tsk) | 629 | void acct_clear_integrals(struct task_struct *tsk) |
| 601 | { | 630 | { |
| 602 | if (tsk) { | 631 | tsk->acct_stimexpd = 0; |
| 603 | tsk->acct_stimexpd = 0; | 632 | tsk->acct_rss_mem1 = 0; |
| 604 | tsk->acct_rss_mem1 = 0; | 633 | tsk->acct_vm_mem1 = 0; |
| 605 | tsk->acct_vm_mem1 = 0; | ||
| 606 | } | ||
| 607 | } | 634 | } |
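Note: the kernel/acct.c changes above split BSD process accounting into two phases. acct_collect() accumulates CPU times, fault counts, address-space size and flags into current->signal->pacct while the exiting task still owns them (under siglock), and the now argument-less acct_process() only formats and writes the record from that snapshot. The sketch below shows the intended ordering on the exit path; the wrapper function is hypothetical, the real call sites live in kernel/exit.c and are not part of this diff, and the code is not buildable outside a kernel tree.

/*
 * Illustrative sketch only: the order in which an exiting task is expected
 * to drive the two hooks introduced above; surrounding teardown steps are
 * paraphrased.
 */
static void sketch_exit_accounting(long code, int group_dead)
{
	/* 1. Snapshot accounting data while mm, signal and task flags are live. */
	acct_collect(code, group_dead);

	/* ... task tears down its mm, files, tty, ... */

	/* 2. Emit the pacct record from the data collected in step 1. */
	acct_process();
}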
diff --git a/kernel/audit.c b/kernel/audit.c
index 7dfac7031bd7..f9889ee77825 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -244,7 +244,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) | |||
| 244 | char *ctx = NULL; | 244 | char *ctx = NULL; |
| 245 | u32 len; | 245 | u32 len; |
| 246 | int rc; | 246 | int rc; |
| 247 | if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) | 247 | if ((rc = selinux_sid_to_string(sid, &ctx, &len))) |
| 248 | return rc; | 248 | return rc; |
| 249 | else | 249 | else |
| 250 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 250 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| @@ -267,7 +267,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) | |||
| 267 | char *ctx = NULL; | 267 | char *ctx = NULL; |
| 268 | u32 len; | 268 | u32 len; |
| 269 | int rc; | 269 | int rc; |
| 270 | if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) | 270 | if ((rc = selinux_sid_to_string(sid, &ctx, &len))) |
| 271 | return rc; | 271 | return rc; |
| 272 | else | 272 | else |
| 273 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 273 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| @@ -293,7 +293,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid) | |||
| 293 | char *ctx = NULL; | 293 | char *ctx = NULL; |
| 294 | u32 len; | 294 | u32 len; |
| 295 | int rc; | 295 | int rc; |
| 296 | if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) | 296 | if ((rc = selinux_sid_to_string(sid, &ctx, &len))) |
| 297 | return rc; | 297 | return rc; |
| 298 | else | 298 | else |
| 299 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 299 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| @@ -321,7 +321,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid) | |||
| 321 | char *ctx = NULL; | 321 | char *ctx = NULL; |
| 322 | u32 len; | 322 | u32 len; |
| 323 | int rc; | 323 | int rc; |
| 324 | if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) | 324 | if ((rc = selinux_sid_to_string(sid, &ctx, &len))) |
| 325 | return rc; | 325 | return rc; |
| 326 | else | 326 | else |
| 327 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 327 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, |
| @@ -445,7 +445,7 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, | |||
| 445 | * Check for appropriate CAP_AUDIT_ capabilities on incoming audit | 445 | * Check for appropriate CAP_AUDIT_ capabilities on incoming audit |
| 446 | * control messages. | 446 | * control messages. |
| 447 | */ | 447 | */ |
| 448 | static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) | 448 | static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) |
| 449 | { | 449 | { |
| 450 | int err = 0; | 450 | int err = 0; |
| 451 | 451 | ||
| @@ -459,13 +459,13 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) | |||
| 459 | case AUDIT_DEL: | 459 | case AUDIT_DEL: |
| 460 | case AUDIT_DEL_RULE: | 460 | case AUDIT_DEL_RULE: |
| 461 | case AUDIT_SIGNAL_INFO: | 461 | case AUDIT_SIGNAL_INFO: |
| 462 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) | 462 | if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) |
| 463 | err = -EPERM; | 463 | err = -EPERM; |
| 464 | break; | 464 | break; |
| 465 | case AUDIT_USER: | 465 | case AUDIT_USER: |
| 466 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 466 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: |
| 467 | case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: | 467 | case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: |
| 468 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) | 468 | if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) |
| 469 | err = -EPERM; | 469 | err = -EPERM; |
| 470 | break; | 470 | break; |
| 471 | default: /* bad msg */ | 471 | default: /* bad msg */ |
| @@ -488,7 +488,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 488 | char *ctx; | 488 | char *ctx; |
| 489 | u32 len; | 489 | u32 len; |
| 490 | 490 | ||
| 491 | err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); | 491 | err = audit_netlink_ok(skb, msg_type); |
| 492 | if (err) | 492 | if (err) |
| 493 | return err; | 493 | return err; |
| 494 | 494 | ||
| @@ -538,7 +538,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 538 | if (status_get->mask & AUDIT_STATUS_PID) { | 538 | if (status_get->mask & AUDIT_STATUS_PID) { |
| 539 | int old = audit_pid; | 539 | int old = audit_pid; |
| 540 | if (sid) { | 540 | if (sid) { |
| 541 | if ((err = selinux_ctxid_to_string( | 541 | if ((err = selinux_sid_to_string( |
| 542 | sid, &ctx, &len))) | 542 | sid, &ctx, &len))) |
| 543 | return err; | 543 | return err; |
| 544 | else | 544 | else |
| @@ -576,7 +576,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 576 | "user pid=%d uid=%u auid=%u", | 576 | "user pid=%d uid=%u auid=%u", |
| 577 | pid, uid, loginuid); | 577 | pid, uid, loginuid); |
| 578 | if (sid) { | 578 | if (sid) { |
| 579 | if (selinux_ctxid_to_string( | 579 | if (selinux_sid_to_string( |
| 580 | sid, &ctx, &len)) { | 580 | sid, &ctx, &len)) { |
| 581 | audit_log_format(ab, | 581 | audit_log_format(ab, |
| 582 | " ssid=%u", sid); | 582 | " ssid=%u", sid); |
| @@ -614,7 +614,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 614 | loginuid, sid); | 614 | loginuid, sid); |
| 615 | break; | 615 | break; |
| 616 | case AUDIT_SIGNAL_INFO: | 616 | case AUDIT_SIGNAL_INFO: |
| 617 | err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len); | 617 | err = selinux_sid_to_string(audit_sig_sid, &ctx, &len); |
| 618 | if (err) | 618 | if (err) |
| 619 | return err; | 619 | return err; |
| 620 | sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); | 620 | sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); |
| @@ -690,9 +690,7 @@ static const struct inotify_operations audit_inotify_ops = { | |||
| 690 | /* Initialize audit support at boot time. */ | 690 | /* Initialize audit support at boot time. */ |
| 691 | static int __init audit_init(void) | 691 | static int __init audit_init(void) |
| 692 | { | 692 | { |
| 693 | #ifdef CONFIG_AUDITSYSCALL | ||
| 694 | int i; | 693 | int i; |
| 695 | #endif | ||
| 696 | 694 | ||
| 697 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 695 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", |
| 698 | audit_default ? "enabled" : "disabled"); | 696 | audit_default ? "enabled" : "disabled"); |
| @@ -717,10 +715,10 @@ static int __init audit_init(void) | |||
| 717 | audit_ih = inotify_init(&audit_inotify_ops); | 715 | audit_ih = inotify_init(&audit_inotify_ops); |
| 718 | if (IS_ERR(audit_ih)) | 716 | if (IS_ERR(audit_ih)) |
| 719 | audit_panic("cannot initialize inotify handle"); | 717 | audit_panic("cannot initialize inotify handle"); |
| 718 | #endif | ||
| 720 | 719 | ||
| 721 | for (i = 0; i < AUDIT_INODE_BUCKETS; i++) | 720 | for (i = 0; i < AUDIT_INODE_BUCKETS; i++) |
| 722 | INIT_LIST_HEAD(&audit_inode_hash[i]); | 721 | INIT_LIST_HEAD(&audit_inode_hash[i]); |
| 723 | #endif | ||
| 724 | 722 | ||
| 725 | return 0; | 723 | return 0; |
| 726 | } | 724 | } |
| @@ -818,7 +816,7 @@ err: | |||
| 818 | */ | 816 | */ |
| 819 | unsigned int audit_serial(void) | 817 | unsigned int audit_serial(void) |
| 820 | { | 818 | { |
| 821 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; | 819 | static DEFINE_SPINLOCK(serial_lock); |
| 822 | static unsigned int serial = 0; | 820 | static unsigned int serial = 0; |
| 823 | 821 | ||
| 824 | unsigned long flags; | 822 | unsigned long flags; |
| @@ -1030,6 +1028,9 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, | |||
| 1030 | struct sk_buff *skb; | 1028 | struct sk_buff *skb; |
| 1031 | static const unsigned char *hex = "0123456789ABCDEF"; | 1029 | static const unsigned char *hex = "0123456789ABCDEF"; |
| 1032 | 1030 | ||
| 1031 | if (!ab) | ||
| 1032 | return; | ||
| 1033 | |||
| 1033 | BUG_ON(!ab->skb); | 1034 | BUG_ON(!ab->skb); |
| 1034 | skb = ab->skb; | 1035 | skb = ab->skb; |
| 1035 | avail = skb_tailroom(skb); | 1036 | avail = skb_tailroom(skb); |
| @@ -1062,6 +1063,9 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, | |||
| 1062 | unsigned char *ptr; | 1063 | unsigned char *ptr; |
| 1063 | struct sk_buff *skb; | 1064 | struct sk_buff *skb; |
| 1064 | 1065 | ||
| 1066 | if (!ab) | ||
| 1067 | return; | ||
| 1068 | |||
| 1065 | BUG_ON(!ab->skb); | 1069 | BUG_ON(!ab->skb); |
| 1066 | skb = ab->skb; | 1070 | skb = ab->skb; |
| 1067 | avail = skb_tailroom(skb); | 1071 | avail = skb_tailroom(skb); |
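Note: two hunks above add early "if (!ab) return;" guards to audit_log_hex() and audit_log_n_string(). The guards matter because audit_log_start() may legitimately return NULL (auditing disabled, allocation failure, or rate/backlog limits), and the helpers are expected to tolerate being handed that NULL. Below is a minimal sketch of the usual logging sequence, using only helpers that already appear in this diff; the record type and format string are examples, not taken from a specific hunk.

/* Sketch of the standard audit logging pattern; illustrative only. */
static void sketch_log_config_change(const char *untrusted_name)
{
	struct audit_buffer *ab;

	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
	if (!ab)
		return;			/* auditing off or record dropped */
	audit_log_format(ab, "op=example res=%d", 1);
	audit_log_format(ab, " name=");
	audit_log_untrustedstring(ab, untrusted_name);	/* escaped if needed */
	audit_log_end(ab);		/* hand the record to the audit daemon */
}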
diff --git a/kernel/audit.h b/kernel/audit.h
index 8323e4132a33..a3370232a390 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
| @@ -81,6 +81,7 @@ struct audit_krule { | |||
| 81 | u32 mask[AUDIT_BITMASK_SIZE]; | 81 | u32 mask[AUDIT_BITMASK_SIZE]; |
| 82 | u32 buflen; /* for data alloc on list rules */ | 82 | u32 buflen; /* for data alloc on list rules */ |
| 83 | u32 field_count; | 83 | u32 field_count; |
| 84 | char *filterkey; /* ties events to rules */ | ||
| 84 | struct audit_field *fields; | 85 | struct audit_field *fields; |
| 85 | struct audit_field *inode_f; /* quick access to an inode field */ | 86 | struct audit_field *inode_f; /* quick access to an inode field */ |
| 86 | struct audit_watch *watch; /* associated watch */ | 87 | struct audit_watch *watch; /* associated watch */ |
| @@ -103,6 +104,7 @@ static inline int audit_hash_ino(u32 ino) | |||
| 103 | return (ino & (AUDIT_INODE_BUCKETS-1)); | 104 | return (ino & (AUDIT_INODE_BUCKETS-1)); |
| 104 | } | 105 | } |
| 105 | 106 | ||
| 107 | extern int audit_match_class(int class, unsigned syscall); | ||
| 106 | extern int audit_comparator(const u32 left, const u32 op, const u32 right); | 108 | extern int audit_comparator(const u32 left, const u32 op, const u32 right); |
| 107 | extern int audit_compare_dname_path(const char *dname, const char *path, | 109 | extern int audit_compare_dname_path(const char *dname, const char *path, |
| 108 | int *dirlen); | 110 | int *dirlen); |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4c99d2c586ed..1a58a81fb09d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
| @@ -141,6 +141,7 @@ static inline void audit_free_rule(struct audit_entry *e) | |||
| 141 | selinux_audit_rule_free(f->se_rule); | 141 | selinux_audit_rule_free(f->se_rule); |
| 142 | } | 142 | } |
| 143 | kfree(e->rule.fields); | 143 | kfree(e->rule.fields); |
| 144 | kfree(e->rule.filterkey); | ||
| 144 | kfree(e); | 145 | kfree(e); |
| 145 | } | 146 | } |
| 146 | 147 | ||
| @@ -278,6 +279,38 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len, | |||
| 278 | return 0; | 279 | return 0; |
| 279 | } | 280 | } |
| 280 | 281 | ||
| 282 | static __u32 *classes[AUDIT_SYSCALL_CLASSES]; | ||
| 283 | |||
| 284 | int __init audit_register_class(int class, unsigned *list) | ||
| 285 | { | ||
| 286 | __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); | ||
| 287 | if (!p) | ||
| 288 | return -ENOMEM; | ||
| 289 | while (*list != ~0U) { | ||
| 290 | unsigned n = *list++; | ||
| 291 | if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) { | ||
| 292 | kfree(p); | ||
| 293 | return -EINVAL; | ||
| 294 | } | ||
| 295 | p[AUDIT_WORD(n)] |= AUDIT_BIT(n); | ||
| 296 | } | ||
| 297 | if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) { | ||
| 298 | kfree(p); | ||
| 299 | return -EINVAL; | ||
| 300 | } | ||
| 301 | classes[class] = p; | ||
| 302 | return 0; | ||
| 303 | } | ||
| 304 | |||
| 305 | int audit_match_class(int class, unsigned syscall) | ||
| 306 | { | ||
| 307 | if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32))) | ||
| 308 | return 0; | ||
| 309 | if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) | ||
| 310 | return 0; | ||
| 311 | return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall); | ||
| 312 | } | ||
| 313 | |||
| 281 | /* Common user-space to kernel rule translation. */ | 314 | /* Common user-space to kernel rule translation. */ |
| 282 | static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) | 315 | static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) |
| 283 | { | 316 | { |
| @@ -321,6 +354,22 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) | |||
| 321 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | 354 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) |
| 322 | entry->rule.mask[i] = rule->mask[i]; | 355 | entry->rule.mask[i] = rule->mask[i]; |
| 323 | 356 | ||
| 357 | for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) { | ||
| 358 | int bit = AUDIT_BITMASK_SIZE * 32 - i - 1; | ||
| 359 | __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)]; | ||
| 360 | __u32 *class; | ||
| 361 | |||
| 362 | if (!(*p & AUDIT_BIT(bit))) | ||
| 363 | continue; | ||
| 364 | *p &= ~AUDIT_BIT(bit); | ||
| 365 | class = classes[i]; | ||
| 366 | if (class) { | ||
| 367 | int j; | ||
| 368 | for (j = 0; j < AUDIT_BITMASK_SIZE; j++) | ||
| 369 | entry->rule.mask[j] |= class[j]; | ||
| 370 | } | ||
| 371 | } | ||
| 372 | |||
| 324 | return entry; | 373 | return entry; |
| 325 | 374 | ||
| 326 | exit_err: | 375 | exit_err: |
| @@ -364,6 +413,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
| 364 | case AUDIT_PERS: | 413 | case AUDIT_PERS: |
| 365 | case AUDIT_ARCH: | 414 | case AUDIT_ARCH: |
| 366 | case AUDIT_MSGTYPE: | 415 | case AUDIT_MSGTYPE: |
| 416 | case AUDIT_PPID: | ||
| 367 | case AUDIT_DEVMAJOR: | 417 | case AUDIT_DEVMAJOR: |
| 368 | case AUDIT_DEVMINOR: | 418 | case AUDIT_DEVMINOR: |
| 369 | case AUDIT_EXIT: | 419 | case AUDIT_EXIT: |
| @@ -373,6 +423,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
| 373 | case AUDIT_ARG2: | 423 | case AUDIT_ARG2: |
| 374 | case AUDIT_ARG3: | 424 | case AUDIT_ARG3: |
| 375 | break; | 425 | break; |
| 426 | case AUDIT_PERM: | ||
| 427 | if (f->val & ~15) | ||
| 428 | goto exit_free; | ||
| 429 | break; | ||
| 376 | case AUDIT_INODE: | 430 | case AUDIT_INODE: |
| 377 | err = audit_to_inode(&entry->rule, f); | 431 | err = audit_to_inode(&entry->rule, f); |
| 378 | if (err) | 432 | if (err) |
| @@ -402,6 +456,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
| 402 | case AUDIT_EQUAL: | 456 | case AUDIT_EQUAL: |
| 403 | break; | 457 | break; |
| 404 | default: | 458 | default: |
| 459 | err = -EINVAL; | ||
| 405 | goto exit_free; | 460 | goto exit_free; |
| 406 | } | 461 | } |
| 407 | } | 462 | } |
| @@ -469,11 +524,16 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 469 | case AUDIT_ARG2: | 524 | case AUDIT_ARG2: |
| 470 | case AUDIT_ARG3: | 525 | case AUDIT_ARG3: |
| 471 | break; | 526 | break; |
| 472 | case AUDIT_SE_USER: | 527 | case AUDIT_SUBJ_USER: |
| 473 | case AUDIT_SE_ROLE: | 528 | case AUDIT_SUBJ_ROLE: |
| 474 | case AUDIT_SE_TYPE: | 529 | case AUDIT_SUBJ_TYPE: |
| 475 | case AUDIT_SE_SEN: | 530 | case AUDIT_SUBJ_SEN: |
| 476 | case AUDIT_SE_CLR: | 531 | case AUDIT_SUBJ_CLR: |
| 532 | case AUDIT_OBJ_USER: | ||
| 533 | case AUDIT_OBJ_ROLE: | ||
| 534 | case AUDIT_OBJ_TYPE: | ||
| 535 | case AUDIT_OBJ_LEV_LOW: | ||
| 536 | case AUDIT_OBJ_LEV_HIGH: | ||
| 477 | str = audit_unpack_string(&bufp, &remain, f->val); | 537 | str = audit_unpack_string(&bufp, &remain, f->val); |
| 478 | if (IS_ERR(str)) | 538 | if (IS_ERR(str)) |
| 479 | goto exit_free; | 539 | goto exit_free; |
| @@ -511,6 +571,20 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 511 | if (err) | 571 | if (err) |
| 512 | goto exit_free; | 572 | goto exit_free; |
| 513 | break; | 573 | break; |
| 574 | case AUDIT_FILTERKEY: | ||
| 575 | err = -EINVAL; | ||
| 576 | if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) | ||
| 577 | goto exit_free; | ||
| 578 | str = audit_unpack_string(&bufp, &remain, f->val); | ||
| 579 | if (IS_ERR(str)) | ||
| 580 | goto exit_free; | ||
| 581 | entry->rule.buflen += f->val; | ||
| 582 | entry->rule.filterkey = str; | ||
| 583 | break; | ||
| 584 | case AUDIT_PERM: | ||
| 585 | if (f->val & ~15) | ||
| 586 | goto exit_free; | ||
| 587 | break; | ||
| 514 | default: | 588 | default: |
| 515 | goto exit_free; | 589 | goto exit_free; |
| 516 | } | 590 | } |
| @@ -524,6 +598,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 524 | case AUDIT_EQUAL: | 598 | case AUDIT_EQUAL: |
| 525 | break; | 599 | break; |
| 526 | default: | 600 | default: |
| 601 | err = -EINVAL; | ||
| 527 | goto exit_free; | 602 | goto exit_free; |
| 528 | } | 603 | } |
| 529 | } | 604 | } |
| @@ -600,11 +675,16 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | |||
| 600 | data->fields[i] = f->type; | 675 | data->fields[i] = f->type; |
| 601 | data->fieldflags[i] = f->op; | 676 | data->fieldflags[i] = f->op; |
| 602 | switch(f->type) { | 677 | switch(f->type) { |
| 603 | case AUDIT_SE_USER: | 678 | case AUDIT_SUBJ_USER: |
| 604 | case AUDIT_SE_ROLE: | 679 | case AUDIT_SUBJ_ROLE: |
| 605 | case AUDIT_SE_TYPE: | 680 | case AUDIT_SUBJ_TYPE: |
| 606 | case AUDIT_SE_SEN: | 681 | case AUDIT_SUBJ_SEN: |
| 607 | case AUDIT_SE_CLR: | 682 | case AUDIT_SUBJ_CLR: |
| 683 | case AUDIT_OBJ_USER: | ||
| 684 | case AUDIT_OBJ_ROLE: | ||
| 685 | case AUDIT_OBJ_TYPE: | ||
| 686 | case AUDIT_OBJ_LEV_LOW: | ||
| 687 | case AUDIT_OBJ_LEV_HIGH: | ||
| 608 | data->buflen += data->values[i] = | 688 | data->buflen += data->values[i] = |
| 609 | audit_pack_string(&bufp, f->se_str); | 689 | audit_pack_string(&bufp, f->se_str); |
| 610 | break; | 690 | break; |
| @@ -612,6 +692,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | |||
| 612 | data->buflen += data->values[i] = | 692 | data->buflen += data->values[i] = |
| 613 | audit_pack_string(&bufp, krule->watch->path); | 693 | audit_pack_string(&bufp, krule->watch->path); |
| 614 | break; | 694 | break; |
| 695 | case AUDIT_FILTERKEY: | ||
| 696 | data->buflen += data->values[i] = | ||
| 697 | audit_pack_string(&bufp, krule->filterkey); | ||
| 698 | break; | ||
| 615 | default: | 699 | default: |
| 616 | data->values[i] = f->val; | 700 | data->values[i] = f->val; |
| 617 | } | 701 | } |
| @@ -639,11 +723,16 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
| 639 | return 1; | 723 | return 1; |
| 640 | 724 | ||
| 641 | switch(a->fields[i].type) { | 725 | switch(a->fields[i].type) { |
| 642 | case AUDIT_SE_USER: | 726 | case AUDIT_SUBJ_USER: |
| 643 | case AUDIT_SE_ROLE: | 727 | case AUDIT_SUBJ_ROLE: |
| 644 | case AUDIT_SE_TYPE: | 728 | case AUDIT_SUBJ_TYPE: |
| 645 | case AUDIT_SE_SEN: | 729 | case AUDIT_SUBJ_SEN: |
| 646 | case AUDIT_SE_CLR: | 730 | case AUDIT_SUBJ_CLR: |
| 731 | case AUDIT_OBJ_USER: | ||
| 732 | case AUDIT_OBJ_ROLE: | ||
| 733 | case AUDIT_OBJ_TYPE: | ||
| 734 | case AUDIT_OBJ_LEV_LOW: | ||
| 735 | case AUDIT_OBJ_LEV_HIGH: | ||
| 647 | if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) | 736 | if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) |
| 648 | return 1; | 737 | return 1; |
| 649 | break; | 738 | break; |
| @@ -651,6 +740,11 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
| 651 | if (strcmp(a->watch->path, b->watch->path)) | 740 | if (strcmp(a->watch->path, b->watch->path)) |
| 652 | return 1; | 741 | return 1; |
| 653 | break; | 742 | break; |
| 743 | case AUDIT_FILTERKEY: | ||
| 744 | /* both filterkeys exist based on above type compare */ | ||
| 745 | if (strcmp(a->filterkey, b->filterkey)) | ||
| 746 | return 1; | ||
| 747 | break; | ||
| 654 | default: | 748 | default: |
| 655 | if (a->fields[i].val != b->fields[i].val) | 749 | if (a->fields[i].val != b->fields[i].val) |
| 656 | return 1; | 750 | return 1; |
| @@ -730,6 +824,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, | |||
| 730 | u32 fcount = old->field_count; | 824 | u32 fcount = old->field_count; |
| 731 | struct audit_entry *entry; | 825 | struct audit_entry *entry; |
| 732 | struct audit_krule *new; | 826 | struct audit_krule *new; |
| 827 | char *fk; | ||
| 733 | int i, err = 0; | 828 | int i, err = 0; |
| 734 | 829 | ||
| 735 | entry = audit_init_entry(fcount); | 830 | entry = audit_init_entry(fcount); |
| @@ -753,13 +848,25 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, | |||
| 753 | * the originals will all be freed when the old rule is freed. */ | 848 | * the originals will all be freed when the old rule is freed. */ |
| 754 | for (i = 0; i < fcount; i++) { | 849 | for (i = 0; i < fcount; i++) { |
| 755 | switch (new->fields[i].type) { | 850 | switch (new->fields[i].type) { |
| 756 | case AUDIT_SE_USER: | 851 | case AUDIT_SUBJ_USER: |
| 757 | case AUDIT_SE_ROLE: | 852 | case AUDIT_SUBJ_ROLE: |
| 758 | case AUDIT_SE_TYPE: | 853 | case AUDIT_SUBJ_TYPE: |
| 759 | case AUDIT_SE_SEN: | 854 | case AUDIT_SUBJ_SEN: |
| 760 | case AUDIT_SE_CLR: | 855 | case AUDIT_SUBJ_CLR: |
| 856 | case AUDIT_OBJ_USER: | ||
| 857 | case AUDIT_OBJ_ROLE: | ||
| 858 | case AUDIT_OBJ_TYPE: | ||
| 859 | case AUDIT_OBJ_LEV_LOW: | ||
| 860 | case AUDIT_OBJ_LEV_HIGH: | ||
| 761 | err = audit_dupe_selinux_field(&new->fields[i], | 861 | err = audit_dupe_selinux_field(&new->fields[i], |
| 762 | &old->fields[i]); | 862 | &old->fields[i]); |
| 863 | break; | ||
| 864 | case AUDIT_FILTERKEY: | ||
| 865 | fk = kstrdup(old->filterkey, GFP_KERNEL); | ||
| 866 | if (unlikely(!fk)) | ||
| 867 | err = -ENOMEM; | ||
| 868 | else | ||
| 869 | new->filterkey = fk; | ||
| 763 | } | 870 | } |
| 764 | if (err) { | 871 | if (err) { |
| 765 | audit_free_rule(entry); | 872 | audit_free_rule(entry); |
| @@ -824,7 +931,7 @@ static void audit_update_watch(struct audit_parent *parent, | |||
| 824 | } | 931 | } |
| 825 | 932 | ||
| 826 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 933 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
| 827 | audit_log_format(ab, "audit updated rules specifying watch="); | 934 | audit_log_format(ab, "audit updated rules specifying path="); |
| 828 | audit_log_untrustedstring(ab, owatch->path); | 935 | audit_log_untrustedstring(ab, owatch->path); |
| 829 | audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); | 936 | audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); |
| 830 | audit_log_end(ab); | 937 | audit_log_end(ab); |
| @@ -847,19 +954,28 @@ static void audit_remove_parent_watches(struct audit_parent *parent) | |||
| 847 | struct audit_watch *w, *nextw; | 954 | struct audit_watch *w, *nextw; |
| 848 | struct audit_krule *r, *nextr; | 955 | struct audit_krule *r, *nextr; |
| 849 | struct audit_entry *e; | 956 | struct audit_entry *e; |
| 957 | struct audit_buffer *ab; | ||
| 850 | 958 | ||
| 851 | mutex_lock(&audit_filter_mutex); | 959 | mutex_lock(&audit_filter_mutex); |
| 852 | parent->flags |= AUDIT_PARENT_INVALID; | 960 | parent->flags |= AUDIT_PARENT_INVALID; |
| 853 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { | 961 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { |
| 854 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { | 962 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { |
| 855 | e = container_of(r, struct audit_entry, rule); | 963 | e = container_of(r, struct audit_entry, rule); |
| 964 | |||
| 965 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | ||
| 966 | audit_log_format(ab, "audit implicitly removed rule path="); | ||
| 967 | audit_log_untrustedstring(ab, w->path); | ||
| 968 | if (r->filterkey) { | ||
| 969 | audit_log_format(ab, " key="); | ||
| 970 | audit_log_untrustedstring(ab, r->filterkey); | ||
| 971 | } else | ||
| 972 | audit_log_format(ab, " key=(null)"); | ||
| 973 | audit_log_format(ab, " list=%d", r->listnr); | ||
| 974 | audit_log_end(ab); | ||
| 975 | |||
| 856 | list_del(&r->rlist); | 976 | list_del(&r->rlist); |
| 857 | list_del_rcu(&e->list); | 977 | list_del_rcu(&e->list); |
| 858 | call_rcu(&e->rcu, audit_free_rule_rcu); | 978 | call_rcu(&e->rcu, audit_free_rule_rcu); |
| 859 | |||
| 860 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
| 861 | "audit implicitly removed rule from list=%d\n", | ||
| 862 | AUDIT_FILTER_EXIT); | ||
| 863 | } | 979 | } |
| 864 | audit_remove_watch(w); | 980 | audit_remove_watch(w); |
| 865 | } | 981 | } |
| @@ -1047,6 +1163,14 @@ static inline int audit_add_rule(struct audit_entry *entry, | |||
| 1047 | struct audit_watch *watch = entry->rule.watch; | 1163 | struct audit_watch *watch = entry->rule.watch; |
| 1048 | struct nameidata *ndp, *ndw; | 1164 | struct nameidata *ndp, *ndw; |
| 1049 | int h, err, putnd_needed = 0; | 1165 | int h, err, putnd_needed = 0; |
| 1166 | #ifdef CONFIG_AUDITSYSCALL | ||
| 1167 | int dont_count = 0; | ||
| 1168 | |||
| 1169 | /* If either of these, don't count towards total */ | ||
| 1170 | if (entry->rule.listnr == AUDIT_FILTER_USER || | ||
| 1171 | entry->rule.listnr == AUDIT_FILTER_TYPE) | ||
| 1172 | dont_count = 1; | ||
| 1173 | #endif | ||
| 1050 | 1174 | ||
| 1051 | if (inode_f) { | 1175 | if (inode_f) { |
| 1052 | h = audit_hash_ino(inode_f->val); | 1176 | h = audit_hash_ino(inode_f->val); |
| @@ -1087,6 +1211,10 @@ static inline int audit_add_rule(struct audit_entry *entry, | |||
| 1087 | } else { | 1211 | } else { |
| 1088 | list_add_tail_rcu(&entry->list, list); | 1212 | list_add_tail_rcu(&entry->list, list); |
| 1089 | } | 1213 | } |
| 1214 | #ifdef CONFIG_AUDITSYSCALL | ||
| 1215 | if (!dont_count) | ||
| 1216 | audit_n_rules++; | ||
| 1217 | #endif | ||
| 1090 | mutex_unlock(&audit_filter_mutex); | 1218 | mutex_unlock(&audit_filter_mutex); |
| 1091 | 1219 | ||
| 1092 | if (putnd_needed) | 1220 | if (putnd_needed) |
| @@ -1111,6 +1239,14 @@ static inline int audit_del_rule(struct audit_entry *entry, | |||
| 1111 | struct audit_watch *watch, *tmp_watch = entry->rule.watch; | 1239 | struct audit_watch *watch, *tmp_watch = entry->rule.watch; |
| 1112 | LIST_HEAD(inotify_list); | 1240 | LIST_HEAD(inotify_list); |
| 1113 | int h, ret = 0; | 1241 | int h, ret = 0; |
| 1242 | #ifdef CONFIG_AUDITSYSCALL | ||
| 1243 | int dont_count = 0; | ||
| 1244 | |||
| 1245 | /* If either of these, don't count towards total */ | ||
| 1246 | if (entry->rule.listnr == AUDIT_FILTER_USER || | ||
| 1247 | entry->rule.listnr == AUDIT_FILTER_TYPE) | ||
| 1248 | dont_count = 1; | ||
| 1249 | #endif | ||
| 1114 | 1250 | ||
| 1115 | if (inode_f) { | 1251 | if (inode_f) { |
| 1116 | h = audit_hash_ino(inode_f->val); | 1252 | h = audit_hash_ino(inode_f->val); |
| @@ -1148,6 +1284,10 @@ static inline int audit_del_rule(struct audit_entry *entry, | |||
| 1148 | list_del_rcu(&e->list); | 1284 | list_del_rcu(&e->list); |
| 1149 | call_rcu(&e->rcu, audit_free_rule_rcu); | 1285 | call_rcu(&e->rcu, audit_free_rule_rcu); |
| 1150 | 1286 | ||
| 1287 | #ifdef CONFIG_AUDITSYSCALL | ||
| 1288 | if (!dont_count) | ||
| 1289 | audit_n_rules--; | ||
| 1290 | #endif | ||
| 1151 | mutex_unlock(&audit_filter_mutex); | 1291 | mutex_unlock(&audit_filter_mutex); |
| 1152 | 1292 | ||
| 1153 | if (!list_empty(&inotify_list)) | 1293 | if (!list_empty(&inotify_list)) |
| @@ -1245,6 +1385,34 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) | |||
| 1245 | skb_queue_tail(q, skb); | 1385 | skb_queue_tail(q, skb); |
| 1246 | } | 1386 | } |
| 1247 | 1387 | ||
| 1388 | /* Log rule additions and removals */ | ||
| 1389 | static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, | ||
| 1390 | struct audit_krule *rule, int res) | ||
| 1391 | { | ||
| 1392 | struct audit_buffer *ab; | ||
| 1393 | |||
| 1394 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | ||
| 1395 | if (!ab) | ||
| 1396 | return; | ||
| 1397 | audit_log_format(ab, "auid=%u", loginuid); | ||
| 1398 | if (sid) { | ||
| 1399 | char *ctx = NULL; | ||
| 1400 | u32 len; | ||
| 1401 | if (selinux_sid_to_string(sid, &ctx, &len)) | ||
| 1402 | audit_log_format(ab, " ssid=%u", sid); | ||
| 1403 | else | ||
| 1404 | audit_log_format(ab, " subj=%s", ctx); | ||
| 1405 | kfree(ctx); | ||
| 1406 | } | ||
| 1407 | audit_log_format(ab, " %s rule key=", action); | ||
| 1408 | if (rule->filterkey) | ||
| 1409 | audit_log_untrustedstring(ab, rule->filterkey); | ||
| 1410 | else | ||
| 1411 | audit_log_format(ab, "(null)"); | ||
| 1412 | audit_log_format(ab, " list=%d res=%d", rule->listnr, res); | ||
| 1413 | audit_log_end(ab); | ||
| 1414 | } | ||
| 1415 | |||
| 1248 | /** | 1416 | /** |
| 1249 | * audit_receive_filter - apply all rules to the specified message type | 1417 | * audit_receive_filter - apply all rules to the specified message type |
| 1250 | * @type: audit message type | 1418 | * @type: audit message type |
| @@ -1304,24 +1472,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
| 1304 | 1472 | ||
| 1305 | err = audit_add_rule(entry, | 1473 | err = audit_add_rule(entry, |
| 1306 | &audit_filter_list[entry->rule.listnr]); | 1474 | &audit_filter_list[entry->rule.listnr]); |
| 1307 | 1475 | audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err); | |
| 1308 | if (sid) { | ||
| 1309 | char *ctx = NULL; | ||
| 1310 | u32 len; | ||
| 1311 | if (selinux_ctxid_to_string(sid, &ctx, &len)) { | ||
| 1312 | /* Maybe call audit_panic? */ | ||
| 1313 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
| 1314 | "auid=%u ssid=%u add rule to list=%d res=%d", | ||
| 1315 | loginuid, sid, entry->rule.listnr, !err); | ||
| 1316 | } else | ||
| 1317 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
| 1318 | "auid=%u subj=%s add rule to list=%d res=%d", | ||
| 1319 | loginuid, ctx, entry->rule.listnr, !err); | ||
| 1320 | kfree(ctx); | ||
| 1321 | } else | ||
| 1322 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
| 1323 | "auid=%u add rule to list=%d res=%d", | ||
| 1324 | loginuid, entry->rule.listnr, !err); | ||
| 1325 | 1476 | ||
| 1326 | if (err) | 1477 | if (err) |
| 1327 | audit_free_rule(entry); | 1478 | audit_free_rule(entry); |
| @@ -1337,24 +1488,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
| 1337 | 1488 | ||
| 1338 | err = audit_del_rule(entry, | 1489 | err = audit_del_rule(entry, |
| 1339 | &audit_filter_list[entry->rule.listnr]); | 1490 | &audit_filter_list[entry->rule.listnr]); |
| 1340 | 1491 | audit_log_rule_change(loginuid, sid, "remove", &entry->rule, | |
| 1341 | if (sid) { | 1492 | !err); |
| 1342 | char *ctx = NULL; | ||
| 1343 | u32 len; | ||
| 1344 | if (selinux_ctxid_to_string(sid, &ctx, &len)) { | ||
| 1345 | /* Maybe call audit_panic? */ | ||
| 1346 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
| 1347 | "auid=%u ssid=%u remove rule from list=%d res=%d", | ||
| 1348 | loginuid, sid, entry->rule.listnr, !err); | ||
| 1349 | } else | ||
| 1350 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
| 1351 | "auid=%u subj=%s remove rule from list=%d res=%d", | ||
| 1352 | loginuid, ctx, entry->rule.listnr, !err); | ||
| 1353 | kfree(ctx); | ||
| 1354 | } else | ||
| 1355 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
| 1356 | "auid=%u remove rule from list=%d res=%d", | ||
| 1357 | loginuid, entry->rule.listnr, !err); | ||
| 1358 | 1493 | ||
| 1359 | audit_free_rule(entry); | 1494 | audit_free_rule(entry); |
| 1360 | break; | 1495 | break; |
| @@ -1514,11 +1649,16 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) | |||
| 1514 | for (i = 0; i < rule->field_count; i++) { | 1649 | for (i = 0; i < rule->field_count; i++) { |
| 1515 | struct audit_field *f = &rule->fields[i]; | 1650 | struct audit_field *f = &rule->fields[i]; |
| 1516 | switch (f->type) { | 1651 | switch (f->type) { |
| 1517 | case AUDIT_SE_USER: | 1652 | case AUDIT_SUBJ_USER: |
| 1518 | case AUDIT_SE_ROLE: | 1653 | case AUDIT_SUBJ_ROLE: |
| 1519 | case AUDIT_SE_TYPE: | 1654 | case AUDIT_SUBJ_TYPE: |
| 1520 | case AUDIT_SE_SEN: | 1655 | case AUDIT_SUBJ_SEN: |
| 1521 | case AUDIT_SE_CLR: | 1656 | case AUDIT_SUBJ_CLR: |
| 1657 | case AUDIT_OBJ_USER: | ||
| 1658 | case AUDIT_OBJ_ROLE: | ||
| 1659 | case AUDIT_OBJ_TYPE: | ||
| 1660 | case AUDIT_OBJ_LEV_LOW: | ||
| 1661 | case AUDIT_OBJ_LEV_HIGH: | ||
| 1522 | return 1; | 1662 | return 1; |
| 1523 | } | 1663 | } |
| 1524 | } | 1664 | } |
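Note: audit_register_class(), added in kernel/auditfilter.c above, takes a class number plus a ~0U-terminated array of syscall numbers and folds them into a per-class bitmask; audit_match_class() tests single syscalls against it, and audit_to_entry_common() expands the reserved high bits of a rule's mask into these class masks. The standalone sketch below reproduces only the bitmask arithmetic: the AUDIT_WORD()/AUDIT_BIT() and AUDIT_BITMASK_SIZE definitions are assumed to match include/linux/audit.h of this era, the syscall numbers are invented for illustration, and the kernel's extra bound checks are omitted.

#include <stdio.h>

#define AUDIT_BITMASK_SIZE 64			/* assumed, as in include/linux/audit.h */
#define AUDIT_WORD(nr) ((nr) / 32)
#define AUDIT_BIT(nr)  (1U << ((nr) - AUDIT_WORD(nr) * 32))

int main(void)
{
	/* Hypothetical syscall class: a ~0U-terminated list, the shape
	 * audit_register_class() expects. */
	unsigned syscalls[] = { 1, 4, 10, 93, ~0U };
	unsigned mask[AUDIT_BITMASK_SIZE] = { 0 };
	unsigned *p;

	for (p = syscalls; *p != ~0U; p++)	/* same loop shape as the kernel code */
		mask[AUDIT_WORD(*p)] |= AUDIT_BIT(*p);

	/* audit_match_class() is essentially this test: */
	printf("syscall 93 in class: %s\n",
	       mask[AUDIT_WORD(93)] & AUDIT_BIT(93) ? "yes" : "no");
	printf("syscall 94 in class: %s\n",
	       mask[AUDIT_WORD(94)] & AUDIT_BIT(94) ? "yes" : "no");
	return 0;
}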
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9ebd96fda295..105147631753 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -85,6 +85,9 @@ extern int audit_enabled; | |||
| 85 | /* Indicates that audit should log the full pathname. */ | 85 | /* Indicates that audit should log the full pathname. */ |
| 86 | #define AUDIT_NAME_FULL -1 | 86 | #define AUDIT_NAME_FULL -1 |
| 87 | 87 | ||
| 88 | /* number of audit rules */ | ||
| 89 | int audit_n_rules; | ||
| 90 | |||
| 88 | /* When fs/namei.c:getname() is called, we store the pointer in name and | 91 | /* When fs/namei.c:getname() is called, we store the pointer in name and |
| 89 | * we don't let putname() free it (instead we free all of the saved | 92 | * we don't let putname() free it (instead we free all of the saved |
| 90 | * pointers at syscall exit time). | 93 | * pointers at syscall exit time). |
| @@ -174,6 +177,7 @@ struct audit_aux_data_path { | |||
| 174 | 177 | ||
| 175 | /* The per-task audit context. */ | 178 | /* The per-task audit context. */ |
| 176 | struct audit_context { | 179 | struct audit_context { |
| 180 | int dummy; /* must be the first element */ | ||
| 177 | int in_syscall; /* 1 if task is in a syscall */ | 181 | int in_syscall; /* 1 if task is in a syscall */ |
| 178 | enum audit_state state; | 182 | enum audit_state state; |
| 179 | unsigned int serial; /* serial number for record */ | 183 | unsigned int serial; /* serial number for record */ |
| @@ -186,6 +190,7 @@ struct audit_context { | |||
| 186 | int auditable; /* 1 if record should be written */ | 190 | int auditable; /* 1 if record should be written */ |
| 187 | int name_count; | 191 | int name_count; |
| 188 | struct audit_names names[AUDIT_NAMES]; | 192 | struct audit_names names[AUDIT_NAMES]; |
| 193 | char * filterkey; /* key for rule that triggered record */ | ||
| 189 | struct dentry * pwd; | 194 | struct dentry * pwd; |
| 190 | struct vfsmount * pwdmnt; | 195 | struct vfsmount * pwdmnt; |
| 191 | struct audit_context *previous; /* For nested syscalls */ | 196 | struct audit_context *previous; /* For nested syscalls */ |
| @@ -204,6 +209,54 @@ struct audit_context { | |||
| 204 | #endif | 209 | #endif |
| 205 | }; | 210 | }; |
| 206 | 211 | ||
| 212 | #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) | ||
| 213 | static inline int open_arg(int flags, int mask) | ||
| 214 | { | ||
| 215 | int n = ACC_MODE(flags); | ||
| 216 | if (flags & (O_TRUNC | O_CREAT)) | ||
| 217 | n |= AUDIT_PERM_WRITE; | ||
| 218 | return n & mask; | ||
| 219 | } | ||
| 220 | |||
| 221 | static int audit_match_perm(struct audit_context *ctx, int mask) | ||
| 222 | { | ||
| 223 | unsigned n = ctx->major; | ||
| 224 | switch (audit_classify_syscall(ctx->arch, n)) { | ||
| 225 | case 0: /* native */ | ||
| 226 | if ((mask & AUDIT_PERM_WRITE) && | ||
| 227 | audit_match_class(AUDIT_CLASS_WRITE, n)) | ||
| 228 | return 1; | ||
| 229 | if ((mask & AUDIT_PERM_READ) && | ||
| 230 | audit_match_class(AUDIT_CLASS_READ, n)) | ||
| 231 | return 1; | ||
| 232 | if ((mask & AUDIT_PERM_ATTR) && | ||
| 233 | audit_match_class(AUDIT_CLASS_CHATTR, n)) | ||
| 234 | return 1; | ||
| 235 | return 0; | ||
| 236 | case 1: /* 32bit on biarch */ | ||
| 237 | if ((mask & AUDIT_PERM_WRITE) && | ||
| 238 | audit_match_class(AUDIT_CLASS_WRITE_32, n)) | ||
| 239 | return 1; | ||
| 240 | if ((mask & AUDIT_PERM_READ) && | ||
| 241 | audit_match_class(AUDIT_CLASS_READ_32, n)) | ||
| 242 | return 1; | ||
| 243 | if ((mask & AUDIT_PERM_ATTR) && | ||
| 244 | audit_match_class(AUDIT_CLASS_CHATTR_32, n)) | ||
| 245 | return 1; | ||
| 246 | return 0; | ||
| 247 | case 2: /* open */ | ||
| 248 | return mask & ACC_MODE(ctx->argv[1]); | ||
| 249 | case 3: /* openat */ | ||
| 250 | return mask & ACC_MODE(ctx->argv[2]); | ||
| 251 | case 4: /* socketcall */ | ||
| 252 | return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND); | ||
| 253 | case 5: /* execve */ | ||
| 254 | return mask & AUDIT_PERM_EXEC; | ||
| 255 | default: | ||
| 256 | return 0; | ||
| 257 | } | ||
| 258 | } | ||
| 259 | |||
| 207 | /* Determine if any context name data matches a rule's watch data */ | 260 | /* Determine if any context name data matches a rule's watch data */ |
| 208 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 | 261 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 |
| 209 | * otherwise. */ | 262 | * otherwise. */ |
| @@ -320,11 +373,11 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 320 | if (ctx) | 373 | if (ctx) |
| 321 | result = audit_comparator(ctx->loginuid, f->op, f->val); | 374 | result = audit_comparator(ctx->loginuid, f->op, f->val); |
| 322 | break; | 375 | break; |
| 323 | case AUDIT_SE_USER: | 376 | case AUDIT_SUBJ_USER: |
| 324 | case AUDIT_SE_ROLE: | 377 | case AUDIT_SUBJ_ROLE: |
| 325 | case AUDIT_SE_TYPE: | 378 | case AUDIT_SUBJ_TYPE: |
| 326 | case AUDIT_SE_SEN: | 379 | case AUDIT_SUBJ_SEN: |
| 327 | case AUDIT_SE_CLR: | 380 | case AUDIT_SUBJ_CLR: |
| 328 | /* NOTE: this may return negative values indicating | 381 | /* NOTE: this may return negative values indicating |
| 329 | a temporary error. We simply treat this as a | 382 | a temporary error. We simply treat this as a |
| 330 | match for now to avoid losing information that | 383 | match for now to avoid losing information that |
| @@ -332,7 +385,7 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 332 | logged upon error */ | 385 | logged upon error */ |
| 333 | if (f->se_rule) { | 386 | if (f->se_rule) { |
| 334 | if (need_sid) { | 387 | if (need_sid) { |
| 335 | selinux_task_ctxid(tsk, &sid); | 388 | selinux_get_task_sid(tsk, &sid); |
| 336 | need_sid = 0; | 389 | need_sid = 0; |
| 337 | } | 390 | } |
| 338 | result = selinux_audit_rule_match(sid, f->type, | 391 | result = selinux_audit_rule_match(sid, f->type, |
| @@ -341,6 +394,46 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 341 | ctx); | 394 | ctx); |
| 342 | } | 395 | } |
| 343 | break; | 396 | break; |
| 397 | case AUDIT_OBJ_USER: | ||
| 398 | case AUDIT_OBJ_ROLE: | ||
| 399 | case AUDIT_OBJ_TYPE: | ||
| 400 | case AUDIT_OBJ_LEV_LOW: | ||
| 401 | case AUDIT_OBJ_LEV_HIGH: | ||
| 402 | /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR | ||
| 403 | also applies here */ | ||
| 404 | if (f->se_rule) { | ||
| 405 | /* Find files that match */ | ||
| 406 | if (name) { | ||
| 407 | result = selinux_audit_rule_match( | ||
| 408 | name->osid, f->type, f->op, | ||
| 409 | f->se_rule, ctx); | ||
| 410 | } else if (ctx) { | ||
| 411 | for (j = 0; j < ctx->name_count; j++) { | ||
| 412 | if (selinux_audit_rule_match( | ||
| 413 | ctx->names[j].osid, | ||
| 414 | f->type, f->op, | ||
| 415 | f->se_rule, ctx)) { | ||
| 416 | ++result; | ||
| 417 | break; | ||
| 418 | } | ||
| 419 | } | ||
| 420 | } | ||
| 421 | /* Find ipc objects that match */ | ||
| 422 | if (ctx) { | ||
| 423 | struct audit_aux_data *aux; | ||
| 424 | for (aux = ctx->aux; aux; | ||
| 425 | aux = aux->next) { | ||
| 426 | if (aux->type == AUDIT_IPC) { | ||
| 427 | struct audit_aux_data_ipcctl *axi = (void *)aux; | ||
| 428 | if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { | ||
| 429 | ++result; | ||
| 430 | break; | ||
| 431 | } | ||
| 432 | } | ||
| 433 | } | ||
| 434 | } | ||
| 435 | } | ||
| 436 | break; | ||
| 344 | case AUDIT_ARG0: | 437 | case AUDIT_ARG0: |
| 345 | case AUDIT_ARG1: | 438 | case AUDIT_ARG1: |
| 346 | case AUDIT_ARG2: | 439 | case AUDIT_ARG2: |
| @@ -348,11 +441,20 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 348 | if (ctx) | 441 | if (ctx) |
| 349 | result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); | 442 | result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); |
| 350 | break; | 443 | break; |
| 444 | case AUDIT_FILTERKEY: | ||
| 445 | /* ignore this field for filtering */ | ||
| 446 | result = 1; | ||
| 447 | break; | ||
| 448 | case AUDIT_PERM: | ||
| 449 | result = audit_match_perm(ctx, f->val); | ||
| 450 | break; | ||
| 351 | } | 451 | } |
| 352 | 452 | ||
| 353 | if (!result) | 453 | if (!result) |
| 354 | return 0; | 454 | return 0; |
| 355 | } | 455 | } |
| 456 | if (rule->filterkey) | ||
| 457 | ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); | ||
| 356 | switch (rule->action) { | 458 | switch (rule->action) { |
| 357 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | 459 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; |
| 358 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | 460 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; |
| @@ -467,7 +569,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, | |||
| 467 | context->return_valid = return_valid; | 569 | context->return_valid = return_valid; |
| 468 | context->return_code = return_code; | 570 | context->return_code = return_code; |
| 469 | 571 | ||
| 470 | if (context->in_syscall && !context->auditable) { | 572 | if (context->in_syscall && !context->dummy && !context->auditable) { |
| 471 | enum audit_state state; | 573 | enum audit_state state; |
| 472 | 574 | ||
| 473 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); | 575 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); |
| @@ -483,17 +585,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, | |||
| 483 | } | 585 | } |
| 484 | 586 | ||
| 485 | get_context: | 587 | get_context: |
| 486 | context->pid = tsk->pid; | 588 | |
| 487 | context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ | ||
| 488 | context->uid = tsk->uid; | ||
| 489 | context->gid = tsk->gid; | ||
| 490 | context->euid = tsk->euid; | ||
| 491 | context->suid = tsk->suid; | ||
| 492 | context->fsuid = tsk->fsuid; | ||
| 493 | context->egid = tsk->egid; | ||
| 494 | context->sgid = tsk->sgid; | ||
| 495 | context->fsgid = tsk->fsgid; | ||
| 496 | context->personality = tsk->personality; | ||
| 497 | tsk->audit_context = NULL; | 589 | tsk->audit_context = NULL; |
| 498 | return context; | 590 | return context; |
| 499 | } | 591 | } |
| @@ -627,6 +719,7 @@ static inline void audit_free_context(struct audit_context *context) | |||
| 627 | } | 719 | } |
| 628 | audit_free_names(context); | 720 | audit_free_names(context); |
| 629 | audit_free_aux(context); | 721 | audit_free_aux(context); |
| 722 | kfree(context->filterkey); | ||
| 630 | kfree(context); | 723 | kfree(context); |
| 631 | context = previous; | 724 | context = previous; |
| 632 | } while (context); | 725 | } while (context); |
| @@ -658,8 +751,7 @@ static void audit_log_task_context(struct audit_buffer *ab) | |||
| 658 | return; | 751 | return; |
| 659 | 752 | ||
| 660 | error_path: | 753 | error_path: |
| 661 | if (ctx) | 754 | kfree(ctx); |
| 662 | kfree(ctx); | ||
| 663 | audit_panic("error in audit_log_task_context"); | 755 | audit_panic("error in audit_log_task_context"); |
| 664 | return; | 756 | return; |
| 665 | } | 757 | } |
| @@ -702,6 +794,17 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 702 | const char *tty; | 794 | const char *tty; |
| 703 | 795 | ||
| 704 | /* tsk == current */ | 796 | /* tsk == current */ |
| 797 | context->pid = tsk->pid; | ||
| 798 | context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ | ||
| 799 | context->uid = tsk->uid; | ||
| 800 | context->gid = tsk->gid; | ||
| 801 | context->euid = tsk->euid; | ||
| 802 | context->suid = tsk->suid; | ||
| 803 | context->fsuid = tsk->fsuid; | ||
| 804 | context->egid = tsk->egid; | ||
| 805 | context->sgid = tsk->sgid; | ||
| 806 | context->fsgid = tsk->fsgid; | ||
| 807 | context->personality = tsk->personality; | ||
| 705 | 808 | ||
| 706 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); | 809 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); |
| 707 | if (!ab) | 810 | if (!ab) |
| @@ -714,6 +817,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 714 | audit_log_format(ab, " success=%s exit=%ld", | 817 | audit_log_format(ab, " success=%s exit=%ld", |
| 715 | (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", | 818 | (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", |
| 716 | context->return_code); | 819 | context->return_code); |
| 820 | |||
| 821 | mutex_lock(&tty_mutex); | ||
| 717 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | 822 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) |
| 718 | tty = tsk->signal->tty->name; | 823 | tty = tsk->signal->tty->name; |
| 719 | else | 824 | else |
| @@ -735,7 +840,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 735 | context->gid, | 840 | context->gid, |
| 736 | context->euid, context->suid, context->fsuid, | 841 | context->euid, context->suid, context->fsuid, |
| 737 | context->egid, context->sgid, context->fsgid, tty); | 842 | context->egid, context->sgid, context->fsgid, tty); |
| 843 | |||
| 844 | mutex_unlock(&tty_mutex); | ||
| 845 | |||
| 738 | audit_log_task_info(ab, tsk); | 846 | audit_log_task_info(ab, tsk); |
| 847 | if (context->filterkey) { | ||
| 848 | audit_log_format(ab, " key="); | ||
| 849 | audit_log_untrustedstring(ab, context->filterkey); | ||
| 850 | } else | ||
| 851 | audit_log_format(ab, " key=(null)"); | ||
| 739 | audit_log_end(ab); | 852 | audit_log_end(ab); |
| 740 | 853 | ||
| 741 | for (aux = context->aux; aux; aux = aux->next) { | 854 | for (aux = context->aux; aux; aux = aux->next) { |
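Aside on the filterkey changes in this file: the key follows a plain own-log-free lifecycle — kstrdup() a copy when a matching rule carries a key, emit it as "key=" in the exit record, and kfree() it when the context is torn down or reused. A minimal userspace sketch of that ownership pattern follows, assuming this simplification captures the intent; the demo_* names are illustrative, not kernel interfaces.

    /*
     * Illustrative userspace sketch of the filter-key lifecycle: duplicate
     * the rule's key when a rule matches, print it when the event is
     * logged, free it when the context is torn down.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct demo_context {
            char *filterkey;        /* owned copy, may be NULL */
    };

    static void demo_rule_match(struct demo_context *ctx, const char *rule_key)
    {
            if (rule_key)
                    ctx->filterkey = strdup(rule_key);   /* kstrdup() analogue */
    }

    static void demo_log_exit(const struct demo_context *ctx)
    {
            if (ctx->filterkey)
                    printf(" key=%s\n", ctx->filterkey);
            else
                    printf(" key=(null)\n");
    }

    static void demo_free_context(struct demo_context *ctx)
    {
            free(ctx->filterkey);                        /* kfree() analogue */
            ctx->filterkey = NULL;
    }

    int main(void)
    {
            struct demo_context ctx = { NULL };

            demo_rule_match(&ctx, "mount-watch");
            demo_log_exit(&ctx);
            demo_free_context(&ctx);
            return 0;
    }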
| @@ -790,7 +903,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 790 | if (axi->osid != 0) { | 903 | if (axi->osid != 0) { |
| 791 | char *ctx = NULL; | 904 | char *ctx = NULL; |
| 792 | u32 len; | 905 | u32 len; |
| 793 | if (selinux_ctxid_to_string( | 906 | if (selinux_sid_to_string( |
| 794 | axi->osid, &ctx, &len)) { | 907 | axi->osid, &ctx, &len)) { |
| 795 | audit_log_format(ab, " osid=%u", | 908 | audit_log_format(ab, " osid=%u", |
| 796 | axi->osid); | 909 | axi->osid); |
| @@ -897,7 +1010,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 897 | if (n->osid != 0) { | 1010 | if (n->osid != 0) { |
| 898 | char *ctx = NULL; | 1011 | char *ctx = NULL; |
| 899 | u32 len; | 1012 | u32 len; |
| 900 | if (selinux_ctxid_to_string( | 1013 | if (selinux_sid_to_string( |
| 901 | n->osid, &ctx, &len)) { | 1014 | n->osid, &ctx, &len)) { |
| 902 | audit_log_format(ab, " osid=%u", n->osid); | 1015 | audit_log_format(ab, " osid=%u", n->osid); |
| 903 | call_panic = 2; | 1016 | call_panic = 2; |
| @@ -1014,7 +1127,8 @@ void audit_syscall_entry(int arch, int major, | |||
| 1014 | context->argv[3] = a4; | 1127 | context->argv[3] = a4; |
| 1015 | 1128 | ||
| 1016 | state = context->state; | 1129 | state = context->state; |
| 1017 | if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) | 1130 | context->dummy = !audit_n_rules; |
| 1131 | if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) | ||
| 1018 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); | 1132 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); |
| 1019 | if (likely(state == AUDIT_DISABLED)) | 1133 | if (likely(state == AUDIT_DISABLED)) |
| 1020 | return; | 1134 | return; |
| @@ -1061,6 +1175,8 @@ void audit_syscall_exit(int valid, long return_code) | |||
| 1061 | } else { | 1175 | } else { |
| 1062 | audit_free_names(context); | 1176 | audit_free_names(context); |
| 1063 | audit_free_aux(context); | 1177 | audit_free_aux(context); |
| 1178 | kfree(context->filterkey); | ||
| 1179 | context->filterkey = NULL; | ||
| 1064 | tsk->audit_context = context; | 1180 | tsk->audit_context = context; |
| 1065 | } | 1181 | } |
| 1066 | } | 1182 | } |
| @@ -1145,14 +1261,18 @@ void audit_putname(const char *name) | |||
| 1145 | #endif | 1261 | #endif |
| 1146 | } | 1262 | } |
| 1147 | 1263 | ||
| 1148 | static void audit_inode_context(int idx, const struct inode *inode) | 1264 | /* Copy inode data into an audit_names. */ |
| 1265 | static void audit_copy_inode(struct audit_names *name, const struct inode *inode) | ||
| 1149 | { | 1266 | { |
| 1150 | struct audit_context *context = current->audit_context; | 1267 | name->ino = inode->i_ino; |
| 1151 | 1268 | name->dev = inode->i_sb->s_dev; | |
| 1152 | selinux_get_inode_sid(inode, &context->names[idx].osid); | 1269 | name->mode = inode->i_mode; |
| 1270 | name->uid = inode->i_uid; | ||
| 1271 | name->gid = inode->i_gid; | ||
| 1272 | name->rdev = inode->i_rdev; | ||
| 1273 | selinux_get_inode_sid(inode, &name->osid); | ||
| 1153 | } | 1274 | } |
| 1154 | 1275 | ||
| 1155 | |||
| 1156 | /** | 1276 | /** |
| 1157 | * audit_inode - store the inode and device from a lookup | 1277 | * audit_inode - store the inode and device from a lookup |
| 1158 | * @name: name being audited | 1278 | * @name: name being audited |
| @@ -1186,20 +1306,14 @@ void __audit_inode(const char *name, const struct inode *inode) | |||
| 1186 | ++context->ino_count; | 1306 | ++context->ino_count; |
| 1187 | #endif | 1307 | #endif |
| 1188 | } | 1308 | } |
| 1189 | context->names[idx].ino = inode->i_ino; | 1309 | audit_copy_inode(&context->names[idx], inode); |
| 1190 | context->names[idx].dev = inode->i_sb->s_dev; | ||
| 1191 | context->names[idx].mode = inode->i_mode; | ||
| 1192 | context->names[idx].uid = inode->i_uid; | ||
| 1193 | context->names[idx].gid = inode->i_gid; | ||
| 1194 | context->names[idx].rdev = inode->i_rdev; | ||
| 1195 | audit_inode_context(idx, inode); | ||
| 1196 | } | 1310 | } |
| 1197 | 1311 | ||
| 1198 | /** | 1312 | /** |
| 1199 | * audit_inode_child - collect inode info for created/removed objects | 1313 | * audit_inode_child - collect inode info for created/removed objects |
| 1200 | * @dname: inode's dentry name | 1314 | * @dname: inode's dentry name |
| 1201 | * @inode: inode being audited | 1315 | * @inode: inode being audited |
| 1202 | * @pino: inode number of dentry parent | 1316 | * @parent: inode of dentry parent |
| 1203 | * | 1317 | * |
| 1204 | * For syscalls that create or remove filesystem objects, audit_inode | 1318 | * For syscalls that create or remove filesystem objects, audit_inode |
| 1205 | * can only collect information for the filesystem object's parent. | 1319 | * can only collect information for the filesystem object's parent. |
| @@ -1210,7 +1324,7 @@ void __audit_inode(const char *name, const struct inode *inode) | |||
| 1210 | * unsuccessful attempts. | 1324 | * unsuccessful attempts. |
| 1211 | */ | 1325 | */ |
| 1212 | void __audit_inode_child(const char *dname, const struct inode *inode, | 1326 | void __audit_inode_child(const char *dname, const struct inode *inode, |
| 1213 | unsigned long pino) | 1327 | const struct inode *parent) |
| 1214 | { | 1328 | { |
| 1215 | int idx; | 1329 | int idx; |
| 1216 | struct audit_context *context = current->audit_context; | 1330 | struct audit_context *context = current->audit_context; |
| @@ -1224,7 +1338,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode, | |||
| 1224 | if (!dname) | 1338 | if (!dname) |
| 1225 | goto update_context; | 1339 | goto update_context; |
| 1226 | for (idx = 0; idx < context->name_count; idx++) | 1340 | for (idx = 0; idx < context->name_count; idx++) |
| 1227 | if (context->names[idx].ino == pino) { | 1341 | if (context->names[idx].ino == parent->i_ino) { |
| 1228 | const char *name = context->names[idx].name; | 1342 | const char *name = context->names[idx].name; |
| 1229 | 1343 | ||
| 1230 | if (!name) | 1344 | if (!name) |
| @@ -1248,16 +1362,47 @@ update_context: | |||
| 1248 | context->names[idx].name_len = AUDIT_NAME_FULL; | 1362 | context->names[idx].name_len = AUDIT_NAME_FULL; |
| 1249 | context->names[idx].name_put = 0; /* don't call __putname() */ | 1363 | context->names[idx].name_put = 0; /* don't call __putname() */ |
| 1250 | 1364 | ||
| 1251 | if (inode) { | 1365 | if (!inode) |
| 1252 | context->names[idx].ino = inode->i_ino; | 1366 | context->names[idx].ino = (unsigned long)-1; |
| 1253 | context->names[idx].dev = inode->i_sb->s_dev; | 1367 | else |
| 1254 | context->names[idx].mode = inode->i_mode; | 1368 | audit_copy_inode(&context->names[idx], inode); |
| 1255 | context->names[idx].uid = inode->i_uid; | 1369 | |
| 1256 | context->names[idx].gid = inode->i_gid; | 1370 | /* A parent was not found in audit_names, so copy the inode data for the |
| 1257 | context->names[idx].rdev = inode->i_rdev; | 1371 | * provided parent. */ |
| 1258 | audit_inode_context(idx, inode); | 1372 | if (!found_name) { |
| 1259 | } else | 1373 | idx = context->name_count++; |
| 1260 | context->names[idx].ino = (unsigned long)-1; | 1374 | #if AUDIT_DEBUG |
| 1375 | context->ino_count++; | ||
| 1376 | #endif | ||
| 1377 | audit_copy_inode(&context->names[idx], parent); | ||
| 1378 | } | ||
| 1379 | } | ||
| 1380 | |||
| 1381 | /** | ||
| 1382 | * audit_inode_update - update inode info for last collected name | ||
| 1383 | * @inode: inode being audited | ||
| 1384 | * | ||
| 1385 | * When open() is called on an existing object with the O_CREAT flag, the inode | ||
| 1386 | * data audit initially collects is incorrect. This additional hook ensures | ||
| 1387 | * audit has the inode data for the actual object to be opened. | ||
| 1388 | */ | ||
| 1389 | void __audit_inode_update(const struct inode *inode) | ||
| 1390 | { | ||
| 1391 | struct audit_context *context = current->audit_context; | ||
| 1392 | int idx; | ||
| 1393 | |||
| 1394 | if (!context->in_syscall || !inode) | ||
| 1395 | return; | ||
| 1396 | |||
| 1397 | if (context->name_count == 0) { | ||
| 1398 | context->name_count++; | ||
| 1399 | #if AUDIT_DEBUG | ||
| 1400 | context->ino_count++; | ||
| 1401 | #endif | ||
| 1402 | } | ||
| 1403 | idx = context->name_count - 1; | ||
| 1404 | |||
| 1405 | audit_copy_inode(&context->names[idx], inode); | ||
| 1261 | } | 1406 | } |
| 1262 | 1407 | ||
| 1263 | /** | 1408 | /** |
| @@ -1367,7 +1512,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) | |||
| 1367 | * @mqdes: MQ descriptor | 1512 | * @mqdes: MQ descriptor |
| 1368 | * @msg_len: Message length | 1513 | * @msg_len: Message length |
| 1369 | * @msg_prio: Message priority | 1514 | * @msg_prio: Message priority |
| 1370 | * @abs_timeout: Message timeout in absolute time | 1515 | * @u_abs_timeout: Message timeout in absolute time |
| 1371 | * | 1516 | * |
| 1372 | * Returns 0 for success or NULL context or < 0 on error. | 1517 | * Returns 0 for success or NULL context or < 0 on error. |
| 1373 | */ | 1518 | */ |
| @@ -1409,8 +1554,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, | |||
| 1409 | * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive | 1554 | * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive |
| 1410 | * @mqdes: MQ descriptor | 1555 | * @mqdes: MQ descriptor |
| 1411 | * @msg_len: Message length | 1556 | * @msg_len: Message length |
| 1412 | * @msg_prio: Message priority | 1557 | * @u_msg_prio: Message priority |
| 1413 | * @abs_timeout: Message timeout in absolute time | 1558 | * @u_abs_timeout: Message timeout in absolute time |
| 1414 | * | 1559 | * |
| 1415 | * Returns 0 for success or NULL context or < 0 on error. | 1560 | * Returns 0 for success or NULL context or < 0 on error. |
| 1416 | */ | 1561 | */ |
| @@ -1558,7 +1703,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
| 1558 | * @uid: msgq user id | 1703 | * @uid: msgq user id |
| 1559 | * @gid: msgq group id | 1704 | * @gid: msgq group id |
| 1560 | * @mode: msgq mode (permissions) | 1705 | * @mode: msgq mode (permissions) |
| 1561 | * @ipcp: in-kernel IPC permissions | ||
| 1562 | * | 1706 | * |
| 1563 | * Returns 0 for success or NULL context or < 0 on error. | 1707 | * Returns 0 for success or NULL context or < 0 on error. |
| 1564 | */ | 1708 | */ |
| @@ -1589,7 +1733,7 @@ int audit_bprm(struct linux_binprm *bprm) | |||
| 1589 | unsigned long p, next; | 1733 | unsigned long p, next; |
| 1590 | void *to; | 1734 | void *to; |
| 1591 | 1735 | ||
| 1592 | if (likely(!audit_enabled || !context)) | 1736 | if (likely(!audit_enabled || !context || context->dummy)) |
| 1593 | return 0; | 1737 | return 0; |
| 1594 | 1738 | ||
| 1595 | ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, | 1739 | ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, |
| @@ -1627,7 +1771,7 @@ int audit_socketcall(int nargs, unsigned long *args) | |||
| 1627 | struct audit_aux_data_socketcall *ax; | 1771 | struct audit_aux_data_socketcall *ax; |
| 1628 | struct audit_context *context = current->audit_context; | 1772 | struct audit_context *context = current->audit_context; |
| 1629 | 1773 | ||
| 1630 | if (likely(!context)) | 1774 | if (likely(!context || context->dummy)) |
| 1631 | return 0; | 1775 | return 0; |
| 1632 | 1776 | ||
| 1633 | ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); | 1777 | ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); |
| @@ -1655,7 +1799,7 @@ int audit_sockaddr(int len, void *a) | |||
| 1655 | struct audit_aux_data_sockaddr *ax; | 1799 | struct audit_aux_data_sockaddr *ax; |
| 1656 | struct audit_context *context = current->audit_context; | 1800 | struct audit_context *context = current->audit_context; |
| 1657 | 1801 | ||
| 1658 | if (likely(!context)) | 1802 | if (likely(!context || context->dummy)) |
| 1659 | return 0; | 1803 | return 0; |
| 1660 | 1804 | ||
| 1661 | ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); | 1805 | ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); |
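The context->dummy flag threaded through this file gates the per-syscall collectors on a single global rule count (audit_n_rules): if no rules are loaded when the syscall enters, aux records such as bprm, socketcall and sockaddr data are never allocated. A hedged userspace sketch of that short-circuit; demo_* names are illustrative only.

    /*
     * Userspace sketch of the "dummy context" optimisation: keep a global
     * rule count and let every collector bail out early when it is zero.
     */
    #include <stdio.h>

    static int demo_n_rules;                 /* analogue of audit_n_rules */

    struct demo_context {
            int dummy;                       /* no rules were loaded at entry */
    };

    static void demo_syscall_entry(struct demo_context *ctx)
    {
            ctx->dummy = !demo_n_rules;      /* snapshot once per syscall */
    }

    static int demo_collect_sockaddr(struct demo_context *ctx, const char *addr)
    {
            if (!ctx || ctx->dummy)          /* cheap exit on the hot path */
                    return 0;
            printf("collected sockaddr %s\n", addr);
            return 0;
    }

    int main(void)
    {
            struct demo_context ctx;

            demo_syscall_entry(&ctx);        /* demo_n_rules == 0: nothing collected */
            demo_collect_sockaddr(&ctx, "/tmp/sock");

            demo_n_rules = 1;                /* a rule is now loaded */
            demo_syscall_entry(&ctx);
            demo_collect_sockaddr(&ctx, "/tmp/sock");
            return 0;
    }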
diff --git a/kernel/capability.c b/kernel/capability.c index 1a4d8a40d3f9..edb845a6e84a 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
| @@ -46,7 +46,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) | |||
| 46 | int ret = 0; | 46 | int ret = 0; |
| 47 | pid_t pid; | 47 | pid_t pid; |
| 48 | __u32 version; | 48 | __u32 version; |
| 49 | task_t *target; | 49 | struct task_struct *target; |
| 50 | struct __user_cap_data_struct data; | 50 | struct __user_cap_data_struct data; |
| 51 | 51 | ||
| 52 | if (get_user(version, &header->version)) | 52 | if (get_user(version, &header->version)) |
| @@ -96,7 +96,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, | |||
| 96 | kernel_cap_t *inheritable, | 96 | kernel_cap_t *inheritable, |
| 97 | kernel_cap_t *permitted) | 97 | kernel_cap_t *permitted) |
| 98 | { | 98 | { |
| 99 | task_t *g, *target; | 99 | struct task_struct *g, *target; |
| 100 | int ret = -EPERM; | 100 | int ret = -EPERM; |
| 101 | int found = 0; | 101 | int found = 0; |
| 102 | 102 | ||
| @@ -128,12 +128,12 @@ static inline int cap_set_all(kernel_cap_t *effective, | |||
| 128 | kernel_cap_t *inheritable, | 128 | kernel_cap_t *inheritable, |
| 129 | kernel_cap_t *permitted) | 129 | kernel_cap_t *permitted) |
| 130 | { | 130 | { |
| 131 | task_t *g, *target; | 131 | struct task_struct *g, *target; |
| 132 | int ret = -EPERM; | 132 | int ret = -EPERM; |
| 133 | int found = 0; | 133 | int found = 0; |
| 134 | 134 | ||
| 135 | do_each_thread(g, target) { | 135 | do_each_thread(g, target) { |
| 136 | if (target == current || target->pid == 1) | 136 | if (target == current || is_init(target)) |
| 137 | continue; | 137 | continue; |
| 138 | found = 1; | 138 | found = 1; |
| 139 | if (security_capset_check(target, effective, inheritable, | 139 | if (security_capset_check(target, effective, inheritable, |
| @@ -172,7 +172,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) | |||
| 172 | { | 172 | { |
| 173 | kernel_cap_t inheritable, permitted, effective; | 173 | kernel_cap_t inheritable, permitted, effective; |
| 174 | __u32 version; | 174 | __u32 version; |
| 175 | task_t *target; | 175 | struct task_struct *target; |
| 176 | int ret; | 176 | int ret; |
| 177 | pid_t pid; | 177 | pid_t pid; |
| 178 | 178 | ||
diff --git a/kernel/compat.c b/kernel/compat.c index 2f672332430f..75573e5d27b0 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
| 23 | #include <linux/timex.h> | 23 | #include <linux/timex.h> |
| 24 | #include <linux/migrate.h> | 24 | #include <linux/migrate.h> |
| 25 | #include <linux/posix-timers.h> | ||
| 25 | 26 | ||
| 26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
| 27 | 28 | ||
| @@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock, | |||
| 601 | return err; | 602 | return err; |
| 602 | } | 603 | } |
| 603 | 604 | ||
| 605 | static long compat_clock_nanosleep_restart(struct restart_block *restart) | ||
| 606 | { | ||
| 607 | long err; | ||
| 608 | mm_segment_t oldfs; | ||
| 609 | struct timespec tu; | ||
| 610 | struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); | ||
| 611 | |||
| 612 | restart->arg1 = (unsigned long) &tu; | ||
| 613 | oldfs = get_fs(); | ||
| 614 | set_fs(KERNEL_DS); | ||
| 615 | err = clock_nanosleep_restart(restart); | ||
| 616 | set_fs(oldfs); | ||
| 617 | |||
| 618 | if ((err == -ERESTART_RESTARTBLOCK) && rmtp && | ||
| 619 | put_compat_timespec(&tu, rmtp)) | ||
| 620 | return -EFAULT; | ||
| 621 | |||
| 622 | if (err == -ERESTART_RESTARTBLOCK) { | ||
| 623 | restart->fn = compat_clock_nanosleep_restart; | ||
| 624 | restart->arg1 = (unsigned long) rmtp; | ||
| 625 | } | ||
| 626 | return err; | ||
| 627 | } | ||
| 628 | |||
| 604 | long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, | 629 | long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, |
| 605 | struct compat_timespec __user *rqtp, | 630 | struct compat_timespec __user *rqtp, |
| 606 | struct compat_timespec __user *rmtp) | 631 | struct compat_timespec __user *rmtp) |
| @@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, | |||
| 608 | long err; | 633 | long err; |
| 609 | mm_segment_t oldfs; | 634 | mm_segment_t oldfs; |
| 610 | struct timespec in, out; | 635 | struct timespec in, out; |
| 636 | struct restart_block *restart; | ||
| 611 | 637 | ||
| 612 | if (get_compat_timespec(&in, rqtp)) | 638 | if (get_compat_timespec(&in, rqtp)) |
| 613 | return -EFAULT; | 639 | return -EFAULT; |
| @@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, | |||
| 618 | (struct timespec __user *) &in, | 644 | (struct timespec __user *) &in, |
| 619 | (struct timespec __user *) &out); | 645 | (struct timespec __user *) &out); |
| 620 | set_fs(oldfs); | 646 | set_fs(oldfs); |
| 647 | |||
| 621 | if ((err == -ERESTART_RESTARTBLOCK) && rmtp && | 648 | if ((err == -ERESTART_RESTARTBLOCK) && rmtp && |
| 622 | put_compat_timespec(&out, rmtp)) | 649 | put_compat_timespec(&out, rmtp)) |
| 623 | return -EFAULT; | 650 | return -EFAULT; |
| 651 | |||
| 652 | if (err == -ERESTART_RESTARTBLOCK) { | ||
| 653 | restart = ¤t_thread_info()->restart_block; | ||
| 654 | restart->fn = compat_clock_nanosleep_restart; | ||
| 655 | restart->arg1 = (unsigned long) rmtp; | ||
| 656 | } | ||
| 624 | return err; | 657 | return err; |
| 625 | } | 658 | } |
| 626 | 659 | ||
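The compat nanosleep change above is an instance of the restart_block idiom: when the sleep is interrupted, the remaining time is copied back to the caller and a resume function plus its argument are parked in the thread's restart block so the syscall can be re-entered transparently. A simplified userspace sketch of the same stash-and-resume pattern (not the kernel API; demo_* names are illustrative):

    /*
     * Userspace analogue of the restart_block idiom: when an operation
     * cannot finish, record a resume function and its argument so the
     * caller can transparently retry.
     */
    #include <stdio.h>

    struct demo_restart_block {
            long (*fn)(struct demo_restart_block *);
            unsigned long arg0;              /* e.g. remaining nanoseconds */
    };

    static struct demo_restart_block demo_restart;   /* per-thread in the kernel */

    static long demo_sleep_restart(struct demo_restart_block *rb)
    {
            printf("resuming sleep for %lu ns\n", rb->arg0);
            return 0;
    }

    static long demo_sleep(unsigned long ns)
    {
            if (ns > 1000000) {              /* pretend we were interrupted */
                    demo_restart.fn = demo_sleep_restart;
                    demo_restart.arg0 = ns / 2;   /* time still left to sleep */
                    return -1;               /* caller should restart */
            }
            return 0;
    }

    int main(void)
    {
            if (demo_sleep(5000000) < 0)
                    demo_restart.fn(&demo_restart);  /* the "restarted syscall" */
            return 0;
    }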
| @@ -730,17 +763,10 @@ void | |||
| 730 | sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | 763 | sigset_from_compat (sigset_t *set, compat_sigset_t *compat) |
| 731 | { | 764 | { |
| 732 | switch (_NSIG_WORDS) { | 765 | switch (_NSIG_WORDS) { |
| 733 | #if defined (__COMPAT_ENDIAN_SWAP__) | ||
| 734 | case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); | ||
| 735 | case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); | ||
| 736 | case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); | ||
| 737 | case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); | ||
| 738 | #else | ||
| 739 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); | 766 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); |
| 740 | case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); | 767 | case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); |
| 741 | case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); | 768 | case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); |
| 742 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); | 769 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); |
| 743 | #endif | ||
| 744 | } | 770 | } |
| 745 | } | 771 | } |
| 746 | 772 | ||
diff --git a/kernel/configs.c b/kernel/configs.c index 009e1ebdcb88..f9e31974f4ad 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
| @@ -23,7 +23,6 @@ | |||
| 23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| 24 | */ | 24 | */ |
| 25 | 25 | ||
| 26 | #include <linux/config.h> | ||
| 27 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
| 28 | #include <linux/module.h> | 27 | #include <linux/module.h> |
| 29 | #include <linux/proc_fs.h> | 28 | #include <linux/proc_fs.h> |
diff --git a/kernel/cpu.c b/kernel/cpu.c index fe2b8d0bfe4c..32c96628463e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -13,66 +13,66 @@ | |||
| 13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
| 14 | #include <linux/kthread.h> | 14 | #include <linux/kthread.h> |
| 15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
| 16 | #include <asm/semaphore.h> | 16 | #include <linux/mutex.h> |
| 17 | 17 | ||
| 18 | /* This protects CPUs going up and down... */ | 18 | /* This protects CPUs going up and down... */ |
| 19 | static DECLARE_MUTEX(cpucontrol); | 19 | static DEFINE_MUTEX(cpu_add_remove_lock); |
| 20 | static DEFINE_MUTEX(cpu_bitmask_lock); | ||
| 20 | 21 | ||
| 21 | static BLOCKING_NOTIFIER_HEAD(cpu_chain); | 22 | static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); |
| 22 | 23 | ||
| 23 | #ifdef CONFIG_HOTPLUG_CPU | 24 | /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. |
| 24 | static struct task_struct *lock_cpu_hotplug_owner; | 25 | * Should always be manipulated under cpu_add_remove_lock |
| 25 | static int lock_cpu_hotplug_depth; | 26 | */ |
| 26 | 27 | static int cpu_hotplug_disabled; | |
| 27 | static int __lock_cpu_hotplug(int interruptible) | ||
| 28 | { | ||
| 29 | int ret = 0; | ||
| 30 | 28 | ||
| 31 | if (lock_cpu_hotplug_owner != current) { | 29 | #ifdef CONFIG_HOTPLUG_CPU |
| 32 | if (interruptible) | ||
| 33 | ret = down_interruptible(&cpucontrol); | ||
| 34 | else | ||
| 35 | down(&cpucontrol); | ||
| 36 | } | ||
| 37 | 30 | ||
| 38 | /* | 31 | /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ |
| 39 | * Set only if we succeed in locking | 32 | static struct task_struct *recursive; |
| 40 | */ | 33 | static int recursive_depth; |
| 41 | if (!ret) { | ||
| 42 | lock_cpu_hotplug_depth++; | ||
| 43 | lock_cpu_hotplug_owner = current; | ||
| 44 | } | ||
| 45 | |||
| 46 | return ret; | ||
| 47 | } | ||
| 48 | 34 | ||
| 49 | void lock_cpu_hotplug(void) | 35 | void lock_cpu_hotplug(void) |
| 50 | { | 36 | { |
| 51 | __lock_cpu_hotplug(0); | 37 | struct task_struct *tsk = current; |
| 38 | |||
| 39 | if (tsk == recursive) { | ||
| 40 | static int warnings = 10; | ||
| 41 | if (warnings) { | ||
| 42 | printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); | ||
| 43 | WARN_ON(1); | ||
| 44 | warnings--; | ||
| 45 | } | ||
| 46 | recursive_depth++; | ||
| 47 | return; | ||
| 48 | } | ||
| 49 | mutex_lock(&cpu_bitmask_lock); | ||
| 50 | recursive = tsk; | ||
| 52 | } | 51 | } |
| 53 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug); | 52 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug); |
| 54 | 53 | ||
| 55 | void unlock_cpu_hotplug(void) | 54 | void unlock_cpu_hotplug(void) |
| 56 | { | 55 | { |
| 57 | if (--lock_cpu_hotplug_depth == 0) { | 56 | WARN_ON(recursive != current); |
| 58 | lock_cpu_hotplug_owner = NULL; | 57 | if (recursive_depth) { |
| 59 | up(&cpucontrol); | 58 | recursive_depth--; |
| 59 | return; | ||
| 60 | } | 60 | } |
| 61 | mutex_unlock(&cpu_bitmask_lock); | ||
| 62 | recursive = NULL; | ||
| 61 | } | 63 | } |
| 62 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | 64 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); |
| 63 | 65 | ||
| 64 | int lock_cpu_hotplug_interruptible(void) | ||
| 65 | { | ||
| 66 | return __lock_cpu_hotplug(1); | ||
| 67 | } | ||
| 68 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); | ||
| 69 | #endif /* CONFIG_HOTPLUG_CPU */ | 66 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 70 | 67 | ||
| 71 | /* Need to know about CPUs going up/down? */ | 68 | /* Need to know about CPUs going up/down? */ |
| 72 | int register_cpu_notifier(struct notifier_block *nb) | 69 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
| 73 | { | 70 | { |
| 74 | return blocking_notifier_chain_register(&cpu_chain, nb); | 71 | return blocking_notifier_chain_register(&cpu_chain, nb); |
| 75 | } | 72 | } |
| 73 | |||
| 74 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 75 | |||
| 76 | EXPORT_SYMBOL(register_cpu_notifier); | 76 | EXPORT_SYMBOL(register_cpu_notifier); |
| 77 | 77 | ||
| 78 | void unregister_cpu_notifier(struct notifier_block *nb) | 78 | void unregister_cpu_notifier(struct notifier_block *nb) |
| @@ -81,7 +81,6 @@ void unregister_cpu_notifier(struct notifier_block *nb) | |||
| 81 | } | 81 | } |
| 82 | EXPORT_SYMBOL(unregister_cpu_notifier); | 82 | EXPORT_SYMBOL(unregister_cpu_notifier); |
| 83 | 83 | ||
| 84 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 85 | static inline void check_for_tasks(int cpu) | 84 | static inline void check_for_tasks(int cpu) |
| 86 | { | 85 | { |
| 87 | struct task_struct *p; | 86 | struct task_struct *p; |
| @@ -114,32 +113,25 @@ static int take_cpu_down(void *unused) | |||
| 114 | return 0; | 113 | return 0; |
| 115 | } | 114 | } |
| 116 | 115 | ||
| 117 | int cpu_down(unsigned int cpu) | 116 | /* Requires cpu_add_remove_lock to be held */ |
| 117 | static int _cpu_down(unsigned int cpu) | ||
| 118 | { | 118 | { |
| 119 | int err; | 119 | int err; |
| 120 | struct task_struct *p; | 120 | struct task_struct *p; |
| 121 | cpumask_t old_allowed, tmp; | 121 | cpumask_t old_allowed, tmp; |
| 122 | 122 | ||
| 123 | if ((err = lock_cpu_hotplug_interruptible()) != 0) | 123 | if (num_online_cpus() == 1) |
| 124 | return err; | 124 | return -EBUSY; |
| 125 | 125 | ||
| 126 | if (num_online_cpus() == 1) { | 126 | if (!cpu_online(cpu)) |
| 127 | err = -EBUSY; | 127 | return -EINVAL; |
| 128 | goto out; | ||
| 129 | } | ||
| 130 | |||
| 131 | if (!cpu_online(cpu)) { | ||
| 132 | err = -EINVAL; | ||
| 133 | goto out; | ||
| 134 | } | ||
| 135 | 128 | ||
| 136 | err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, | 129 | err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, |
| 137 | (void *)(long)cpu); | 130 | (void *)(long)cpu); |
| 138 | if (err == NOTIFY_BAD) { | 131 | if (err == NOTIFY_BAD) { |
| 139 | printk("%s: attempt to take down CPU %u failed\n", | 132 | printk("%s: attempt to take down CPU %u failed\n", |
| 140 | __FUNCTION__, cpu); | 133 | __FUNCTION__, cpu); |
| 141 | err = -EINVAL; | 134 | return -EINVAL; |
| 142 | goto out; | ||
| 143 | } | 135 | } |
| 144 | 136 | ||
| 145 | /* Ensure that we are not runnable on dying cpu */ | 137 | /* Ensure that we are not runnable on dying cpu */ |
| @@ -148,7 +140,10 @@ int cpu_down(unsigned int cpu) | |||
| 148 | cpu_clear(cpu, tmp); | 140 | cpu_clear(cpu, tmp); |
| 149 | set_cpus_allowed(current, tmp); | 141 | set_cpus_allowed(current, tmp); |
| 150 | 142 | ||
| 143 | mutex_lock(&cpu_bitmask_lock); | ||
| 151 | p = __stop_machine_run(take_cpu_down, NULL, cpu); | 144 | p = __stop_machine_run(take_cpu_down, NULL, cpu); |
| 145 | mutex_unlock(&cpu_bitmask_lock); | ||
| 146 | |||
| 152 | if (IS_ERR(p)) { | 147 | if (IS_ERR(p)) { |
| 153 | /* CPU didn't die: tell everyone. Can't complain. */ | 148 | /* CPU didn't die: tell everyone. Can't complain. */ |
| 154 | if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, | 149 | if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, |
| @@ -184,24 +179,32 @@ out_thread: | |||
| 184 | err = kthread_stop(p); | 179 | err = kthread_stop(p); |
| 185 | out_allowed: | 180 | out_allowed: |
| 186 | set_cpus_allowed(current, old_allowed); | 181 | set_cpus_allowed(current, old_allowed); |
| 187 | out: | 182 | return err; |
| 188 | unlock_cpu_hotplug(); | 183 | } |
| 184 | |||
| 185 | int cpu_down(unsigned int cpu) | ||
| 186 | { | ||
| 187 | int err = 0; | ||
| 188 | |||
| 189 | mutex_lock(&cpu_add_remove_lock); | ||
| 190 | if (cpu_hotplug_disabled) | ||
| 191 | err = -EBUSY; | ||
| 192 | else | ||
| 193 | err = _cpu_down(cpu); | ||
| 194 | |||
| 195 | mutex_unlock(&cpu_add_remove_lock); | ||
| 189 | return err; | 196 | return err; |
| 190 | } | 197 | } |
| 191 | #endif /*CONFIG_HOTPLUG_CPU*/ | 198 | #endif /*CONFIG_HOTPLUG_CPU*/ |
| 192 | 199 | ||
| 193 | int __devinit cpu_up(unsigned int cpu) | 200 | /* Requires cpu_add_remove_lock to be held */ |
| 201 | static int __devinit _cpu_up(unsigned int cpu) | ||
| 194 | { | 202 | { |
| 195 | int ret; | 203 | int ret; |
| 196 | void *hcpu = (void *)(long)cpu; | 204 | void *hcpu = (void *)(long)cpu; |
| 197 | 205 | ||
| 198 | if ((ret = lock_cpu_hotplug_interruptible()) != 0) | 206 | if (cpu_online(cpu) || !cpu_present(cpu)) |
| 199 | return ret; | 207 | return -EINVAL; |
| 200 | |||
| 201 | if (cpu_online(cpu) || !cpu_present(cpu)) { | ||
| 202 | ret = -EINVAL; | ||
| 203 | goto out; | ||
| 204 | } | ||
| 205 | 208 | ||
| 206 | ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); | 209 | ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); |
| 207 | if (ret == NOTIFY_BAD) { | 210 | if (ret == NOTIFY_BAD) { |
| @@ -212,7 +215,9 @@ int __devinit cpu_up(unsigned int cpu) | |||
| 212 | } | 215 | } |
| 213 | 216 | ||
| 214 | /* Arch-specific enabling code. */ | 217 | /* Arch-specific enabling code. */ |
| 218 | mutex_lock(&cpu_bitmask_lock); | ||
| 215 | ret = __cpu_up(cpu); | 219 | ret = __cpu_up(cpu); |
| 220 | mutex_unlock(&cpu_bitmask_lock); | ||
| 216 | if (ret != 0) | 221 | if (ret != 0) |
| 217 | goto out_notify; | 222 | goto out_notify; |
| 218 | BUG_ON(!cpu_online(cpu)); | 223 | BUG_ON(!cpu_online(cpu)); |
| @@ -224,7 +229,95 @@ out_notify: | |||
| 224 | if (ret != 0) | 229 | if (ret != 0) |
| 225 | blocking_notifier_call_chain(&cpu_chain, | 230 | blocking_notifier_call_chain(&cpu_chain, |
| 226 | CPU_UP_CANCELED, hcpu); | 231 | CPU_UP_CANCELED, hcpu); |
| 227 | out: | 232 | |
| 228 | unlock_cpu_hotplug(); | ||
| 229 | return ret; | 233 | return ret; |
| 230 | } | 234 | } |
| 235 | |||
| 236 | int __devinit cpu_up(unsigned int cpu) | ||
| 237 | { | ||
| 238 | int err = 0; | ||
| 239 | |||
| 240 | mutex_lock(&cpu_add_remove_lock); | ||
| 241 | if (cpu_hotplug_disabled) | ||
| 242 | err = -EBUSY; | ||
| 243 | else | ||
| 244 | err = _cpu_up(cpu); | ||
| 245 | |||
| 246 | mutex_unlock(&cpu_add_remove_lock); | ||
| 247 | return err; | ||
| 248 | } | ||
| 249 | |||
| 250 | #ifdef CONFIG_SUSPEND_SMP | ||
| 251 | static cpumask_t frozen_cpus; | ||
| 252 | |||
| 253 | int disable_nonboot_cpus(void) | ||
| 254 | { | ||
| 255 | int cpu, first_cpu, error; | ||
| 256 | |||
| 257 | mutex_lock(&cpu_add_remove_lock); | ||
| 258 | first_cpu = first_cpu(cpu_present_map); | ||
| 259 | if (!cpu_online(first_cpu)) { | ||
| 260 | error = _cpu_up(first_cpu); | ||
| 261 | if (error) { | ||
| 262 | printk(KERN_ERR "Could not bring CPU%d up.\n", | ||
| 263 | first_cpu); | ||
| 264 | goto out; | ||
| 265 | } | ||
| 266 | } | ||
| 267 | error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); | ||
| 268 | if (error) { | ||
| 269 | printk(KERN_ERR "Could not run on CPU%d\n", first_cpu); | ||
| 270 | goto out; | ||
| 271 | } | ||
| 272 | /* We take down all of the non-boot CPUs in one shot to avoid races | ||
| 273 | * with userspace trying to use CPU hotplug at the same time | ||
| 274 | */ | ||
| 275 | cpus_clear(frozen_cpus); | ||
| 276 | printk("Disabling non-boot CPUs ...\n"); | ||
| 277 | for_each_online_cpu(cpu) { | ||
| 278 | if (cpu == first_cpu) | ||
| 279 | continue; | ||
| 280 | error = _cpu_down(cpu); | ||
| 281 | if (!error) { | ||
| 282 | cpu_set(cpu, frozen_cpus); | ||
| 283 | printk("CPU%d is down\n", cpu); | ||
| 284 | } else { | ||
| 285 | printk(KERN_ERR "Error taking CPU%d down: %d\n", | ||
| 286 | cpu, error); | ||
| 287 | break; | ||
| 288 | } | ||
| 289 | } | ||
| 290 | if (!error) { | ||
| 291 | BUG_ON(num_online_cpus() > 1); | ||
| 292 | /* Make sure the CPUs won't be enabled by someone else */ | ||
| 293 | cpu_hotplug_disabled = 1; | ||
| 294 | } else { | ||
| 295 | printk(KERN_ERR "Non-boot CPUs are not disabled"); | ||
| 296 | } | ||
| 297 | out: | ||
| 298 | mutex_unlock(&cpu_add_remove_lock); | ||
| 299 | return error; | ||
| 300 | } | ||
| 301 | |||
| 302 | void enable_nonboot_cpus(void) | ||
| 303 | { | ||
| 304 | int cpu, error; | ||
| 305 | |||
| 306 | /* Allow everyone to use the CPU hotplug again */ | ||
| 307 | mutex_lock(&cpu_add_remove_lock); | ||
| 308 | cpu_hotplug_disabled = 0; | ||
| 309 | mutex_unlock(&cpu_add_remove_lock); | ||
| 310 | |||
| 311 | printk("Enabling non-boot CPUs ...\n"); | ||
| 312 | for_each_cpu_mask(cpu, frozen_cpus) { | ||
| 313 | error = cpu_up(cpu); | ||
| 314 | if (!error) { | ||
| 315 | printk("CPU%d is up\n", cpu); | ||
| 316 | continue; | ||
| 317 | } | ||
| 318 | printk(KERN_WARNING "Error taking CPU%d up: %d\n", | ||
| 319 | cpu, error); | ||
| 320 | } | ||
| 321 | cpus_clear(frozen_cpus); | ||
| 322 | } | ||
| 323 | #endif | ||
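disable_nonboot_cpus()/enable_nonboot_cpus() above implement a freeze/thaw pattern: record in frozen_cpus exactly which CPUs this path took down, block further hotplug with cpu_hotplug_disabled, and later bring back only the recorded set so CPUs that were already offline stay offline. A compact sketch of that record-and-restore idea; demo_* names are illustrative, not kernel interfaces.

    /*
     * Minimal sketch of the freeze/thaw pattern: remember exactly which
     * units were taken down and restore only those.
     */
    #include <stdio.h>

    #define DEMO_NCPUS 4

    static int demo_online[DEMO_NCPUS] = { 1, 1, 1, 1 };
    static int demo_frozen[DEMO_NCPUS];     /* analogue of frozen_cpus */
    static int demo_hotplug_disabled;

    static void demo_disable_nonboot(void)
    {
            int cpu;

            for (cpu = 1; cpu < DEMO_NCPUS; cpu++) {
                    if (!demo_online[cpu])
                            continue;       /* already off: leave it alone */
                    demo_online[cpu] = 0;
                    demo_frozen[cpu] = 1;   /* remember that *we* took it down */
                    printf("CPU%d is down\n", cpu);
            }
            demo_hotplug_disabled = 1;      /* block concurrent cpu_up() */
    }

    static void demo_enable_nonboot(void)
    {
            int cpu;

            demo_hotplug_disabled = 0;
            for (cpu = 1; cpu < DEMO_NCPUS; cpu++) {
                    if (!demo_frozen[cpu])
                            continue;       /* was off before suspend: stays off */
                    demo_online[cpu] = 1;
                    demo_frozen[cpu] = 0;
                    printf("CPU%d is up\n", cpu);
            }
    }

    int main(void)
    {
            demo_online[2] = 0;             /* CPU2 was already offline */
            demo_disable_nonboot();
            demo_enable_nonboot();          /* CPU2 is intentionally left offline */
            return 0;
    }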
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b602f73fb38d..8c3c400cce91 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -18,7 +18,6 @@ | |||
| 18 | * distribution for more details. | 18 | * distribution for more details. |
| 19 | */ | 19 | */ |
| 20 | 20 | ||
| 21 | #include <linux/config.h> | ||
| 22 | #include <linux/cpu.h> | 21 | #include <linux/cpu.h> |
| 23 | #include <linux/cpumask.h> | 22 | #include <linux/cpumask.h> |
| 24 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
| @@ -241,7 +240,7 @@ static struct super_block *cpuset_sb; | |||
| 241 | * A cpuset can only be deleted if both its 'count' of using tasks | 240 | * A cpuset can only be deleted if both its 'count' of using tasks |
| 242 | * is zero, and its list of 'children' cpusets is empty. Since all | 241 | * is zero, and its list of 'children' cpusets is empty. Since all |
| 243 | * tasks in the system use _some_ cpuset, and since there is always at | 242 | * tasks in the system use _some_ cpuset, and since there is always at |
| 244 | * least one task in the system (init, pid == 1), therefore, top_cpuset | 243 | * least one task in the system (init), therefore, top_cpuset |
| 245 | * always has either children cpusets and/or using tasks. So we don't | 244 | * always has either children cpusets and/or using tasks. So we don't |
| 246 | * need a special hack to ensure that top_cpuset cannot be deleted. | 245 | * need a special hack to ensure that top_cpuset cannot be deleted. |
| 247 | * | 246 | * |
| @@ -290,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode) | |||
| 290 | inode->i_mode = mode; | 289 | inode->i_mode = mode; |
| 291 | inode->i_uid = current->fsuid; | 290 | inode->i_uid = current->fsuid; |
| 292 | inode->i_gid = current->fsgid; | 291 | inode->i_gid = current->fsgid; |
| 293 | inode->i_blksize = PAGE_CACHE_SIZE; | ||
| 294 | inode->i_blocks = 0; | 292 | inode->i_blocks = 0; |
| 295 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 293 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
| 296 | inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info; | 294 | inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info; |
| @@ -763,6 +761,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 763 | * | 761 | * |
| 764 | * Call with manage_mutex held. May nest a call to the | 762 | * Call with manage_mutex held. May nest a call to the |
| 765 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 763 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
| 764 | * Must not be called holding callback_mutex, because we must | ||
| 765 | * not call lock_cpu_hotplug() while holding callback_mutex. | ||
| 766 | */ | 766 | */ |
| 767 | 767 | ||
| 768 | static void update_cpu_domains(struct cpuset *cur) | 768 | static void update_cpu_domains(struct cpuset *cur) |
| @@ -782,7 +782,7 @@ static void update_cpu_domains(struct cpuset *cur) | |||
| 782 | if (is_cpu_exclusive(c)) | 782 | if (is_cpu_exclusive(c)) |
| 783 | cpus_andnot(pspan, pspan, c->cpus_allowed); | 783 | cpus_andnot(pspan, pspan, c->cpus_allowed); |
| 784 | } | 784 | } |
| 785 | if (is_removed(cur) || !is_cpu_exclusive(cur)) { | 785 | if (!is_cpu_exclusive(cur)) { |
| 786 | cpus_or(pspan, pspan, cur->cpus_allowed); | 786 | cpus_or(pspan, pspan, cur->cpus_allowed); |
| 787 | if (cpus_equal(pspan, cur->cpus_allowed)) | 787 | if (cpus_equal(pspan, cur->cpus_allowed)) |
| 788 | return; | 788 | return; |
| @@ -815,6 +815,10 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 815 | struct cpuset trialcs; | 815 | struct cpuset trialcs; |
| 816 | int retval, cpus_unchanged; | 816 | int retval, cpus_unchanged; |
| 817 | 817 | ||
| 818 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ | ||
| 819 | if (cs == &top_cpuset) | ||
| 820 | return -EACCES; | ||
| 821 | |||
| 818 | trialcs = *cs; | 822 | trialcs = *cs; |
| 819 | retval = cpulist_parse(buf, trialcs.cpus_allowed); | 823 | retval = cpulist_parse(buf, trialcs.cpus_allowed); |
| 820 | if (retval < 0) | 824 | if (retval < 0) |
| @@ -908,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 908 | int fudge; | 912 | int fudge; |
| 909 | int retval; | 913 | int retval; |
| 910 | 914 | ||
| 915 | /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */ | ||
| 916 | if (cs == &top_cpuset) | ||
| 917 | return -EACCES; | ||
| 918 | |||
| 911 | trialcs = *cs; | 919 | trialcs = *cs; |
| 912 | retval = nodelist_parse(buf, trialcs.mems_allowed); | 920 | retval = nodelist_parse(buf, trialcs.mems_allowed); |
| 913 | if (retval < 0) | 921 | if (retval < 0) |
| @@ -1064,7 +1072,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
| 1064 | } | 1072 | } |
| 1065 | 1073 | ||
| 1066 | /* | 1074 | /* |
| 1067 | * Frequency meter - How fast is some event occuring? | 1075 | * Frequency meter - How fast is some event occurring? |
| 1068 | * | 1076 | * |
| 1069 | * These routines manage a digitally filtered, constant time based, | 1077 | * These routines manage a digitally filtered, constant time based, |
| 1070 | * event frequency meter. There are four routines: | 1078 | * event frequency meter. There are four routines: |
| @@ -1217,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 1217 | 1225 | ||
| 1218 | task_lock(tsk); | 1226 | task_lock(tsk); |
| 1219 | oldcs = tsk->cpuset; | 1227 | oldcs = tsk->cpuset; |
| 1220 | if (!oldcs) { | 1228 | /* |
| 1229 | * After getting 'oldcs' cpuset ptr, be sure still not exiting. | ||
| 1230 | * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack | ||
| 1231 | * then fail this attach_task(), to avoid breaking top_cpuset.count. | ||
| 1232 | */ | ||
| 1233 | if (tsk->flags & PF_EXITING) { | ||
| 1221 | task_unlock(tsk); | 1234 | task_unlock(tsk); |
| 1222 | mutex_unlock(&callback_mutex); | 1235 | mutex_unlock(&callback_mutex); |
| 1223 | put_task_struct(tsk); | 1236 | put_task_struct(tsk); |
| @@ -1918,6 +1931,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 1918 | return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); | 1931 | return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); |
| 1919 | } | 1932 | } |
| 1920 | 1933 | ||
| 1934 | /* | ||
| 1935 | * Locking note on the strange update_flag() call below: | ||
| 1936 | * | ||
| 1937 | * If the cpuset being removed is marked cpu_exclusive, then simulate | ||
| 1938 | * turning cpu_exclusive off, which will call update_cpu_domains(). | ||
| 1939 | * The lock_cpu_hotplug() call in update_cpu_domains() must not be | ||
| 1940 | * made while holding callback_mutex. Elsewhere the kernel nests | ||
| 1941 | * callback_mutex inside lock_cpu_hotplug() calls. So the reverse | ||
| 1942 | * nesting would risk an ABBA deadlock. | ||
| 1943 | */ | ||
| 1944 | |||
| 1921 | static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | 1945 | static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) |
| 1922 | { | 1946 | { |
| 1923 | struct cpuset *cs = dentry->d_fsdata; | 1947 | struct cpuset *cs = dentry->d_fsdata; |
| @@ -1937,11 +1961,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1937 | mutex_unlock(&manage_mutex); | 1961 | mutex_unlock(&manage_mutex); |
| 1938 | return -EBUSY; | 1962 | return -EBUSY; |
| 1939 | } | 1963 | } |
| 1964 | if (is_cpu_exclusive(cs)) { | ||
| 1965 | int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0"); | ||
| 1966 | if (retval < 0) { | ||
| 1967 | mutex_unlock(&manage_mutex); | ||
| 1968 | return retval; | ||
| 1969 | } | ||
| 1970 | } | ||
| 1940 | parent = cs->parent; | 1971 | parent = cs->parent; |
| 1941 | mutex_lock(&callback_mutex); | 1972 | mutex_lock(&callback_mutex); |
| 1942 | set_bit(CS_REMOVED, &cs->flags); | 1973 | set_bit(CS_REMOVED, &cs->flags); |
| 1943 | if (is_cpu_exclusive(cs)) | ||
| 1944 | update_cpu_domains(cs); | ||
| 1945 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1974 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
| 1946 | spin_lock(&cs->dentry->d_lock); | 1975 | spin_lock(&cs->dentry->d_lock); |
| 1947 | d = dget(cs->dentry); | 1976 | d = dget(cs->dentry); |
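The locking note and the reordered cpuset_rmdir() above encode an ABBA rule: elsewhere callback_mutex is taken inside lock_cpu_hotplug(), so rmdir must not nest them the other way around, and therefore drops cpu_exclusive (which needs the hotplug lock via update_cpu_domains()) before acquiring callback_mutex. A tiny pthread sketch of keeping one consistent lock order; demo_* names are illustrative:

    /*
     * Sketch of the ABBA rule: every path must take lock A (hotplug)
     * before lock B (callback).  The rmdir path does its A-needing work
     * first and only then takes B, so no thread ever holds B while
     * waiting for A.
     */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t demo_hotplug = PTHREAD_MUTEX_INITIALIZER;   /* "A" */
    static pthread_mutex_t demo_callback = PTHREAD_MUTEX_INITIALIZER;  /* "B" */

    static void demo_update_domains(void)   /* needs A, like update_cpu_domains() */
    {
            pthread_mutex_lock(&demo_hotplug);
            puts("rebuilding sched domains");
            pthread_mutex_unlock(&demo_hotplug);
    }

    static void demo_rmdir(void)
    {
            demo_update_domains();          /* A-needing work done *before* B */

            pthread_mutex_lock(&demo_callback);
            puts("removing cpuset");        /* holding only B here: no A inside B */
            pthread_mutex_unlock(&demo_callback);
    }

    int main(void)
    {
            demo_rmdir();
            return 0;
    }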
| @@ -2016,6 +2045,104 @@ out: | |||
| 2016 | return err; | 2045 | return err; |
| 2017 | } | 2046 | } |
| 2018 | 2047 | ||
| 2048 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG) | ||
| 2049 | /* | ||
| 2050 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | ||
| 2051 | * or memory nodes, we need to walk over the cpuset hierarchy, | ||
| 2052 | * removing that CPU or node from all cpusets. If this removes the | ||
| 2053 | * last CPU or node from a cpuset, then the guarantee_online_cpus() | ||
| 2054 | * or guarantee_online_mems() code will use that emptied cpusets | ||
| 2055 | * parent online CPUs or nodes. Cpusets that were already empty of | ||
| 2056 | * CPUs or nodes are left empty. | ||
| 2057 | * | ||
| 2058 | * This routine is intentionally inefficient in a couple of regards. | ||
| 2059 | * It will check all cpusets in a subtree even if the top cpuset of | ||
| 2060 | * the subtree has no offline CPUs or nodes. It checks both CPUs and | ||
| 2061 | * nodes, even though the caller could have been coded to know that | ||
| 2062 | * only one of CPUs or nodes needed to be checked on a given call. | ||
| 2063 | * This was done to minimize text size rather than cpu cycles. | ||
| 2064 | * | ||
| 2065 | * Call with both manage_mutex and callback_mutex held. | ||
| 2066 | * | ||
| 2067 | * Recursive, on depth of cpuset subtree. | ||
| 2068 | */ | ||
| 2069 | |||
| 2070 | static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) | ||
| 2071 | { | ||
| 2072 | struct cpuset *c; | ||
| 2073 | |||
| 2074 | /* Each of our child cpusets' mems must be online */ | ||
| 2075 | list_for_each_entry(c, &cur->children, sibling) { | ||
| 2076 | guarantee_online_cpus_mems_in_subtree(c); | ||
| 2077 | if (!cpus_empty(c->cpus_allowed)) | ||
| 2078 | guarantee_online_cpus(c, &c->cpus_allowed); | ||
| 2079 | if (!nodes_empty(c->mems_allowed)) | ||
| 2080 | guarantee_online_mems(c, &c->mems_allowed); | ||
| 2081 | } | ||
| 2082 | } | ||
| 2083 | |||
| 2084 | /* | ||
| 2085 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track | ||
| 2086 | * cpu_online_map and node_online_map. Force the top cpuset to track | ||
| 2087 | * what's online after any CPU or memory node hotplug or unplug event. | ||
| 2088 | * | ||
| 2089 | * To ensure that we don't remove a CPU or node from the top cpuset | ||
| 2090 | * that is currently in use by a child cpuset (which would violate | ||
| 2091 | * the rule that cpusets must be subsets of their parent), we first | ||
| 2092 | * call the recursive routine guarantee_online_cpus_mems_in_subtree(). | ||
| 2093 | * | ||
| 2094 | * Since there are two callers of this routine, one for CPU hotplug | ||
| 2095 | * events and one for memory node hotplug events, we could have coded | ||
| 2096 | * two separate routines here. We code it as a single common routine | ||
| 2097 | * in order to minimize text size. | ||
| 2098 | */ | ||
| 2099 | |||
| 2100 | static void common_cpu_mem_hotplug_unplug(void) | ||
| 2101 | { | ||
| 2102 | mutex_lock(&manage_mutex); | ||
| 2103 | mutex_lock(&callback_mutex); | ||
| 2104 | |||
| 2105 | guarantee_online_cpus_mems_in_subtree(&top_cpuset); | ||
| 2106 | top_cpuset.cpus_allowed = cpu_online_map; | ||
| 2107 | top_cpuset.mems_allowed = node_online_map; | ||
| 2108 | |||
| 2109 | mutex_unlock(&callback_mutex); | ||
| 2110 | mutex_unlock(&manage_mutex); | ||
| 2111 | } | ||
| 2112 | #endif | ||
| 2113 | |||
| 2114 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 2115 | /* | ||
| 2116 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | ||
| 2117 | * period. This is necessary in order to make cpusets transparent | ||
| 2118 | * (of no affect) on systems that are actively using CPU hotplug | ||
| 2119 | * but making no active use of cpusets. | ||
| 2120 | * | ||
| 2121 | * This routine ensures that top_cpuset.cpus_allowed tracks | ||
| 2122 | * cpu_online_map on each CPU hotplug (cpuhp) event. | ||
| 2123 | */ | ||
| 2124 | |||
| 2125 | static int cpuset_handle_cpuhp(struct notifier_block *nb, | ||
| 2126 | unsigned long phase, void *cpu) | ||
| 2127 | { | ||
| 2128 | common_cpu_mem_hotplug_unplug(); | ||
| 2129 | return 0; | ||
| 2130 | } | ||
| 2131 | #endif | ||
| 2132 | |||
| 2133 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 2134 | /* | ||
| 2135 | * Keep top_cpuset.mems_allowed tracking node_online_map. | ||
| 2136 | * Call this routine anytime after you change node_online_map. | ||
| 2137 | * See also the previous routine cpuset_handle_cpuhp(). | ||
| 2138 | */ | ||
| 2139 | |||
| 2140 | void cpuset_track_online_nodes() | ||
| 2141 | { | ||
| 2142 | common_cpu_mem_hotplug_unplug(); | ||
| 2143 | } | ||
| 2144 | #endif | ||
| 2145 | |||
| 2019 | /** | 2146 | /** |
| 2020 | * cpuset_init_smp - initialize cpus_allowed | 2147 | * cpuset_init_smp - initialize cpus_allowed |
| 2021 | * | 2148 | * |
| @@ -2026,6 +2153,8 @@ void __init cpuset_init_smp(void) | |||
| 2026 | { | 2153 | { |
| 2027 | top_cpuset.cpus_allowed = cpu_online_map; | 2154 | top_cpuset.cpus_allowed = cpu_online_map; |
| 2028 | top_cpuset.mems_allowed = node_online_map; | 2155 | top_cpuset.mems_allowed = node_online_map; |
| 2156 | |||
| 2157 | hotcpu_notifier(cpuset_handle_cpuhp, 0); | ||
| 2029 | } | 2158 | } |
| 2030 | 2159 | ||
| 2031 | /** | 2160 | /** |
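The hotplug handling added above always runs in two steps: first clamp every child cpuset onto whatever CPUs and memory nodes are still online (recursively, so parents stay supersets of children), then refresh top_cpuset from the authoritative online maps; cpuset_init_smp() wires this up through hotcpu_notifier(). A hedged sketch of that fix-the-children-then-refresh-the-root ordering, reduced to a single CPU bitmask; demo_* names are illustrative:

    /*
     * Sketch of the hotplug handling: on every hotplug event, first clamp
     * each child set onto what is still online (recursively), then
     * refresh the root from the authoritative online map.
     */
    #include <stdio.h>

    struct demo_set {
            unsigned int cpus;               /* bitmask of allowed CPUs */
            struct demo_set *child;          /* single child keeps the demo short */
    };

    static unsigned int demo_online = 0xf;   /* analogue of cpu_online_map */

    static void demo_clamp_subtree(struct demo_set *s)
    {
            if (s->child) {
                    demo_clamp_subtree(s->child);
                    if (s->child->cpus & demo_online)
                            s->child->cpus &= demo_online;
                    /* else: left as-is; readers fall back to the parent */
            }
    }

    static void demo_hotplug_event(struct demo_set *top)
    {
            demo_clamp_subtree(top);
            top->cpus = demo_online;         /* root always tracks what is online */
    }

    int main(void)
    {
            struct demo_set child = { 0xc, NULL };      /* CPUs 2-3 */
            struct demo_set top = { 0xf, &child };

            demo_online = 0x7;               /* CPU3 went away */
            demo_hotplug_event(&top);
            printf("top=%#x child=%#x\n", top.cpus, child.cpus);  /* 0x7, 0x4 */
            return 0;
    }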
| @@ -2195,7 +2324,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
| 2195 | int i; | 2324 | int i; |
| 2196 | 2325 | ||
| 2197 | for (i = 0; zl->zones[i]; i++) { | 2326 | for (i = 0; zl->zones[i]; i++) { |
| 2198 | int nid = zl->zones[i]->zone_pgdat->node_id; | 2327 | int nid = zone_to_nid(zl->zones[i]); |
| 2199 | 2328 | ||
| 2200 | if (node_isset(nid, current->mems_allowed)) | 2329 | if (node_isset(nid, current->mems_allowed)) |
| 2201 | return 1; | 2330 | return 1; |
| @@ -2266,9 +2395,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
| 2266 | const struct cpuset *cs; /* current cpuset ancestors */ | 2395 | const struct cpuset *cs; /* current cpuset ancestors */ |
| 2267 | int allowed; /* is allocation in zone z allowed? */ | 2396 | int allowed; /* is allocation in zone z allowed? */ |
| 2268 | 2397 | ||
| 2269 | if (in_interrupt()) | 2398 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) |
| 2270 | return 1; | 2399 | return 1; |
| 2271 | node = z->zone_pgdat->node_id; | 2400 | node = zone_to_nid(z); |
| 2272 | might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); | 2401 | might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); |
| 2273 | if (node_isset(node, current->mems_allowed)) | 2402 | if (node_isset(node, current->mems_allowed)) |
| 2274 | return 1; | 2403 | return 1; |
| @@ -2370,7 +2499,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); | |||
| 2370 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 2499 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
| 2371 | { | 2500 | { |
| 2372 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ | 2501 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
| 2373 | int overlap = 0; /* do cpusets overlap? */ | 2502 | int overlap = 1; /* do cpusets overlap? */ |
| 2374 | 2503 | ||
| 2375 | task_lock(current); | 2504 | task_lock(current); |
| 2376 | if (current->flags & PF_EXITING) { | 2505 | if (current->flags & PF_EXITING) { |
| @@ -2442,31 +2571,43 @@ void __cpuset_memory_pressure_bump(void) | |||
| 2442 | */ | 2571 | */ |
| 2443 | static int proc_cpuset_show(struct seq_file *m, void *v) | 2572 | static int proc_cpuset_show(struct seq_file *m, void *v) |
| 2444 | { | 2573 | { |
| 2574 | struct pid *pid; | ||
| 2445 | struct task_struct *tsk; | 2575 | struct task_struct *tsk; |
| 2446 | char *buf; | 2576 | char *buf; |
| 2447 | int retval = 0; | 2577 | int retval; |
| 2448 | 2578 | ||
| 2579 | retval = -ENOMEM; | ||
| 2449 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | 2580 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
| 2450 | if (!buf) | 2581 | if (!buf) |
| 2451 | return -ENOMEM; | 2582 | goto out; |
| 2583 | |||
| 2584 | retval = -ESRCH; | ||
| 2585 | pid = m->private; | ||
| 2586 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
| 2587 | if (!tsk) | ||
| 2588 | goto out_free; | ||
| 2452 | 2589 | ||
| 2453 | tsk = m->private; | 2590 | retval = -EINVAL; |
| 2454 | mutex_lock(&manage_mutex); | 2591 | mutex_lock(&manage_mutex); |
| 2592 | |||
| 2455 | retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); | 2593 | retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); |
| 2456 | if (retval < 0) | 2594 | if (retval < 0) |
| 2457 | goto out; | 2595 | goto out_unlock; |
| 2458 | seq_puts(m, buf); | 2596 | seq_puts(m, buf); |
| 2459 | seq_putc(m, '\n'); | 2597 | seq_putc(m, '\n'); |
| 2460 | out: | 2598 | out_unlock: |
| 2461 | mutex_unlock(&manage_mutex); | 2599 | mutex_unlock(&manage_mutex); |
| 2600 | put_task_struct(tsk); | ||
| 2601 | out_free: | ||
| 2462 | kfree(buf); | 2602 | kfree(buf); |
| 2603 | out: | ||
| 2463 | return retval; | 2604 | return retval; |
| 2464 | } | 2605 | } |
| 2465 | 2606 | ||
| 2466 | static int cpuset_open(struct inode *inode, struct file *file) | 2607 | static int cpuset_open(struct inode *inode, struct file *file) |
| 2467 | { | 2608 | { |
| 2468 | struct task_struct *tsk = PROC_I(inode)->task; | 2609 | struct pid *pid = PROC_I(inode)->pid; |
| 2469 | return single_open(file, proc_cpuset_show, tsk); | 2610 | return single_open(file, proc_cpuset_show, pid); |
| 2470 | } | 2611 | } |
| 2471 | 2612 | ||
| 2472 | struct file_operations proc_cpuset_operations = { | 2613 | struct file_operations proc_cpuset_operations = { |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c new file mode 100644 index 000000000000..36752f124c6a --- /dev/null +++ b/kernel/delayacct.c | |||
| @@ -0,0 +1,162 @@ | |||
| 1 | /* delayacct.c - per-task delay accounting | ||
| 2 | * | ||
| 3 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify | ||
| 6 | * it under the terms of the GNU General Public License as published by | ||
| 7 | * the Free Software Foundation; either version 2 of the License, or | ||
| 8 | * (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it would be useful, but | ||
| 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
| 13 | * the GNU General Public License for more details. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/sched.h> | ||
| 17 | #include <linux/slab.h> | ||
| 18 | #include <linux/time.h> | ||
| 19 | #include <linux/sysctl.h> | ||
| 20 | #include <linux/delayacct.h> | ||
| 21 | |||
| 22 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ | ||
| 23 | kmem_cache_t *delayacct_cache; | ||
| 24 | |||
| 25 | static int __init delayacct_setup_disable(char *str) | ||
| 26 | { | ||
| 27 | delayacct_on = 0; | ||
| 28 | return 1; | ||
| 29 | } | ||
| 30 | __setup("nodelayacct", delayacct_setup_disable); | ||
| 31 | |||
| 32 | void delayacct_init(void) | ||
| 33 | { | ||
| 34 | delayacct_cache = kmem_cache_create("delayacct_cache", | ||
| 35 | sizeof(struct task_delay_info), | ||
| 36 | 0, | ||
| 37 | SLAB_PANIC, | ||
| 38 | NULL, NULL); | ||
| 39 | delayacct_tsk_init(&init_task); | ||
| 40 | } | ||
| 41 | |||
| 42 | void __delayacct_tsk_init(struct task_struct *tsk) | ||
| 43 | { | ||
| 44 | tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); | ||
| 45 | if (tsk->delays) | ||
| 46 | spin_lock_init(&tsk->delays->lock); | ||
| 47 | } | ||
| 48 | |||
| 49 | /* | ||
| 50 | * Start accounting for a delay statistic using | ||
| 51 | * its starting timestamp (@start) | ||
| 52 | */ | ||
| 53 | |||
| 54 | static inline void delayacct_start(struct timespec *start) | ||
| 55 | { | ||
| 56 | do_posix_clock_monotonic_gettime(start); | ||
| 57 | } | ||
| 58 | |||
| 59 | /* | ||
| 60 | * Finish delay accounting for a statistic using | ||
| 61 | * its timestamps (@start, @end), accumulator (@total) and @count | ||
| 62 | */ | ||
| 63 | |||
| 64 | static void delayacct_end(struct timespec *start, struct timespec *end, | ||
| 65 | u64 *total, u32 *count) | ||
| 66 | { | ||
| 67 | struct timespec ts; | ||
| 68 | s64 ns; | ||
| 69 | |||
| 70 | do_posix_clock_monotonic_gettime(end); | ||
| 71 | ts = timespec_sub(*end, *start); | ||
| 72 | ns = timespec_to_ns(&ts); | ||
| 73 | if (ns < 0) | ||
| 74 | return; | ||
| 75 | |||
| 76 | spin_lock(¤t->delays->lock); | ||
| 77 | *total += ns; | ||
| 78 | (*count)++; | ||
| 79 | spin_unlock(¤t->delays->lock); | ||
| 80 | } | ||
| 81 | |||
| 82 | void __delayacct_blkio_start(void) | ||
| 83 | { | ||
| 84 | delayacct_start(¤t->delays->blkio_start); | ||
| 85 | } | ||
| 86 | |||
| 87 | void __delayacct_blkio_end(void) | ||
| 88 | { | ||
| 89 | if (current->delays->flags & DELAYACCT_PF_SWAPIN) | ||
| 90 | /* Swapin block I/O */ | ||
| 91 | delayacct_end(¤t->delays->blkio_start, | ||
| 92 | ¤t->delays->blkio_end, | ||
| 93 | ¤t->delays->swapin_delay, | ||
| 94 | ¤t->delays->swapin_count); | ||
| 95 | else /* Other block I/O */ | ||
| 96 | delayacct_end(¤t->delays->blkio_start, | ||
| 97 | ¤t->delays->blkio_end, | ||
| 98 | ¤t->delays->blkio_delay, | ||
| 99 | ¤t->delays->blkio_count); | ||
| 100 | } | ||
| 101 | |||
| 102 | int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | ||
| 103 | { | ||
| 104 | s64 tmp; | ||
| 105 | struct timespec ts; | ||
| 106 | unsigned long t1,t2,t3; | ||
| 107 | |||
| 108 | /* Though tsk->delays is accessed later, an early exit avoids | ||
| 109 | * unnecessarily returning other data | ||
| 110 | */ | ||
| 111 | if (!tsk->delays) | ||
| 112 | goto done; | ||
| 113 | |||
| 114 | tmp = (s64)d->cpu_run_real_total; | ||
| 115 | cputime_to_timespec(tsk->utime + tsk->stime, &ts); | ||
| 116 | tmp += timespec_to_ns(&ts); | ||
| 117 | d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; | ||
| 118 | |||
| 119 | /* | ||
| 120 | * No locking available for sched_info (and too expensive to add one) | ||
| 121 | * Mitigate by taking a snapshot of the values | ||
| 122 | */ | ||
| 123 | t1 = tsk->sched_info.pcnt; | ||
| 124 | t2 = tsk->sched_info.run_delay; | ||
| 125 | t3 = tsk->sched_info.cpu_time; | ||
| 126 | |||
| 127 | d->cpu_count += t1; | ||
| 128 | |||
| 129 | jiffies_to_timespec(t2, &ts); | ||
| 130 | tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); | ||
| 131 | d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; | ||
| 132 | |||
| 133 | tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; | ||
| 134 | d->cpu_run_virtual_total = | ||
| 135 | (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; | ||
| 136 | |||
| 137 | /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ | ||
| 138 | |||
| 139 | spin_lock(&tsk->delays->lock); | ||
| 140 | tmp = d->blkio_delay_total + tsk->delays->blkio_delay; | ||
| 141 | d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; | ||
| 142 | tmp = d->swapin_delay_total + tsk->delays->swapin_delay; | ||
| 143 | d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; | ||
| 144 | d->blkio_count += tsk->delays->blkio_count; | ||
| 145 | d->swapin_count += tsk->delays->swapin_count; | ||
| 146 | spin_unlock(&tsk->delays->lock); | ||
| 147 | |||
| 148 | done: | ||
| 149 | return 0; | ||
| 150 | } | ||
| 151 | |||
| 152 | __u64 __delayacct_blkio_ticks(struct task_struct *tsk) | ||
| 153 | { | ||
| 154 | __u64 ret; | ||
| 155 | |||
| 156 | spin_lock(&tsk->delays->lock); | ||
| 157 | ret = nsec_to_clock_t(tsk->delays->blkio_delay + | ||
| 158 | tsk->delays->swapin_delay); | ||
| 159 | spin_unlock(&tsk->delays->lock); | ||
| 160 | return ret; | ||
| 161 | } | ||
| 162 | |||
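For context on the start/end pair introduced above: a caller brackets a blocking wait with the two hooks, and the elapsed monotonic time is folded into current->delays. The sketch below is illustrative only; wait_for_io_example() and its completion argument are not part of this patch, and delayacct_blkio_start()/delayacct_blkio_end() are assumed to be the usual <linux/delayacct.h> wrappers that check delayacct_on and current->delays before calling the __delayacct_* helpers.

        #include <linux/completion.h>
        #include <linux/delayacct.h>

        /* Hypothetical caller, for illustration only. */
        static void wait_for_io_example(struct completion *io_done)
        {
                delayacct_blkio_start();        /* record the monotonic start timestamp */
                wait_for_completion(io_done);   /* the actual blocking wait */
                delayacct_blkio_end();          /* accumulate the elapsed time into current->delays */
        }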
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c01cead2cfd6..3c2eaea66b1e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
| @@ -7,7 +7,6 @@ | |||
| 7 | * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) | 7 | * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) |
| 8 | */ | 8 | */ |
| 9 | 9 | ||
| 10 | #include <linux/config.h> | ||
| 11 | #include <linux/init.h> | 10 | #include <linux/init.h> |
| 12 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
| 13 | #include <linux/kmod.h> | 12 | #include <linux/kmod.h> |
diff --git a/kernel/exit.c b/kernel/exit.c index a3baf92462bd..2e4c13cba95a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -4,7 +4,6 @@ | |||
| 4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
| 5 | */ | 5 | */ |
| 6 | 6 | ||
| 7 | #include <linux/config.h> | ||
| 8 | #include <linux/mm.h> | 7 | #include <linux/mm.h> |
| 9 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
| 10 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
| @@ -26,6 +25,8 @@ | |||
| 26 | #include <linux/mount.h> | 25 | #include <linux/mount.h> |
| 27 | #include <linux/proc_fs.h> | 26 | #include <linux/proc_fs.h> |
| 28 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
| 28 | #include <linux/taskstats_kern.h> | ||
| 29 | #include <linux/delayacct.h> | ||
| 29 | #include <linux/cpuset.h> | 30 | #include <linux/cpuset.h> |
| 30 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
| 31 | #include <linux/signal.h> | 32 | #include <linux/signal.h> |
| @@ -36,6 +37,7 @@ | |||
| 36 | #include <linux/compat.h> | 37 | #include <linux/compat.h> |
| 37 | #include <linux/pipe_fs_i.h> | 38 | #include <linux/pipe_fs_i.h> |
| 38 | #include <linux/audit.h> /* for audit_free() */ | 39 | #include <linux/audit.h> /* for audit_free() */ |
| 40 | #include <linux/resource.h> | ||
| 39 | 41 | ||
| 40 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
| 41 | #include <asm/unistd.h> | 43 | #include <asm/unistd.h> |
| @@ -45,8 +47,6 @@ | |||
| 45 | extern void sem_exit (void); | 47 | extern void sem_exit (void); |
| 46 | extern struct task_struct *child_reaper; | 48 | extern struct task_struct *child_reaper; |
| 47 | 49 | ||
| 48 | int getrusage(struct task_struct *, int, struct rusage __user *); | ||
| 49 | |||
| 50 | static void exit_mm(struct task_struct * tsk); | 50 | static void exit_mm(struct task_struct * tsk); |
| 51 | 51 | ||
| 52 | static void __unhash_process(struct task_struct *p) | 52 | static void __unhash_process(struct task_struct *p) |
| @@ -136,14 +136,10 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
| 136 | 136 | ||
| 137 | void release_task(struct task_struct * p) | 137 | void release_task(struct task_struct * p) |
| 138 | { | 138 | { |
| 139 | struct task_struct *leader; | ||
| 139 | int zap_leader; | 140 | int zap_leader; |
| 140 | task_t *leader; | ||
| 141 | struct dentry *proc_dentry; | ||
| 142 | |||
| 143 | repeat: | 141 | repeat: |
| 144 | atomic_dec(&p->user->processes); | 142 | atomic_dec(&p->user->processes); |
| 145 | spin_lock(&p->proc_lock); | ||
| 146 | proc_dentry = proc_pid_unhash(p); | ||
| 147 | write_lock_irq(&tasklist_lock); | 143 | write_lock_irq(&tasklist_lock); |
| 148 | ptrace_unlink(p); | 144 | ptrace_unlink(p); |
| 149 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); | 145 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); |
| @@ -172,8 +168,7 @@ repeat: | |||
| 172 | 168 | ||
| 173 | sched_exit(p); | 169 | sched_exit(p); |
| 174 | write_unlock_irq(&tasklist_lock); | 170 | write_unlock_irq(&tasklist_lock); |
| 175 | spin_unlock(&p->proc_lock); | 171 | proc_flush_task(p); |
| 176 | proc_pid_flush(proc_dentry); | ||
| 177 | release_thread(p); | 172 | release_thread(p); |
| 178 | call_rcu(&p->rcu, delayed_put_task_struct); | 173 | call_rcu(&p->rcu, delayed_put_task_struct); |
| 179 | 174 | ||
| @@ -216,7 +211,7 @@ out: | |||
| 216 | * | 211 | * |
| 217 | * "I ask you, have you ever known what it is to be an orphan?" | 212 | * "I ask you, have you ever known what it is to be an orphan?" |
| 218 | */ | 213 | */ |
| 219 | static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) | 214 | static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) |
| 220 | { | 215 | { |
| 221 | struct task_struct *p; | 216 | struct task_struct *p; |
| 222 | int ret = 1; | 217 | int ret = 1; |
| @@ -224,7 +219,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) | |||
| 224 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { | 219 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { |
| 225 | if (p == ignored_task | 220 | if (p == ignored_task |
| 226 | || p->exit_state | 221 | || p->exit_state |
| 227 | || p->real_parent->pid == 1) | 222 | || is_init(p->real_parent)) |
| 228 | continue; | 223 | continue; |
| 229 | if (process_group(p->real_parent) != pgrp | 224 | if (process_group(p->real_parent) != pgrp |
| 230 | && p->real_parent->signal->session == p->signal->session) { | 225 | && p->real_parent->signal->session == p->signal->session) { |
| @@ -254,17 +249,6 @@ static int has_stopped_jobs(int pgrp) | |||
| 254 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { | 249 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { |
| 255 | if (p->state != TASK_STOPPED) | 250 | if (p->state != TASK_STOPPED) |
| 256 | continue; | 251 | continue; |
| 257 | |||
| 258 | /* If p is stopped by a debugger on a signal that won't | ||
| 259 | stop it, then don't count p as stopped. This isn't | ||
| 260 | perfect but it's a good approximation. */ | ||
| 261 | if (unlikely (p->ptrace) | ||
| 262 | && p->exit_code != SIGSTOP | ||
| 263 | && p->exit_code != SIGTSTP | ||
| 264 | && p->exit_code != SIGTTOU | ||
| 265 | && p->exit_code != SIGTTIN) | ||
| 266 | continue; | ||
| 267 | |||
| 268 | retval = 1; | 252 | retval = 1; |
| 269 | break; | 253 | break; |
| 270 | } while_each_task_pid(pgrp, PIDTYPE_PGID, p); | 254 | } while_each_task_pid(pgrp, PIDTYPE_PGID, p); |
| @@ -297,9 +281,7 @@ static void reparent_to_init(void) | |||
| 297 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | 281 | /* Set the exit signal to SIGCHLD so we signal init on exit */ |
| 298 | current->exit_signal = SIGCHLD; | 282 | current->exit_signal = SIGCHLD; |
| 299 | 283 | ||
| 300 | if ((current->policy == SCHED_NORMAL || | 284 | if (!has_rt_policy(current) && (task_nice(current) < 0)) |
| 301 | current->policy == SCHED_BATCH) | ||
| 302 | && (task_nice(current) < 0)) | ||
| 303 | set_user_nice(current, 0); | 285 | set_user_nice(current, 0); |
| 304 | /* cpus_allowed? */ | 286 | /* cpus_allowed? */ |
| 305 | /* rt_priority? */ | 287 | /* rt_priority? */ |
| @@ -492,6 +474,18 @@ void fastcall put_files_struct(struct files_struct *files) | |||
| 492 | 474 | ||
| 493 | EXPORT_SYMBOL(put_files_struct); | 475 | EXPORT_SYMBOL(put_files_struct); |
| 494 | 476 | ||
| 477 | void reset_files_struct(struct task_struct *tsk, struct files_struct *files) | ||
| 478 | { | ||
| 479 | struct files_struct *old; | ||
| 480 | |||
| 481 | old = tsk->files; | ||
| 482 | task_lock(tsk); | ||
| 483 | tsk->files = files; | ||
| 484 | task_unlock(tsk); | ||
| 485 | put_files_struct(old); | ||
| 486 | } | ||
| 487 | EXPORT_SYMBOL(reset_files_struct); | ||
| 488 | |||
| 495 | static inline void __exit_files(struct task_struct *tsk) | 489 | static inline void __exit_files(struct task_struct *tsk) |
| 496 | { | 490 | { |
| 497 | struct files_struct * files = tsk->files; | 491 | struct files_struct * files = tsk->files; |
| @@ -589,7 +583,8 @@ static void exit_mm(struct task_struct * tsk) | |||
| 589 | mmput(mm); | 583 | mmput(mm); |
| 590 | } | 584 | } |
| 591 | 585 | ||
| 592 | static inline void choose_new_parent(task_t *p, task_t *reaper) | 586 | static inline void |
| 587 | choose_new_parent(struct task_struct *p, struct task_struct *reaper) | ||
| 593 | { | 588 | { |
| 594 | /* | 589 | /* |
| 595 | * Make sure we're not reparenting to ourselves and that | 590 | * Make sure we're not reparenting to ourselves and that |
| @@ -599,7 +594,8 @@ static inline void choose_new_parent(task_t *p, task_t *reaper) | |||
| 599 | p->real_parent = reaper; | 594 | p->real_parent = reaper; |
| 600 | } | 595 | } |
| 601 | 596 | ||
| 602 | static void reparent_thread(task_t *p, task_t *father, int traced) | 597 | static void |
| 598 | reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | ||
| 603 | { | 599 | { |
| 604 | /* We don't want people slaying init. */ | 600 | /* We don't want people slaying init. */ |
| 605 | if (p->exit_signal != -1) | 601 | if (p->exit_signal != -1) |
| @@ -663,8 +659,8 @@ static void reparent_thread(task_t *p, task_t *father, int traced) | |||
| 663 | * group, and if no such member exists, give it to | 659 | * group, and if no such member exists, give it to |
| 664 | * the global child reaper process (ie "init") | 660 | * the global child reaper process (ie "init") |
| 665 | */ | 661 | */ |
| 666 | static void forget_original_parent(struct task_struct * father, | 662 | static void |
| 667 | struct list_head *to_release) | 663 | forget_original_parent(struct task_struct *father, struct list_head *to_release) |
| 668 | { | 664 | { |
| 669 | struct task_struct *p, *reaper = father; | 665 | struct task_struct *p, *reaper = father; |
| 670 | struct list_head *_p, *_n; | 666 | struct list_head *_p, *_n; |
| @@ -687,7 +683,7 @@ static void forget_original_parent(struct task_struct * father, | |||
| 687 | */ | 683 | */ |
| 688 | list_for_each_safe(_p, _n, &father->children) { | 684 | list_for_each_safe(_p, _n, &father->children) { |
| 689 | int ptrace; | 685 | int ptrace; |
| 690 | p = list_entry(_p,struct task_struct,sibling); | 686 | p = list_entry(_p, struct task_struct, sibling); |
| 691 | 687 | ||
| 692 | ptrace = p->ptrace; | 688 | ptrace = p->ptrace; |
| 693 | 689 | ||
| @@ -716,7 +712,7 @@ static void forget_original_parent(struct task_struct * father, | |||
| 716 | list_add(&p->ptrace_list, to_release); | 712 | list_add(&p->ptrace_list, to_release); |
| 717 | } | 713 | } |
| 718 | list_for_each_safe(_p, _n, &father->ptrace_children) { | 714 | list_for_each_safe(_p, _n, &father->ptrace_children) { |
| 719 | p = list_entry(_p,struct task_struct,ptrace_list); | 715 | p = list_entry(_p, struct task_struct, ptrace_list); |
| 720 | choose_new_parent(p, reaper); | 716 | choose_new_parent(p, reaper); |
| 721 | reparent_thread(p, father, 1); | 717 | reparent_thread(p, father, 1); |
| 722 | } | 718 | } |
| @@ -836,7 +832,7 @@ static void exit_notify(struct task_struct *tsk) | |||
| 836 | 832 | ||
| 837 | list_for_each_safe(_p, _n, &ptrace_dead) { | 833 | list_for_each_safe(_p, _n, &ptrace_dead) { |
| 838 | list_del_init(_p); | 834 | list_del_init(_p); |
| 839 | t = list_entry(_p,struct task_struct,ptrace_list); | 835 | t = list_entry(_p, struct task_struct, ptrace_list); |
| 840 | release_task(t); | 836 | release_task(t); |
| 841 | } | 837 | } |
| 842 | 838 | ||
| @@ -848,7 +844,9 @@ static void exit_notify(struct task_struct *tsk) | |||
| 848 | fastcall NORET_TYPE void do_exit(long code) | 844 | fastcall NORET_TYPE void do_exit(long code) |
| 849 | { | 845 | { |
| 850 | struct task_struct *tsk = current; | 846 | struct task_struct *tsk = current; |
| 847 | struct taskstats *tidstats; | ||
| 851 | int group_dead; | 848 | int group_dead; |
| 849 | unsigned int mycpu; | ||
| 852 | 850 | ||
| 853 | profile_task_exit(tsk); | 851 | profile_task_exit(tsk); |
| 854 | 852 | ||
| @@ -886,6 +884,8 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 886 | current->comm, current->pid, | 884 | current->comm, current->pid, |
| 887 | preempt_count()); | 885 | preempt_count()); |
| 888 | 886 | ||
| 887 | taskstats_exit_alloc(&tidstats, &mycpu); | ||
| 888 | |||
| 889 | acct_update_integrals(tsk); | 889 | acct_update_integrals(tsk); |
| 890 | if (tsk->mm) { | 890 | if (tsk->mm) { |
| 891 | update_hiwater_rss(tsk->mm); | 891 | update_hiwater_rss(tsk->mm); |
| @@ -895,18 +895,23 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 895 | if (group_dead) { | 895 | if (group_dead) { |
| 896 | hrtimer_cancel(&tsk->signal->real_timer); | 896 | hrtimer_cancel(&tsk->signal->real_timer); |
| 897 | exit_itimers(tsk->signal); | 897 | exit_itimers(tsk->signal); |
| 898 | acct_process(code); | ||
| 899 | } | 898 | } |
| 899 | acct_collect(code, group_dead); | ||
| 900 | if (unlikely(tsk->robust_list)) | 900 | if (unlikely(tsk->robust_list)) |
| 901 | exit_robust_list(tsk); | 901 | exit_robust_list(tsk); |
| 902 | #ifdef CONFIG_COMPAT | 902 | #if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) |
| 903 | if (unlikely(tsk->compat_robust_list)) | 903 | if (unlikely(tsk->compat_robust_list)) |
| 904 | compat_exit_robust_list(tsk); | 904 | compat_exit_robust_list(tsk); |
| 905 | #endif | 905 | #endif |
| 906 | if (unlikely(tsk->audit_context)) | 906 | if (unlikely(tsk->audit_context)) |
| 907 | audit_free(tsk); | 907 | audit_free(tsk); |
| 908 | taskstats_exit_send(tsk, tidstats, group_dead, mycpu); | ||
| 909 | taskstats_exit_free(tidstats); | ||
| 910 | |||
| 908 | exit_mm(tsk); | 911 | exit_mm(tsk); |
| 909 | 912 | ||
| 913 | if (group_dead) | ||
| 914 | acct_process(); | ||
| 910 | exit_sem(tsk); | 915 | exit_sem(tsk); |
| 911 | __exit_files(tsk); | 916 | __exit_files(tsk); |
| 912 | __exit_fs(tsk); | 917 | __exit_fs(tsk); |
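To make the new ordering in do_exit() easier to follow, here is a condensed, illustrative restatement of the accounting-related steps only. This helper does not exist in the patch; error paths and unrelated teardown are omitted, and exit_mm() is shown as a comment because it is local to kernel/exit.c.

        #include <linux/sched.h>
        #include <linux/acct.h>
        #include <linux/taskstats_kern.h>

        /* Illustrative summary of the exit-time accounting order above. */
        static void exit_accounting_example(struct task_struct *tsk, long code,
                                            int group_dead)
        {
                struct taskstats *tidstats;
                unsigned int mycpu;

                taskstats_exit_alloc(&tidstats, &mycpu);        /* reserve the buffer early */
                acct_collect(code, group_dead);                 /* sample BSD-acct data while the mm is live */
                taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
                taskstats_exit_free(tidstats);
                /* ... exit_mm(tsk) runs here ... */
                if (group_dead)
                        acct_process();                         /* write the acct record after mm teardown */
        }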
| @@ -930,9 +935,17 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 930 | tsk->mempolicy = NULL; | 935 | tsk->mempolicy = NULL; |
| 931 | #endif | 936 | #endif |
| 932 | /* | 937 | /* |
| 933 | * If DEBUG_MUTEXES is on, make sure we are holding no locks: | 938 | * This must happen late, after the PID is not |
| 939 | * hashed anymore: | ||
| 934 | */ | 940 | */ |
| 935 | mutex_debug_check_no_locks_held(tsk); | 941 | if (unlikely(!list_empty(&tsk->pi_state_list))) |
| 942 | exit_pi_state_list(tsk); | ||
| 943 | if (unlikely(current->pi_state_cache)) | ||
| 944 | kfree(current->pi_state_cache); | ||
| 945 | /* | ||
| 946 | * Make sure we are holding no locks: | ||
| 947 | */ | ||
| 948 | debug_check_no_locks_held(tsk); | ||
| 936 | 949 | ||
| 937 | if (tsk->io_context) | 950 | if (tsk->io_context) |
| 938 | exit_io_context(); | 951 | exit_io_context(); |
| @@ -940,15 +953,15 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 940 | if (tsk->splice_pipe) | 953 | if (tsk->splice_pipe) |
| 941 | __free_pipe_info(tsk->splice_pipe); | 954 | __free_pipe_info(tsk->splice_pipe); |
| 942 | 955 | ||
| 943 | /* PF_DEAD causes final put_task_struct after we schedule. */ | ||
| 944 | preempt_disable(); | 956 | preempt_disable(); |
| 945 | BUG_ON(tsk->flags & PF_DEAD); | 957 | /* causes final put_task_struct in finish_task_switch(). */ |
| 946 | tsk->flags |= PF_DEAD; | 958 | tsk->state = TASK_DEAD; |
| 947 | 959 | ||
| 948 | schedule(); | 960 | schedule(); |
| 949 | BUG(); | 961 | BUG(); |
| 950 | /* Avoid "noreturn function does return". */ | 962 | /* Avoid "noreturn function does return". */ |
| 951 | for (;;) ; | 963 | for (;;) |
| 964 | cpu_relax(); /* For when BUG is null */ | ||
| 952 | } | 965 | } |
| 953 | 966 | ||
| 954 | EXPORT_SYMBOL_GPL(do_exit); | 967 | EXPORT_SYMBOL_GPL(do_exit); |
| @@ -957,7 +970,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code) | |||
| 957 | { | 970 | { |
| 958 | if (comp) | 971 | if (comp) |
| 959 | complete(comp); | 972 | complete(comp); |
| 960 | 973 | ||
| 961 | do_exit(code); | 974 | do_exit(code); |
| 962 | } | 975 | } |
| 963 | 976 | ||
| @@ -1007,7 +1020,7 @@ asmlinkage void sys_exit_group(int error_code) | |||
| 1007 | do_group_exit((error_code & 0xff) << 8); | 1020 | do_group_exit((error_code & 0xff) << 8); |
| 1008 | } | 1021 | } |
| 1009 | 1022 | ||
| 1010 | static int eligible_child(pid_t pid, int options, task_t *p) | 1023 | static int eligible_child(pid_t pid, int options, struct task_struct *p) |
| 1011 | { | 1024 | { |
| 1012 | if (pid > 0) { | 1025 | if (pid > 0) { |
| 1013 | if (p->pid != pid) | 1026 | if (p->pid != pid) |
| @@ -1039,7 +1052,7 @@ static int eligible_child(pid_t pid, int options, task_t *p) | |||
| 1039 | * Do not consider thread group leaders that are | 1052 | * Do not consider thread group leaders that are |
| 1040 | * in a non-empty thread group: | 1053 | * in a non-empty thread group: |
| 1041 | */ | 1054 | */ |
| 1042 | if (current->tgid != p->tgid && delay_group_leader(p)) | 1055 | if (delay_group_leader(p)) |
| 1043 | return 2; | 1056 | return 2; |
| 1044 | 1057 | ||
| 1045 | if (security_task_wait(p)) | 1058 | if (security_task_wait(p)) |
| @@ -1048,12 +1061,13 @@ static int eligible_child(pid_t pid, int options, task_t *p) | |||
| 1048 | return 1; | 1061 | return 1; |
| 1049 | } | 1062 | } |
| 1050 | 1063 | ||
| 1051 | static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, | 1064 | static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, |
| 1052 | int why, int status, | 1065 | int why, int status, |
| 1053 | struct siginfo __user *infop, | 1066 | struct siginfo __user *infop, |
| 1054 | struct rusage __user *rusagep) | 1067 | struct rusage __user *rusagep) |
| 1055 | { | 1068 | { |
| 1056 | int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; | 1069 | int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; |
| 1070 | |||
| 1057 | put_task_struct(p); | 1071 | put_task_struct(p); |
| 1058 | if (!retval) | 1072 | if (!retval) |
| 1059 | retval = put_user(SIGCHLD, &infop->si_signo); | 1073 | retval = put_user(SIGCHLD, &infop->si_signo); |
| @@ -1078,7 +1092,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, | |||
| 1078 | * the lock and this task is uninteresting. If we return nonzero, we have | 1092 | * the lock and this task is uninteresting. If we return nonzero, we have |
| 1079 | * released the lock and the system call should return. | 1093 | * released the lock and the system call should return. |
| 1080 | */ | 1094 | */ |
| 1081 | static int wait_task_zombie(task_t *p, int noreap, | 1095 | static int wait_task_zombie(struct task_struct *p, int noreap, |
| 1082 | struct siginfo __user *infop, | 1096 | struct siginfo __user *infop, |
| 1083 | int __user *stat_addr, struct rusage __user *ru) | 1097 | int __user *stat_addr, struct rusage __user *ru) |
| 1084 | { | 1098 | { |
| @@ -1240,8 +1254,8 @@ static int wait_task_zombie(task_t *p, int noreap, | |||
| 1240 | * the lock and this task is uninteresting. If we return nonzero, we have | 1254 | * the lock and this task is uninteresting. If we return nonzero, we have |
| 1241 | * released the lock and the system call should return. | 1255 | * released the lock and the system call should return. |
| 1242 | */ | 1256 | */ |
| 1243 | static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, | 1257 | static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, |
| 1244 | struct siginfo __user *infop, | 1258 | int noreap, struct siginfo __user *infop, |
| 1245 | int __user *stat_addr, struct rusage __user *ru) | 1259 | int __user *stat_addr, struct rusage __user *ru) |
| 1246 | { | 1260 | { |
| 1247 | int retval, exit_code; | 1261 | int retval, exit_code; |
| @@ -1355,7 +1369,7 @@ bail_ref: | |||
| 1355 | * the lock and this task is uninteresting. If we return nonzero, we have | 1369 | * the lock and this task is uninteresting. If we return nonzero, we have |
| 1356 | * released the lock and the system call should return. | 1370 | * released the lock and the system call should return. |
| 1357 | */ | 1371 | */ |
| 1358 | static int wait_task_continued(task_t *p, int noreap, | 1372 | static int wait_task_continued(struct task_struct *p, int noreap, |
| 1359 | struct siginfo __user *infop, | 1373 | struct siginfo __user *infop, |
| 1360 | int __user *stat_addr, struct rusage __user *ru) | 1374 | int __user *stat_addr, struct rusage __user *ru) |
| 1361 | { | 1375 | { |
| @@ -1441,7 +1455,7 @@ repeat: | |||
| 1441 | int ret; | 1455 | int ret; |
| 1442 | 1456 | ||
| 1443 | list_for_each(_p,&tsk->children) { | 1457 | list_for_each(_p,&tsk->children) { |
| 1444 | p = list_entry(_p,struct task_struct,sibling); | 1458 | p = list_entry(_p, struct task_struct, sibling); |
| 1445 | 1459 | ||
| 1446 | ret = eligible_child(pid, options, p); | 1460 | ret = eligible_child(pid, options, p); |
| 1447 | if (!ret) | 1461 | if (!ret) |
diff --git a/kernel/fork.c b/kernel/fork.c index 49adc0e8d47c..1c999f3e0b47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -11,7 +11,6 @@ | |||
| 11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' | 11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' |
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | #include <linux/config.h> | ||
| 15 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
| 16 | #include <linux/init.h> | 15 | #include <linux/init.h> |
| 17 | #include <linux/unistd.h> | 16 | #include <linux/unistd.h> |
| @@ -44,6 +43,9 @@ | |||
| 44 | #include <linux/rmap.h> | 43 | #include <linux/rmap.h> |
| 45 | #include <linux/acct.h> | 44 | #include <linux/acct.h> |
| 46 | #include <linux/cn_proc.h> | 45 | #include <linux/cn_proc.h> |
| 46 | #include <linux/delayacct.h> | ||
| 47 | #include <linux/taskstats_kern.h> | ||
| 48 | #include <linux/random.h> | ||
| 47 | 49 | ||
| 48 | #include <asm/pgtable.h> | 50 | #include <asm/pgtable.h> |
| 49 | #include <asm/pgalloc.h> | 51 | #include <asm/pgalloc.h> |
| @@ -62,9 +64,7 @@ int max_threads; /* tunable limit on nr_threads */ | |||
| 62 | 64 | ||
| 63 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; | 65 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; |
| 64 | 66 | ||
| 65 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ | 67 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ |
| 66 | |||
| 67 | EXPORT_SYMBOL(tasklist_lock); | ||
| 68 | 68 | ||
| 69 | int nr_processes(void) | 69 | int nr_processes(void) |
| 70 | { | 70 | { |
| @@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep; | |||
| 104 | void free_task(struct task_struct *tsk) | 104 | void free_task(struct task_struct *tsk) |
| 105 | { | 105 | { |
| 106 | free_thread_info(tsk->thread_info); | 106 | free_thread_info(tsk->thread_info); |
| 107 | rt_mutex_debug_task_free(tsk); | ||
| 107 | free_task_struct(tsk); | 108 | free_task_struct(tsk); |
| 108 | } | 109 | } |
| 109 | EXPORT_SYMBOL(free_task); | 110 | EXPORT_SYMBOL(free_task); |
| @@ -117,6 +118,7 @@ void __put_task_struct(struct task_struct *tsk) | |||
| 117 | security_task_free(tsk); | 118 | security_task_free(tsk); |
| 118 | free_uid(tsk->user); | 119 | free_uid(tsk->user); |
| 119 | put_group_info(tsk->group_info); | 120 | put_group_info(tsk->group_info); |
| 121 | delayacct_tsk_free(tsk); | ||
| 120 | 122 | ||
| 121 | if (!profile_handoff_task(tsk)) | 123 | if (!profile_handoff_task(tsk)) |
| 122 | free_task(tsk); | 124 | free_task(tsk); |
| @@ -174,10 +176,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 174 | tsk->thread_info = ti; | 176 | tsk->thread_info = ti; |
| 175 | setup_thread_stack(tsk, orig); | 177 | setup_thread_stack(tsk, orig); |
| 176 | 178 | ||
| 179 | #ifdef CONFIG_CC_STACKPROTECTOR | ||
| 180 | tsk->stack_canary = get_random_int(); | ||
| 181 | #endif | ||
| 182 | |||
| 177 | /* One for us, one for whoever does the "release_task()" (usually parent) */ | 183 | /* One for us, one for whoever does the "release_task()" (usually parent) */ |
| 178 | atomic_set(&tsk->usage,2); | 184 | atomic_set(&tsk->usage,2); |
| 179 | atomic_set(&tsk->fs_excl, 0); | 185 | atomic_set(&tsk->fs_excl, 0); |
| 186 | #ifdef CONFIG_BLK_DEV_IO_TRACE | ||
| 180 | tsk->btrace_seq = 0; | 187 | tsk->btrace_seq = 0; |
| 188 | #endif | ||
| 181 | tsk->splice_pipe = NULL; | 189 | tsk->splice_pipe = NULL; |
| 182 | return tsk; | 190 | return tsk; |
| 183 | } | 191 | } |
| @@ -193,7 +201,10 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 193 | 201 | ||
| 194 | down_write(&oldmm->mmap_sem); | 202 | down_write(&oldmm->mmap_sem); |
| 195 | flush_cache_mm(oldmm); | 203 | flush_cache_mm(oldmm); |
| 196 | down_write(&mm->mmap_sem); | 204 | /* |
| 205 | * Not linked in yet - no deadlock potential: | ||
| 206 | */ | ||
| 207 | down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); | ||
| 197 | 208 | ||
| 198 | mm->locked_vm = 0; | 209 | mm->locked_vm = 0; |
| 199 | mm->mmap = NULL; | 210 | mm->mmap = NULL; |
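The down_write_nested() annotation above is the general lockdep idiom for taking two locks of the same class when the ordering is known to be safe (here the child mm is not yet reachable by anyone else). A minimal sketch of the idiom follows; the type and function names are made up for the example.

        #include <linux/lockdep.h>
        #include <linux/rwsem.h>

        struct guarded_counter {
                struct rw_semaphore sem;
                long value;
        };

        /* Copy src into a freshly created, not-yet-published dst. */
        static void copy_counter_example(struct guarded_counter *dst,
                                         struct guarded_counter *src)
        {
                down_write(&src->sem);
                /* dst is private to this thread, so no deadlock is possible;
                 * tell lockdep by taking it with a distinct subclass. */
                down_write_nested(&dst->sem, SINGLE_DEPTH_NESTING);
                dst->value = src->value;
                up_write(&dst->sem);
                up_write(&src->sem);
        }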
| @@ -817,6 +828,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
| 817 | if (clone_flags & CLONE_THREAD) { | 828 | if (clone_flags & CLONE_THREAD) { |
| 818 | atomic_inc(¤t->signal->count); | 829 | atomic_inc(¤t->signal->count); |
| 819 | atomic_inc(¤t->signal->live); | 830 | atomic_inc(¤t->signal->live); |
| 831 | taskstats_tgid_alloc(current->signal); | ||
| 820 | return 0; | 832 | return 0; |
| 821 | } | 833 | } |
| 822 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); | 834 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); |
| @@ -861,6 +873,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
| 861 | INIT_LIST_HEAD(&sig->cpu_timers[0]); | 873 | INIT_LIST_HEAD(&sig->cpu_timers[0]); |
| 862 | INIT_LIST_HEAD(&sig->cpu_timers[1]); | 874 | INIT_LIST_HEAD(&sig->cpu_timers[1]); |
| 863 | INIT_LIST_HEAD(&sig->cpu_timers[2]); | 875 | INIT_LIST_HEAD(&sig->cpu_timers[2]); |
| 876 | taskstats_tgid_init(sig); | ||
| 864 | 877 | ||
| 865 | task_lock(current->group_leader); | 878 | task_lock(current->group_leader); |
| 866 | memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); | 879 | memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); |
| @@ -874,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
| 874 | tsk->it_prof_expires = | 887 | tsk->it_prof_expires = |
| 875 | secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | 888 | secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); |
| 876 | } | 889 | } |
| 890 | acct_init_pacct(&sig->pacct); | ||
| 877 | 891 | ||
| 878 | return 0; | 892 | return 0; |
| 879 | } | 893 | } |
| @@ -881,6 +895,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
| 881 | void __cleanup_signal(struct signal_struct *sig) | 895 | void __cleanup_signal(struct signal_struct *sig) |
| 882 | { | 896 | { |
| 883 | exit_thread_group_keys(sig); | 897 | exit_thread_group_keys(sig); |
| 898 | taskstats_tgid_free(sig); | ||
| 884 | kmem_cache_free(signal_cachep, sig); | 899 | kmem_cache_free(signal_cachep, sig); |
| 885 | } | 900 | } |
| 886 | 901 | ||
| @@ -912,6 +927,15 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) | |||
| 912 | return current->pid; | 927 | return current->pid; |
| 913 | } | 928 | } |
| 914 | 929 | ||
| 930 | static inline void rt_mutex_init_task(struct task_struct *p) | ||
| 931 | { | ||
| 932 | #ifdef CONFIG_RT_MUTEXES | ||
| 933 | spin_lock_init(&p->pi_lock); | ||
| 934 | plist_head_init(&p->pi_waiters, &p->pi_lock); | ||
| 935 | p->pi_blocked_on = NULL; | ||
| 936 | #endif | ||
| 937 | } | ||
| 938 | |||
| 915 | /* | 939 | /* |
| 916 | * This creates a new process as a copy of the old one, | 940 | * This creates a new process as a copy of the old one, |
| 917 | * but does not actually start it yet. | 941 | * but does not actually start it yet. |
| @@ -920,13 +944,13 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) | |||
| 920 | * parts of the process environment (as per the clone | 944 | * parts of the process environment (as per the clone |
| 921 | * flags). The actual kick-off is left to the caller. | 945 | * flags). The actual kick-off is left to the caller. |
| 922 | */ | 946 | */ |
| 923 | static task_t *copy_process(unsigned long clone_flags, | 947 | static struct task_struct *copy_process(unsigned long clone_flags, |
| 924 | unsigned long stack_start, | 948 | unsigned long stack_start, |
| 925 | struct pt_regs *regs, | 949 | struct pt_regs *regs, |
| 926 | unsigned long stack_size, | 950 | unsigned long stack_size, |
| 927 | int __user *parent_tidptr, | 951 | int __user *parent_tidptr, |
| 928 | int __user *child_tidptr, | 952 | int __user *child_tidptr, |
| 929 | int pid) | 953 | int pid) |
| 930 | { | 954 | { |
| 931 | int retval; | 955 | int retval; |
| 932 | struct task_struct *p = NULL; | 956 | struct task_struct *p = NULL; |
| @@ -958,6 +982,10 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 958 | if (!p) | 982 | if (!p) |
| 959 | goto fork_out; | 983 | goto fork_out; |
| 960 | 984 | ||
| 985 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 986 | DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); | ||
| 987 | DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); | ||
| 988 | #endif | ||
| 961 | retval = -EAGAIN; | 989 | retval = -EAGAIN; |
| 962 | if (atomic_read(&p->user->processes) >= | 990 | if (atomic_read(&p->user->processes) >= |
| 963 | p->signal->rlim[RLIMIT_NPROC].rlim_cur) { | 991 | p->signal->rlim[RLIMIT_NPROC].rlim_cur) { |
| @@ -985,20 +1013,18 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 985 | goto bad_fork_cleanup_put_domain; | 1013 | goto bad_fork_cleanup_put_domain; |
| 986 | 1014 | ||
| 987 | p->did_exec = 0; | 1015 | p->did_exec = 0; |
| 1016 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ | ||
| 988 | copy_flags(clone_flags, p); | 1017 | copy_flags(clone_flags, p); |
| 989 | p->pid = pid; | 1018 | p->pid = pid; |
| 990 | retval = -EFAULT; | 1019 | retval = -EFAULT; |
| 991 | if (clone_flags & CLONE_PARENT_SETTID) | 1020 | if (clone_flags & CLONE_PARENT_SETTID) |
| 992 | if (put_user(p->pid, parent_tidptr)) | 1021 | if (put_user(p->pid, parent_tidptr)) |
| 993 | goto bad_fork_cleanup; | 1022 | goto bad_fork_cleanup_delays_binfmt; |
| 994 | |||
| 995 | p->proc_dentry = NULL; | ||
| 996 | 1023 | ||
| 997 | INIT_LIST_HEAD(&p->children); | 1024 | INIT_LIST_HEAD(&p->children); |
| 998 | INIT_LIST_HEAD(&p->sibling); | 1025 | INIT_LIST_HEAD(&p->sibling); |
| 999 | p->vfork_done = NULL; | 1026 | p->vfork_done = NULL; |
| 1000 | spin_lock_init(&p->alloc_lock); | 1027 | spin_lock_init(&p->alloc_lock); |
| 1001 | spin_lock_init(&p->proc_lock); | ||
| 1002 | 1028 | ||
| 1003 | clear_tsk_thread_flag(p, TIF_SIGPENDING); | 1029 | clear_tsk_thread_flag(p, TIF_SIGPENDING); |
| 1004 | init_sigpending(&p->pending); | 1030 | init_sigpending(&p->pending); |
| @@ -1035,6 +1061,32 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1035 | } | 1061 | } |
| 1036 | mpol_fix_fork_child_flag(p); | 1062 | mpol_fix_fork_child_flag(p); |
| 1037 | #endif | 1063 | #endif |
| 1064 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 1065 | p->irq_events = 0; | ||
| 1066 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1067 | p->hardirqs_enabled = 1; | ||
| 1068 | #else | ||
| 1069 | p->hardirqs_enabled = 0; | ||
| 1070 | #endif | ||
| 1071 | p->hardirq_enable_ip = 0; | ||
| 1072 | p->hardirq_enable_event = 0; | ||
| 1073 | p->hardirq_disable_ip = _THIS_IP_; | ||
| 1074 | p->hardirq_disable_event = 0; | ||
| 1075 | p->softirqs_enabled = 1; | ||
| 1076 | p->softirq_enable_ip = _THIS_IP_; | ||
| 1077 | p->softirq_enable_event = 0; | ||
| 1078 | p->softirq_disable_ip = 0; | ||
| 1079 | p->softirq_disable_event = 0; | ||
| 1080 | p->hardirq_context = 0; | ||
| 1081 | p->softirq_context = 0; | ||
| 1082 | #endif | ||
| 1083 | #ifdef CONFIG_LOCKDEP | ||
| 1084 | p->lockdep_depth = 0; /* no locks held yet */ | ||
| 1085 | p->curr_chain_key = 0; | ||
| 1086 | p->lockdep_recursion = 0; | ||
| 1087 | #endif | ||
| 1088 | |||
| 1089 | rt_mutex_init_task(p); | ||
| 1038 | 1090 | ||
| 1039 | #ifdef CONFIG_DEBUG_MUTEXES | 1091 | #ifdef CONFIG_DEBUG_MUTEXES |
| 1040 | p->blocked_on = NULL; /* not blocked yet */ | 1092 | p->blocked_on = NULL; /* not blocked yet */ |
| @@ -1078,6 +1130,9 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1078 | #ifdef CONFIG_COMPAT | 1130 | #ifdef CONFIG_COMPAT |
| 1079 | p->compat_robust_list = NULL; | 1131 | p->compat_robust_list = NULL; |
| 1080 | #endif | 1132 | #endif |
| 1133 | INIT_LIST_HEAD(&p->pi_state_list); | ||
| 1134 | p->pi_state_cache = NULL; | ||
| 1135 | |||
| 1081 | /* | 1136 | /* |
| 1082 | * sigaltstack should be cleared when sharing the same VM | 1137 | * sigaltstack should be cleared when sharing the same VM |
| 1083 | */ | 1138 | */ |
| @@ -1095,7 +1150,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1095 | 1150 | ||
| 1096 | /* Our parent execution domain becomes current domain | 1151 | /* Our parent execution domain becomes current domain |
| 1097 | These must match for thread signalling to apply */ | 1152 | These must match for thread signalling to apply */ |
| 1098 | |||
| 1099 | p->parent_exec_id = p->self_exec_id; | 1153 | p->parent_exec_id = p->self_exec_id; |
| 1100 | 1154 | ||
| 1101 | /* ok, now we should be set up.. */ | 1155 | /* ok, now we should be set up.. */ |
| @@ -1118,6 +1172,9 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1118 | /* Need tasklist lock for parent etc handling! */ | 1172 | /* Need tasklist lock for parent etc handling! */ |
| 1119 | write_lock_irq(&tasklist_lock); | 1173 | write_lock_irq(&tasklist_lock); |
| 1120 | 1174 | ||
| 1175 | /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */ | ||
| 1176 | p->ioprio = current->ioprio; | ||
| 1177 | |||
| 1121 | /* | 1178 | /* |
| 1122 | * The task hasn't been attached yet, so its cpus_allowed mask will | 1179 | * The task hasn't been attached yet, so its cpus_allowed mask will |
| 1123 | * not be changed, nor will its assigned CPU. | 1180 | * not be changed, nor will its assigned CPU. |
| @@ -1158,18 +1215,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1158 | } | 1215 | } |
| 1159 | 1216 | ||
| 1160 | if (clone_flags & CLONE_THREAD) { | 1217 | if (clone_flags & CLONE_THREAD) { |
| 1161 | /* | ||
| 1162 | * Important: if an exit-all has been started then | ||
| 1163 | * do not create this new thread - the whole thread | ||
| 1164 | * group is supposed to exit anyway. | ||
| 1165 | */ | ||
| 1166 | if (current->signal->flags & SIGNAL_GROUP_EXIT) { | ||
| 1167 | spin_unlock(¤t->sighand->siglock); | ||
| 1168 | write_unlock_irq(&tasklist_lock); | ||
| 1169 | retval = -EAGAIN; | ||
| 1170 | goto bad_fork_cleanup_namespace; | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | p->group_leader = current->group_leader; | 1218 | p->group_leader = current->group_leader; |
| 1174 | list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); | 1219 | list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); |
| 1175 | 1220 | ||
| @@ -1189,11 +1234,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1189 | } | 1234 | } |
| 1190 | } | 1235 | } |
| 1191 | 1236 | ||
| 1192 | /* | ||
| 1193 | * inherit ioprio | ||
| 1194 | */ | ||
| 1195 | p->ioprio = current->ioprio; | ||
| 1196 | |||
| 1197 | if (likely(p->pid)) { | 1237 | if (likely(p->pid)) { |
| 1198 | add_parent(p); | 1238 | add_parent(p); |
| 1199 | if (unlikely(p->ptrace & PT_PTRACED)) | 1239 | if (unlikely(p->ptrace & PT_PTRACED)) |
| @@ -1246,7 +1286,8 @@ bad_fork_cleanup_policy: | |||
| 1246 | bad_fork_cleanup_cpuset: | 1286 | bad_fork_cleanup_cpuset: |
| 1247 | #endif | 1287 | #endif |
| 1248 | cpuset_exit(p); | 1288 | cpuset_exit(p); |
| 1249 | bad_fork_cleanup: | 1289 | bad_fork_cleanup_delays_binfmt: |
| 1290 | delayacct_tsk_free(p); | ||
| 1250 | if (p->binfmt) | 1291 | if (p->binfmt) |
| 1251 | module_put(p->binfmt->module); | 1292 | module_put(p->binfmt->module); |
| 1252 | bad_fork_cleanup_put_domain: | 1293 | bad_fork_cleanup_put_domain: |
| @@ -1267,9 +1308,9 @@ struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | |||
| 1267 | return regs; | 1308 | return regs; |
| 1268 | } | 1309 | } |
| 1269 | 1310 | ||
| 1270 | task_t * __devinit fork_idle(int cpu) | 1311 | struct task_struct * __devinit fork_idle(int cpu) |
| 1271 | { | 1312 | { |
| 1272 | task_t *task; | 1313 | struct task_struct *task; |
| 1273 | struct pt_regs regs; | 1314 | struct pt_regs regs; |
| 1274 | 1315 | ||
| 1275 | task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); | 1316 | task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); |
| @@ -1356,8 +1397,10 @@ long do_fork(unsigned long clone_flags, | |||
| 1356 | 1397 | ||
| 1357 | if (clone_flags & CLONE_VFORK) { | 1398 | if (clone_flags & CLONE_VFORK) { |
| 1358 | wait_for_completion(&vfork); | 1399 | wait_for_completion(&vfork); |
| 1359 | if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) | 1400 | if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { |
| 1401 | current->ptrace_message = nr; | ||
| 1360 | ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); | 1402 | ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); |
| 1403 | } | ||
| 1361 | } | 1404 | } |
| 1362 | } else { | 1405 | } else { |
| 1363 | free_pid(pid); | 1406 | free_pid(pid); |
diff --git a/kernel/futex.c b/kernel/futex.c index e1a380c77a5a..4b6770e9806d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -12,6 +12,10 @@ | |||
| 12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved | 12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved |
| 13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. | 13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. |
| 14 | * | 14 | * |
| 15 | * PI-futex support started by Ingo Molnar and Thomas Gleixner | ||
| 16 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 17 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
| 18 | * | ||
| 15 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 19 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
| 16 | * enough at me, Linus for the original (flawed) idea, Matthew | 20 | * enough at me, Linus for the original (flawed) idea, Matthew |
| 17 | * Kirkwood for proof-of-concept implementation. | 21 | * Kirkwood for proof-of-concept implementation. |
| @@ -46,6 +50,8 @@ | |||
| 46 | #include <linux/signal.h> | 50 | #include <linux/signal.h> |
| 47 | #include <asm/futex.h> | 51 | #include <asm/futex.h> |
| 48 | 52 | ||
| 53 | #include "rtmutex_common.h" | ||
| 54 | |||
| 49 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 55 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
| 50 | 56 | ||
| 51 | /* | 57 | /* |
| @@ -63,7 +69,7 @@ union futex_key { | |||
| 63 | int offset; | 69 | int offset; |
| 64 | } shared; | 70 | } shared; |
| 65 | struct { | 71 | struct { |
| 66 | unsigned long uaddr; | 72 | unsigned long address; |
| 67 | struct mm_struct *mm; | 73 | struct mm_struct *mm; |
| 68 | int offset; | 74 | int offset; |
| 69 | } private; | 75 | } private; |
| @@ -75,6 +81,27 @@ union futex_key { | |||
| 75 | }; | 81 | }; |
| 76 | 82 | ||
| 77 | /* | 83 | /* |
| 84 | * Priority Inheritance state: | ||
| 85 | */ | ||
| 86 | struct futex_pi_state { | ||
| 87 | /* | ||
| 88 | * list of 'owned' pi_state instances - these have to be | ||
| 89 | * cleaned up in do_exit() if the task exits prematurely: | ||
| 90 | */ | ||
| 91 | struct list_head list; | ||
| 92 | |||
| 93 | /* | ||
| 94 | * The PI object: | ||
| 95 | */ | ||
| 96 | struct rt_mutex pi_mutex; | ||
| 97 | |||
| 98 | struct task_struct *owner; | ||
| 99 | atomic_t refcount; | ||
| 100 | |||
| 101 | union futex_key key; | ||
| 102 | }; | ||
| 103 | |||
| 104 | /* | ||
| 78 | * We use this hashed waitqueue instead of a normal wait_queue_t, so | 105 | * We use this hashed waitqueue instead of a normal wait_queue_t, so |
| 79 | * we can wake only the relevant ones (hashed queues may be shared). | 106 | * we can wake only the relevant ones (hashed queues may be shared). |
| 80 | * | 107 | * |
| @@ -87,15 +114,19 @@ struct futex_q { | |||
| 87 | struct list_head list; | 114 | struct list_head list; |
| 88 | wait_queue_head_t waiters; | 115 | wait_queue_head_t waiters; |
| 89 | 116 | ||
| 90 | /* Which hash list lock to use. */ | 117 | /* Which hash list lock to use: */ |
| 91 | spinlock_t *lock_ptr; | 118 | spinlock_t *lock_ptr; |
| 92 | 119 | ||
| 93 | /* Key which the futex is hashed on. */ | 120 | /* Key which the futex is hashed on: */ |
| 94 | union futex_key key; | 121 | union futex_key key; |
| 95 | 122 | ||
| 96 | /* For fd, sigio sent using these. */ | 123 | /* For fd, sigio sent using these: */ |
| 97 | int fd; | 124 | int fd; |
| 98 | struct file *filp; | 125 | struct file *filp; |
| 126 | |||
| 127 | /* Optional priority inheritance state: */ | ||
| 128 | struct futex_pi_state *pi_state; | ||
| 129 | struct task_struct *task; | ||
| 99 | }; | 130 | }; |
| 100 | 131 | ||
| 101 | /* | 132 | /* |
| @@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
| 144 | * | 175 | * |
| 145 | * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. | 176 | * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. |
| 146 | */ | 177 | */ |
| 147 | static int get_futex_key(unsigned long uaddr, union futex_key *key) | 178 | static int get_futex_key(u32 __user *uaddr, union futex_key *key) |
| 148 | { | 179 | { |
| 180 | unsigned long address = (unsigned long)uaddr; | ||
| 149 | struct mm_struct *mm = current->mm; | 181 | struct mm_struct *mm = current->mm; |
| 150 | struct vm_area_struct *vma; | 182 | struct vm_area_struct *vma; |
| 151 | struct page *page; | 183 | struct page *page; |
| @@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
| 154 | /* | 186 | /* |
| 155 | * The futex address must be "naturally" aligned. | 187 | * The futex address must be "naturally" aligned. |
| 156 | */ | 188 | */ |
| 157 | key->both.offset = uaddr % PAGE_SIZE; | 189 | key->both.offset = address % PAGE_SIZE; |
| 158 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) | 190 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) |
| 159 | return -EINVAL; | 191 | return -EINVAL; |
| 160 | uaddr -= key->both.offset; | 192 | address -= key->both.offset; |
| 161 | 193 | ||
| 162 | /* | 194 | /* |
| 163 | * The futex is hashed differently depending on whether | 195 | * The futex is hashed differently depending on whether |
| 164 | * it's in a shared or private mapping. So check vma first. | 196 | * it's in a shared or private mapping. So check vma first. |
| 165 | */ | 197 | */ |
| 166 | vma = find_extend_vma(mm, uaddr); | 198 | vma = find_extend_vma(mm, address); |
| 167 | if (unlikely(!vma)) | 199 | if (unlikely(!vma)) |
| 168 | return -EFAULT; | 200 | return -EFAULT; |
| 169 | 201 | ||
| @@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
| 184 | */ | 216 | */ |
| 185 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { | 217 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { |
| 186 | key->private.mm = mm; | 218 | key->private.mm = mm; |
| 187 | key->private.uaddr = uaddr; | 219 | key->private.address = address; |
| 188 | return 0; | 220 | return 0; |
| 189 | } | 221 | } |
| 190 | 222 | ||
| @@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
| 194 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | 226 | key->shared.inode = vma->vm_file->f_dentry->d_inode; |
| 195 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ |
| 196 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { |
| 197 | key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) | 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) |
| 198 | + vma->vm_pgoff); | 230 | + vma->vm_pgoff); |
| 199 | return 0; | 231 | return 0; |
| 200 | } | 232 | } |
| @@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
| 205 | * from swap. But that's a lot of code to duplicate here | 237 | * from swap. But that's a lot of code to duplicate here |
| 206 | * for a rare case, so we simply fetch the page. | 238 | * for a rare case, so we simply fetch the page. |
| 207 | */ | 239 | */ |
| 208 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); | 240 | err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); |
| 209 | if (err >= 0) { | 241 | if (err >= 0) { |
| 210 | key->shared.pgoff = | 242 | key->shared.pgoff = |
| 211 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 243 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
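The key derivation above boils down to: the low bits of the user address give an offset within the page (which must be u32-aligned), and the rest of the key identifies the backing object — (mm, page-aligned address) for a private mapping, or (inode, pgoff) with bit 0 of the offset set for a shared one. Below is a simplified sketch of the private-mapping case only, assuming the union futex_key declared earlier in this file; it is not a drop-in replacement for get_futex_key().

        /* Private-mapping futex key, simplified for illustration. */
        static int private_futex_key_example(u32 __user *uaddr, struct mm_struct *mm,
                                             union futex_key *key)
        {
                unsigned long address = (unsigned long)uaddr;

                key->both.offset = address % PAGE_SIZE;         /* offset inside the page */
                if (key->both.offset % sizeof(u32))             /* futex word must be aligned */
                        return -EINVAL;
                key->private.mm = mm;
                key->private.address = address - key->both.offset; /* page-aligned */
                return 0;
        }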
| @@ -246,18 +278,259 @@ static void drop_key_refs(union futex_key *key) | |||
| 246 | } | 278 | } |
| 247 | } | 279 | } |
| 248 | 280 | ||
| 249 | static inline int get_futex_value_locked(int *dest, int __user *from) | 281 | static inline int get_futex_value_locked(u32 *dest, u32 __user *from) |
| 250 | { | 282 | { |
| 251 | int ret; | 283 | int ret; |
| 252 | 284 | ||
| 253 | inc_preempt_count(); | 285 | inc_preempt_count(); |
| 254 | ret = __copy_from_user_inatomic(dest, from, sizeof(int)); | 286 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); |
| 255 | dec_preempt_count(); | 287 | dec_preempt_count(); |
| 256 | 288 | ||
| 257 | return ret ? -EFAULT : 0; | 289 | return ret ? -EFAULT : 0; |
| 258 | } | 290 | } |
| 259 | 291 | ||
| 260 | /* | 292 | /* |
| 293 | * Fault handling. Called with current->mm->mmap_sem held. | ||
| 294 | */ | ||
| 295 | static int futex_handle_fault(unsigned long address, int attempt) | ||
| 296 | { | ||
| 297 | struct vm_area_struct * vma; | ||
| 298 | struct mm_struct *mm = current->mm; | ||
| 299 | |||
| 300 | if (attempt > 2 || !(vma = find_vma(mm, address)) || | ||
| 301 | vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) | ||
| 302 | return -EFAULT; | ||
| 303 | |||
| 304 | switch (handle_mm_fault(mm, vma, address, 1)) { | ||
| 305 | case VM_FAULT_MINOR: | ||
| 306 | current->min_flt++; | ||
| 307 | break; | ||
| 308 | case VM_FAULT_MAJOR: | ||
| 309 | current->maj_flt++; | ||
| 310 | break; | ||
| 311 | default: | ||
| 312 | return -EFAULT; | ||
| 313 | } | ||
| 314 | return 0; | ||
| 315 | } | ||
| 316 | |||
| 317 | /* | ||
| 318 | * PI code: | ||
| 319 | */ | ||
| 320 | static int refill_pi_state_cache(void) | ||
| 321 | { | ||
| 322 | struct futex_pi_state *pi_state; | ||
| 323 | |||
| 324 | if (likely(current->pi_state_cache)) | ||
| 325 | return 0; | ||
| 326 | |||
| 327 | pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); | ||
| 328 | |||
| 329 | if (!pi_state) | ||
| 330 | return -ENOMEM; | ||
| 331 | |||
| 332 | memset(pi_state, 0, sizeof(*pi_state)); | ||
| 333 | INIT_LIST_HEAD(&pi_state->list); | ||
| 334 | /* pi_mutex gets initialized later */ | ||
| 335 | pi_state->owner = NULL; | ||
| 336 | atomic_set(&pi_state->refcount, 1); | ||
| 337 | |||
| 338 | current->pi_state_cache = pi_state; | ||
| 339 | |||
| 340 | return 0; | ||
| 341 | } | ||
| 342 | |||
| 343 | static struct futex_pi_state * alloc_pi_state(void) | ||
| 344 | { | ||
| 345 | struct futex_pi_state *pi_state = current->pi_state_cache; | ||
| 346 | |||
| 347 | WARN_ON(!pi_state); | ||
| 348 | current->pi_state_cache = NULL; | ||
| 349 | |||
| 350 | return pi_state; | ||
| 351 | } | ||
| 352 | |||
| 353 | static void free_pi_state(struct futex_pi_state *pi_state) | ||
| 354 | { | ||
| 355 | if (!atomic_dec_and_test(&pi_state->refcount)) | ||
| 356 | return; | ||
| 357 | |||
| 358 | /* | ||
| 359 | * If pi_state->owner is NULL, the owner is most probably dying | ||
| 360 | * and has cleaned up the pi_state already | ||
| 361 | */ | ||
| 362 | if (pi_state->owner) { | ||
| 363 | spin_lock_irq(&pi_state->owner->pi_lock); | ||
| 364 | list_del_init(&pi_state->list); | ||
| 365 | spin_unlock_irq(&pi_state->owner->pi_lock); | ||
| 366 | |||
| 367 | rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); | ||
| 368 | } | ||
| 369 | |||
| 370 | if (current->pi_state_cache) | ||
| 371 | kfree(pi_state); | ||
| 372 | else { | ||
| 373 | /* | ||
| 374 | * pi_state->list is already empty. | ||
| 375 | * clear pi_state->owner. | ||
| 376 | * refcount is at 0 - put it back to 1. | ||
| 377 | */ | ||
| 378 | pi_state->owner = NULL; | ||
| 379 | atomic_set(&pi_state->refcount, 1); | ||
| 380 | current->pi_state_cache = pi_state; | ||
| 381 | } | ||
| 382 | } | ||
| 383 | |||
| 384 | /* | ||
| 385 | * Look up the task based on what TID userspace gave us. | ||
| 386 | * We don't trust it. | ||
| 387 | */ | ||
| 388 | static struct task_struct * futex_find_get_task(pid_t pid) | ||
| 389 | { | ||
| 390 | struct task_struct *p; | ||
| 391 | |||
| 392 | rcu_read_lock(); | ||
| 393 | p = find_task_by_pid(pid); | ||
| 394 | if (!p) | ||
| 395 | goto out_unlock; | ||
| 396 | if ((current->euid != p->euid) && (current->euid != p->uid)) { | ||
| 397 | p = NULL; | ||
| 398 | goto out_unlock; | ||
| 399 | } | ||
| 400 | if (p->exit_state != 0) { | ||
| 401 | p = NULL; | ||
| 402 | goto out_unlock; | ||
| 403 | } | ||
| 404 | get_task_struct(p); | ||
| 405 | out_unlock: | ||
| 406 | rcu_read_unlock(); | ||
| 407 | |||
| 408 | return p; | ||
| 409 | } | ||
| 410 | |||
| 411 | /* | ||
| 412 | * This task is holding PI mutexes at exit time => bad. | ||
| 413 | * Kernel cleans up PI-state, but userspace is likely hosed. | ||
| 414 | * (Robust-futex cleanup is separate and might save the day for userspace.) | ||
| 415 | */ | ||
| 416 | void exit_pi_state_list(struct task_struct *curr) | ||
| 417 | { | ||
| 418 | struct list_head *next, *head = &curr->pi_state_list; | ||
| 419 | struct futex_pi_state *pi_state; | ||
| 420 | struct futex_hash_bucket *hb; | ||
| 421 | union futex_key key; | ||
| 422 | |||
| 423 | /* | ||
| 424 | * We are a ZOMBIE and nobody can enqueue itself on | ||
| 425 | * pi_state_list anymore, but we have to be careful | ||
| 426 | * about waiters unqueueing themselves: | ||
| 427 | */ | ||
| 428 | spin_lock_irq(&curr->pi_lock); | ||
| 429 | while (!list_empty(head)) { | ||
| 430 | |||
| 431 | next = head->next; | ||
| 432 | pi_state = list_entry(next, struct futex_pi_state, list); | ||
| 433 | key = pi_state->key; | ||
| 434 | hb = hash_futex(&key); | ||
| 435 | spin_unlock_irq(&curr->pi_lock); | ||
| 436 | |||
| 437 | spin_lock(&hb->lock); | ||
| 438 | |||
| 439 | spin_lock_irq(&curr->pi_lock); | ||
| 440 | /* | ||
| 441 | * We dropped the pi-lock, so re-check whether this | ||
| 442 | * task still owns the PI-state: | ||
| 443 | */ | ||
| 444 | if (head->next != next) { | ||
| 445 | spin_unlock(&hb->lock); | ||
| 446 | continue; | ||
| 447 | } | ||
| 448 | |||
| 449 | WARN_ON(pi_state->owner != curr); | ||
| 450 | WARN_ON(list_empty(&pi_state->list)); | ||
| 451 | list_del_init(&pi_state->list); | ||
| 452 | pi_state->owner = NULL; | ||
| 453 | spin_unlock_irq(&curr->pi_lock); | ||
| 454 | |||
| 455 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
| 456 | |||
| 457 | spin_unlock(&hb->lock); | ||
| 458 | |||
| 459 | spin_lock_irq(&curr->pi_lock); | ||
| 460 | } | ||
| 461 | spin_unlock_irq(&curr->pi_lock); | ||
| 462 | } | ||
| 463 | |||
| 464 | static int | ||
| 465 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) | ||
| 466 | { | ||
| 467 | struct futex_pi_state *pi_state = NULL; | ||
| 468 | struct futex_q *this, *next; | ||
| 469 | struct list_head *head; | ||
| 470 | struct task_struct *p; | ||
| 471 | pid_t pid; | ||
| 472 | |||
| 473 | head = &hb->chain; | ||
| 474 | |||
| 475 | list_for_each_entry_safe(this, next, head, list) { | ||
| 476 | if (match_futex(&this->key, &me->key)) { | ||
| 477 | /* | ||
| 478 | * Another waiter already exists - bump up | ||
| 479 | * the refcount and return its pi_state: | ||
| 480 | */ | ||
| 481 | pi_state = this->pi_state; | ||
| 482 | /* | ||
| 483 | * Userspace might have messed up non PI and PI futexes | ||
| 484 | */ | ||
| 485 | if (unlikely(!pi_state)) | ||
| 486 | return -EINVAL; | ||
| 487 | |||
| 488 | WARN_ON(!atomic_read(&pi_state->refcount)); | ||
| 489 | |||
| 490 | atomic_inc(&pi_state->refcount); | ||
| 491 | me->pi_state = pi_state; | ||
| 492 | |||
| 493 | return 0; | ||
| 494 | } | ||
| 495 | } | ||
| 496 | |||
| 497 | /* | ||
| 498 | * We are the first waiter - try to look up the real owner and attach | ||
| 499 | * the new pi_state to it, but bail out when the owner died bit is set | ||
| 500 | * and TID = 0: | ||
| 501 | */ | ||
| 502 | pid = uval & FUTEX_TID_MASK; | ||
| 503 | if (!pid && (uval & FUTEX_OWNER_DIED)) | ||
| 504 | return -ESRCH; | ||
| 505 | p = futex_find_get_task(pid); | ||
| 506 | if (!p) | ||
| 507 | return -ESRCH; | ||
| 508 | |||
| 509 | pi_state = alloc_pi_state(); | ||
| 510 | |||
| 511 | /* | ||
| 512 | * Initialize the pi_mutex in locked state and make 'p' | ||
| 513 | * the owner of it: | ||
| 514 | */ | ||
| 515 | rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); | ||
| 516 | |||
| 517 | /* Store the key for possible exit cleanups: */ | ||
| 518 | pi_state->key = me->key; | ||
| 519 | |||
| 520 | spin_lock_irq(&p->pi_lock); | ||
| 521 | WARN_ON(!list_empty(&pi_state->list)); | ||
| 522 | list_add(&pi_state->list, &p->pi_state_list); | ||
| 523 | pi_state->owner = p; | ||
| 524 | spin_unlock_irq(&p->pi_lock); | ||
| 525 | |||
| 526 | put_task_struct(p); | ||
| 527 | |||
| 528 | me->pi_state = pi_state; | ||
| 529 | |||
| 530 | return 0; | ||
| 531 | } | ||
| 532 | |||
| 533 | /* | ||
| 261 | * The hash bucket lock must be held when this is called. | 534 | * The hash bucket lock must be held when this is called. |
| 262 | * Afterwards, the futex_q must not be accessed. | 535 | * Afterwards, the futex_q must not be accessed. |
| 263 | */ | 536 | */ |
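For reference, the PI-futex word that lookup_pi_state() above (and wake_futex_pi() further down) parse encodes three things: the owner's TID, a "waiters" flag, and an "owner died" flag. The decoding sketch below is illustrative only and assumes the FUTEX_TID_MASK, FUTEX_WAITERS and FUTEX_OWNER_DIED constants from <linux/futex.h> that this series introduces.

        #include <linux/futex.h>
        #include <linux/types.h>

        /* Illustrative decoding of a PI-futex value; not part of the patch. */
        static void decode_pi_futex_word(u32 uval)
        {
                pid_t owner_tid = uval & FUTEX_TID_MASK;        /* TID of the current owner */
                int has_waiters = !!(uval & FUTEX_WAITERS);     /* unlock must go through the kernel */
                int owner_died  = !!(uval & FUTEX_OWNER_DIED);  /* robust/PI cleanup already ran */

                (void)owner_tid;
                (void)has_waiters;
                (void)owner_died;
        }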
| @@ -284,16 +557,105 @@ static void wake_futex(struct futex_q *q) | |||
| 284 | q->lock_ptr = NULL; | 557 | q->lock_ptr = NULL; |
| 285 | } | 558 | } |
| 286 | 559 | ||
| 560 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | ||
| 561 | { | ||
| 562 | struct task_struct *new_owner; | ||
| 563 | struct futex_pi_state *pi_state = this->pi_state; | ||
| 564 | u32 curval, newval; | ||
| 565 | |||
| 566 | if (!pi_state) | ||
| 567 | return -EINVAL; | ||
| 568 | |||
| 569 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | ||
| 570 | |||
| 571 | /* | ||
| 572 | * This happens when we have stolen the lock and the original | ||
| 573 | * pending owner did not enqueue itself back on the rt_mutex. | ||
| 574 | * That's not a tragedy. It simply means that a lock waiter | ||
| 575 | * is in flight. We make the futex_q waiter the pending owner. | ||
| 576 | */ | ||
| 577 | if (!new_owner) | ||
| 578 | new_owner = this->task; | ||
| 579 | |||
| 580 | /* | ||
| 581 | * We pass it to the next owner. (The WAITERS bit is always | ||
| 582 | * kept enabled while there is PI state around. We must also | ||
| 583 | * preserve the owner died bit.) | ||
| 584 | */ | ||
| 585 | if (!(uval & FUTEX_OWNER_DIED)) { | ||
| 586 | newval = FUTEX_WAITERS | new_owner->pid; | ||
| 587 | |||
| 588 | inc_preempt_count(); | ||
| 589 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
| 590 | dec_preempt_count(); | ||
| 591 | if (curval == -EFAULT) | ||
| 592 | return -EFAULT; | ||
| 593 | if (curval != uval) | ||
| 594 | return -EINVAL; | ||
| 595 | } | ||
| 596 | |||
| 597 | spin_lock_irq(&pi_state->owner->pi_lock); | ||
| 598 | WARN_ON(list_empty(&pi_state->list)); | ||
| 599 | list_del_init(&pi_state->list); | ||
| 600 | spin_unlock_irq(&pi_state->owner->pi_lock); | ||
| 601 | |||
| 602 | spin_lock_irq(&new_owner->pi_lock); | ||
| 603 | WARN_ON(!list_empty(&pi_state->list)); | ||
| 604 | list_add(&pi_state->list, &new_owner->pi_state_list); | ||
| 605 | pi_state->owner = new_owner; | ||
| 606 | spin_unlock_irq(&new_owner->pi_lock); | ||
| 607 | |||
| 608 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
| 609 | |||
| 610 | return 0; | ||
| 611 | } | ||
| 612 | |||
| 613 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | ||
| 614 | { | ||
| 615 | u32 oldval; | ||
| 616 | |||
| 617 | /* | ||
| 618 | * There is no waiter, so we unlock the futex. The owner died | ||
| 619 | * bit need not be preserved here. We are the owner: | ||
| 620 | */ | ||
| 621 | inc_preempt_count(); | ||
| 622 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); | ||
| 623 | dec_preempt_count(); | ||
| 624 | |||
| 625 | if (oldval == -EFAULT) | ||
| 626 | return oldval; | ||
| 627 | if (oldval != uval) | ||
| 628 | return -EAGAIN; | ||
| 629 | |||
| 630 | return 0; | ||
| 631 | } | ||
| 632 | |||
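unlock_futex_pi() is only reached when no waiter is queued; the cmpxchg clears the word, and a concurrently arriving locker (which would have changed the value) makes the caller retry via -EAGAIN. The same uncontended TID -> 0 transition is what userspace attempts before ever entering the kernel. A minimal sketch of that fast path, assuming C11 atomics and the FUTEX_UNLOCK_PI opcode added by this patch (the helper name and the lack of error handling are illustrative):

    #define _GNU_SOURCE
    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Release a PI futex: try the uncontended TID -> 0 store in userspace,
     * fall back to FUTEX_UNLOCK_PI when waiters (or the owner-died bit)
     * keep the cmpxchg from succeeding. */
    static void pi_unlock_fast(_Atomic unsigned int *futex_word, unsigned int my_tid)
    {
            unsigned int expected = my_tid;

            if (atomic_compare_exchange_strong(futex_word, &expected, 0))
                    return;         /* nobody was waiting */

            syscall(SYS_futex, futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
    }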
| 633 | /* | ||
| 634 | * Express the locking dependencies for lockdep: | ||
| 635 | */ | ||
| 636 | static inline void | ||
| 637 | double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) | ||
| 638 | { | ||
| 639 | if (hb1 <= hb2) { | ||
| 640 | spin_lock(&hb1->lock); | ||
| 641 | if (hb1 < hb2) | ||
| 642 | spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); | ||
| 643 | } else { /* hb1 > hb2 */ | ||
| 644 | spin_lock(&hb2->lock); | ||
| 645 | spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); | ||
| 646 | } | ||
| 647 | } | ||
| 648 | |||
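double_lock_hb() always takes the two hash-bucket locks in address order, so two tasks operating on the same pair of buckets can never each hold one lock while spinning on the other; the spin_lock_nested() annotation merely tells lockdep that nesting two locks of the same class is intentional here. The same ordering discipline in plain userspace terms, as a sketch (pthread mutexes, illustrative helper name):

    #include <pthread.h>

    /* Acquire two mutexes in a globally consistent (address) order,
     * mirroring double_lock_hb(); if both pointers name the same mutex,
     * lock it only once. */
    static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
            if (a == b) {
                    pthread_mutex_lock(a);
            } else if (a < b) {
                    pthread_mutex_lock(a);
                    pthread_mutex_lock(b);
            } else {
                    pthread_mutex_lock(b);
                    pthread_mutex_lock(a);
            }
    }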
| 287 | /* | 649 | /* |
| 288 | * Wake up all waiters hashed on the physical page that is mapped | 650 | * Wake up all waiters hashed on the physical page that is mapped |
| 289 | * to this virtual address: | 651 | * to this virtual address: |
| 290 | */ | 652 | */ |
| 291 | static int futex_wake(unsigned long uaddr, int nr_wake) | 653 | static int futex_wake(u32 __user *uaddr, int nr_wake) |
| 292 | { | 654 | { |
| 293 | union futex_key key; | 655 | struct futex_hash_bucket *hb; |
| 294 | struct futex_hash_bucket *bh; | ||
| 295 | struct list_head *head; | ||
| 296 | struct futex_q *this, *next; | 656 | struct futex_q *this, *next; |
| 657 | struct list_head *head; | ||
| 658 | union futex_key key; | ||
| 297 | int ret; | 659 | int ret; |
| 298 | 660 | ||
| 299 | down_read(¤t->mm->mmap_sem); | 661 | down_read(¤t->mm->mmap_sem); |
| @@ -302,19 +664,23 @@ static int futex_wake(unsigned long uaddr, int nr_wake) | |||
| 302 | if (unlikely(ret != 0)) | 664 | if (unlikely(ret != 0)) |
| 303 | goto out; | 665 | goto out; |
| 304 | 666 | ||
| 305 | bh = hash_futex(&key); | 667 | hb = hash_futex(&key); |
| 306 | spin_lock(&bh->lock); | 668 | spin_lock(&hb->lock); |
| 307 | head = &bh->chain; | 669 | head = &hb->chain; |
| 308 | 670 | ||
| 309 | list_for_each_entry_safe(this, next, head, list) { | 671 | list_for_each_entry_safe(this, next, head, list) { |
| 310 | if (match_futex (&this->key, &key)) { | 672 | if (match_futex (&this->key, &key)) { |
| 673 | if (this->pi_state) { | ||
| 674 | ret = -EINVAL; | ||
| 675 | break; | ||
| 676 | } | ||
| 311 | wake_futex(this); | 677 | wake_futex(this); |
| 312 | if (++ret >= nr_wake) | 678 | if (++ret >= nr_wake) |
| 313 | break; | 679 | break; |
| 314 | } | 680 | } |
| 315 | } | 681 | } |
| 316 | 682 | ||
| 317 | spin_unlock(&bh->lock); | 683 | spin_unlock(&hb->lock); |
| 318 | out: | 684 | out: |
| 319 | up_read(¤t->mm->mmap_sem); | 685 | up_read(¤t->mm->mmap_sem); |
| 320 | return ret; | 686 | return ret; |
| @@ -324,10 +690,12 @@ out: | |||
| 324 | * Wake up all waiters hashed on the physical page that is mapped | 690 | * Wake up all waiters hashed on the physical page that is mapped |
| 325 | * to this virtual address: | 691 | * to this virtual address: |
| 326 | */ | 692 | */ |
| 327 | static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) | 693 | static int |
| 694 | futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, | ||
| 695 | int nr_wake, int nr_wake2, int op) | ||
| 328 | { | 696 | { |
| 329 | union futex_key key1, key2; | 697 | union futex_key key1, key2; |
| 330 | struct futex_hash_bucket *bh1, *bh2; | 698 | struct futex_hash_bucket *hb1, *hb2; |
| 331 | struct list_head *head; | 699 | struct list_head *head; |
| 332 | struct futex_q *this, *next; | 700 | struct futex_q *this, *next; |
| 333 | int ret, op_ret, attempt = 0; | 701 | int ret, op_ret, attempt = 0; |
| @@ -342,27 +710,25 @@ retryfull: | |||
| 342 | if (unlikely(ret != 0)) | 710 | if (unlikely(ret != 0)) |
| 343 | goto out; | 711 | goto out; |
| 344 | 712 | ||
| 345 | bh1 = hash_futex(&key1); | 713 | hb1 = hash_futex(&key1); |
| 346 | bh2 = hash_futex(&key2); | 714 | hb2 = hash_futex(&key2); |
| 347 | 715 | ||
| 348 | retry: | 716 | retry: |
| 349 | if (bh1 < bh2) | 717 | double_lock_hb(hb1, hb2); |
| 350 | spin_lock(&bh1->lock); | ||
| 351 | spin_lock(&bh2->lock); | ||
| 352 | if (bh1 > bh2) | ||
| 353 | spin_lock(&bh1->lock); | ||
| 354 | 718 | ||
| 355 | op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); | 719 | op_ret = futex_atomic_op_inuser(op, uaddr2); |
| 356 | if (unlikely(op_ret < 0)) { | 720 | if (unlikely(op_ret < 0)) { |
| 357 | int dummy; | 721 | u32 dummy; |
| 358 | 722 | ||
| 359 | spin_unlock(&bh1->lock); | 723 | spin_unlock(&hb1->lock); |
| 360 | if (bh1 != bh2) | 724 | if (hb1 != hb2) |
| 361 | spin_unlock(&bh2->lock); | 725 | spin_unlock(&hb2->lock); |
| 362 | 726 | ||
| 363 | #ifndef CONFIG_MMU | 727 | #ifndef CONFIG_MMU |
| 364 | /* we don't get EFAULT from MMU faults if we don't have an MMU, | 728 | /* |
| 365 | * but we might get them from range checking */ | 729 | * we don't get EFAULT from MMU faults if we don't have an MMU, |
| 730 | * but we might get them from range checking | ||
| 731 | */ | ||
| 366 | ret = op_ret; | 732 | ret = op_ret; |
| 367 | goto out; | 733 | goto out; |
| 368 | #endif | 734 | #endif |
| @@ -372,47 +738,36 @@ retry: | |||
| 372 | goto out; | 738 | goto out; |
| 373 | } | 739 | } |
| 374 | 740 | ||
| 375 | /* futex_atomic_op_inuser needs to both read and write | 741 | /* |
| 742 | * futex_atomic_op_inuser needs to both read and write | ||
| 376 | * *(int __user *)uaddr2, but we can't modify it | 743 | * *(int __user *)uaddr2, but we can't modify it |
| 377 | * non-atomically. Therefore, if get_user below is not | 744 | * non-atomically. Therefore, if get_user below is not |
| 378 | * enough, we need to handle the fault ourselves, while | 745 | * enough, we need to handle the fault ourselves, while |
| 379 | * still holding the mmap_sem. */ | 746 | * still holding the mmap_sem. |
| 747 | */ | ||
| 380 | if (attempt++) { | 748 | if (attempt++) { |
| 381 | struct vm_area_struct * vma; | 749 | if (futex_handle_fault((unsigned long)uaddr2, |
| 382 | struct mm_struct *mm = current->mm; | 750 | attempt)) { |
| 383 | 751 | ret = -EFAULT; | |
| 384 | ret = -EFAULT; | ||
| 385 | if (attempt >= 2 || | ||
| 386 | !(vma = find_vma(mm, uaddr2)) || | ||
| 387 | vma->vm_start > uaddr2 || | ||
| 388 | !(vma->vm_flags & VM_WRITE)) | ||
| 389 | goto out; | ||
| 390 | |||
| 391 | switch (handle_mm_fault(mm, vma, uaddr2, 1)) { | ||
| 392 | case VM_FAULT_MINOR: | ||
| 393 | current->min_flt++; | ||
| 394 | break; | ||
| 395 | case VM_FAULT_MAJOR: | ||
| 396 | current->maj_flt++; | ||
| 397 | break; | ||
| 398 | default: | ||
| 399 | goto out; | 752 | goto out; |
| 400 | } | 753 | } |
| 401 | goto retry; | 754 | goto retry; |
| 402 | } | 755 | } |
| 403 | 756 | ||
| 404 | /* If we would have faulted, release mmap_sem, | 757 | /* |
| 405 | * fault it in and start all over again. */ | 758 | * If we would have faulted, release mmap_sem, |
| 759 | * fault it in and start all over again. | ||
| 760 | */ | ||
| 406 | up_read(¤t->mm->mmap_sem); | 761 | up_read(¤t->mm->mmap_sem); |
| 407 | 762 | ||
| 408 | ret = get_user(dummy, (int __user *)uaddr2); | 763 | ret = get_user(dummy, uaddr2); |
| 409 | if (ret) | 764 | if (ret) |
| 410 | return ret; | 765 | return ret; |
| 411 | 766 | ||
| 412 | goto retryfull; | 767 | goto retryfull; |
| 413 | } | 768 | } |
| 414 | 769 | ||
| 415 | head = &bh1->chain; | 770 | head = &hb1->chain; |
| 416 | 771 | ||
| 417 | list_for_each_entry_safe(this, next, head, list) { | 772 | list_for_each_entry_safe(this, next, head, list) { |
| 418 | if (match_futex (&this->key, &key1)) { | 773 | if (match_futex (&this->key, &key1)) { |
| @@ -423,7 +778,7 @@ retry: | |||
| 423 | } | 778 | } |
| 424 | 779 | ||
| 425 | if (op_ret > 0) { | 780 | if (op_ret > 0) { |
| 426 | head = &bh2->chain; | 781 | head = &hb2->chain; |
| 427 | 782 | ||
| 428 | op_ret = 0; | 783 | op_ret = 0; |
| 429 | list_for_each_entry_safe(this, next, head, list) { | 784 | list_for_each_entry_safe(this, next, head, list) { |
| @@ -436,9 +791,9 @@ retry: | |||
| 436 | ret += op_ret; | 791 | ret += op_ret; |
| 437 | } | 792 | } |
| 438 | 793 | ||
| 439 | spin_unlock(&bh1->lock); | 794 | spin_unlock(&hb1->lock); |
| 440 | if (bh1 != bh2) | 795 | if (hb1 != hb2) |
| 441 | spin_unlock(&bh2->lock); | 796 | spin_unlock(&hb2->lock); |
| 442 | out: | 797 | out: |
| 443 | up_read(¤t->mm->mmap_sem); | 798 | up_read(¤t->mm->mmap_sem); |
| 444 | return ret; | 799 | return ret; |
| @@ -448,11 +803,11 @@ out: | |||
| 448 | * Requeue all waiters hashed on one physical page to another | 803 | * Requeue all waiters hashed on one physical page to another |
| 449 | * physical page. | 804 | * physical page. |
| 450 | */ | 805 | */ |
| 451 | static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | 806 | static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, |
| 452 | int nr_wake, int nr_requeue, int *valp) | 807 | int nr_wake, int nr_requeue, u32 *cmpval) |
| 453 | { | 808 | { |
| 454 | union futex_key key1, key2; | 809 | union futex_key key1, key2; |
| 455 | struct futex_hash_bucket *bh1, *bh2; | 810 | struct futex_hash_bucket *hb1, *hb2; |
| 456 | struct list_head *head1; | 811 | struct list_head *head1; |
| 457 | struct futex_q *this, *next; | 812 | struct futex_q *this, *next; |
| 458 | int ret, drop_count = 0; | 813 | int ret, drop_count = 0; |
| @@ -467,68 +822,68 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | |||
| 467 | if (unlikely(ret != 0)) | 822 | if (unlikely(ret != 0)) |
| 468 | goto out; | 823 | goto out; |
| 469 | 824 | ||
| 470 | bh1 = hash_futex(&key1); | 825 | hb1 = hash_futex(&key1); |
| 471 | bh2 = hash_futex(&key2); | 826 | hb2 = hash_futex(&key2); |
| 472 | 827 | ||
| 473 | if (bh1 < bh2) | 828 | double_lock_hb(hb1, hb2); |
| 474 | spin_lock(&bh1->lock); | ||
| 475 | spin_lock(&bh2->lock); | ||
| 476 | if (bh1 > bh2) | ||
| 477 | spin_lock(&bh1->lock); | ||
| 478 | 829 | ||
| 479 | if (likely(valp != NULL)) { | 830 | if (likely(cmpval != NULL)) { |
| 480 | int curval; | 831 | u32 curval; |
| 481 | 832 | ||
| 482 | ret = get_futex_value_locked(&curval, (int __user *)uaddr1); | 833 | ret = get_futex_value_locked(&curval, uaddr1); |
| 483 | 834 | ||
| 484 | if (unlikely(ret)) { | 835 | if (unlikely(ret)) { |
| 485 | spin_unlock(&bh1->lock); | 836 | spin_unlock(&hb1->lock); |
| 486 | if (bh1 != bh2) | 837 | if (hb1 != hb2) |
| 487 | spin_unlock(&bh2->lock); | 838 | spin_unlock(&hb2->lock); |
| 488 | 839 | ||
| 489 | /* If we would have faulted, release mmap_sem, fault | 840 | /* |
| 841 | * If we would have faulted, release mmap_sem, fault | ||
| 490 | * it in and start all over again. | 842 | * it in and start all over again. |
| 491 | */ | 843 | */ |
| 492 | up_read(¤t->mm->mmap_sem); | 844 | up_read(¤t->mm->mmap_sem); |
| 493 | 845 | ||
| 494 | ret = get_user(curval, (int __user *)uaddr1); | 846 | ret = get_user(curval, uaddr1); |
| 495 | 847 | ||
| 496 | if (!ret) | 848 | if (!ret) |
| 497 | goto retry; | 849 | goto retry; |
| 498 | 850 | ||
| 499 | return ret; | 851 | return ret; |
| 500 | } | 852 | } |
| 501 | if (curval != *valp) { | 853 | if (curval != *cmpval) { |
| 502 | ret = -EAGAIN; | 854 | ret = -EAGAIN; |
| 503 | goto out_unlock; | 855 | goto out_unlock; |
| 504 | } | 856 | } |
| 505 | } | 857 | } |
| 506 | 858 | ||
| 507 | head1 = &bh1->chain; | 859 | head1 = &hb1->chain; |
| 508 | list_for_each_entry_safe(this, next, head1, list) { | 860 | list_for_each_entry_safe(this, next, head1, list) { |
| 509 | if (!match_futex (&this->key, &key1)) | 861 | if (!match_futex (&this->key, &key1)) |
| 510 | continue; | 862 | continue; |
| 511 | if (++ret <= nr_wake) { | 863 | if (++ret <= nr_wake) { |
| 512 | wake_futex(this); | 864 | wake_futex(this); |
| 513 | } else { | 865 | } else { |
| 514 | list_move_tail(&this->list, &bh2->chain); | 866 | /* |
| 515 | this->lock_ptr = &bh2->lock; | 867 | * If key1 and key2 hash to the same bucket, no need to |
| 868 | * requeue. | ||
| 869 | */ | ||
| 870 | if (likely(head1 != &hb2->chain)) { | ||
| 871 | list_move_tail(&this->list, &hb2->chain); | ||
| 872 | this->lock_ptr = &hb2->lock; | ||
| 873 | } | ||
| 516 | this->key = key2; | 874 | this->key = key2; |
| 517 | get_key_refs(&key2); | 875 | get_key_refs(&key2); |
| 518 | drop_count++; | 876 | drop_count++; |
| 519 | 877 | ||
| 520 | if (ret - nr_wake >= nr_requeue) | 878 | if (ret - nr_wake >= nr_requeue) |
| 521 | break; | 879 | break; |
| 522 | /* Make sure to stop if key1 == key2 */ | ||
| 523 | if (head1 == &bh2->chain && head1 != &next->list) | ||
| 524 | head1 = &this->list; | ||
| 525 | } | 880 | } |
| 526 | } | 881 | } |
| 527 | 882 | ||
| 528 | out_unlock: | 883 | out_unlock: |
| 529 | spin_unlock(&bh1->lock); | 884 | spin_unlock(&hb1->lock); |
| 530 | if (bh1 != bh2) | 885 | if (hb1 != hb2) |
| 531 | spin_unlock(&bh2->lock); | 886 | spin_unlock(&hb2->lock); |
| 532 | 887 | ||
| 533 | /* drop_key_refs() must be called outside the spinlocks. */ | 888 | /* drop_key_refs() must be called outside the spinlocks. */ |
| 534 | while (--drop_count >= 0) | 889 | while (--drop_count >= 0) |
| @@ -543,7 +898,7 @@ out: | |||
| 543 | static inline struct futex_hash_bucket * | 898 | static inline struct futex_hash_bucket * |
| 544 | queue_lock(struct futex_q *q, int fd, struct file *filp) | 899 | queue_lock(struct futex_q *q, int fd, struct file *filp) |
| 545 | { | 900 | { |
| 546 | struct futex_hash_bucket *bh; | 901 | struct futex_hash_bucket *hb; |
| 547 | 902 | ||
| 548 | q->fd = fd; | 903 | q->fd = fd; |
| 549 | q->filp = filp; | 904 | q->filp = filp; |
| @@ -551,23 +906,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) | |||
| 551 | init_waitqueue_head(&q->waiters); | 906 | init_waitqueue_head(&q->waiters); |
| 552 | 907 | ||
| 553 | get_key_refs(&q->key); | 908 | get_key_refs(&q->key); |
| 554 | bh = hash_futex(&q->key); | 909 | hb = hash_futex(&q->key); |
| 555 | q->lock_ptr = &bh->lock; | 910 | q->lock_ptr = &hb->lock; |
| 556 | 911 | ||
| 557 | spin_lock(&bh->lock); | 912 | spin_lock(&hb->lock); |
| 558 | return bh; | 913 | return hb; |
| 559 | } | 914 | } |
| 560 | 915 | ||
| 561 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) | 916 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) |
| 562 | { | 917 | { |
| 563 | list_add_tail(&q->list, &bh->chain); | 918 | list_add_tail(&q->list, &hb->chain); |
| 564 | spin_unlock(&bh->lock); | 919 | q->task = current; |
| 920 | spin_unlock(&hb->lock); | ||
| 565 | } | 921 | } |
| 566 | 922 | ||
| 567 | static inline void | 923 | static inline void |
| 568 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | 924 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
| 569 | { | 925 | { |
| 570 | spin_unlock(&bh->lock); | 926 | spin_unlock(&hb->lock); |
| 571 | drop_key_refs(&q->key); | 927 | drop_key_refs(&q->key); |
| 572 | } | 928 | } |
| 573 | 929 | ||
| @@ -579,20 +935,22 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | |||
| 579 | /* The key must be already stored in q->key. */ | 935 | /* The key must be already stored in q->key. */ |
| 580 | static void queue_me(struct futex_q *q, int fd, struct file *filp) | 936 | static void queue_me(struct futex_q *q, int fd, struct file *filp) |
| 581 | { | 937 | { |
| 582 | struct futex_hash_bucket *bh; | 938 | struct futex_hash_bucket *hb; |
| 583 | bh = queue_lock(q, fd, filp); | 939 | |
| 584 | __queue_me(q, bh); | 940 | hb = queue_lock(q, fd, filp); |
| 941 | __queue_me(q, hb); | ||
| 585 | } | 942 | } |
| 586 | 943 | ||
| 587 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ | 944 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ |
| 588 | static int unqueue_me(struct futex_q *q) | 945 | static int unqueue_me(struct futex_q *q) |
| 589 | { | 946 | { |
| 590 | int ret = 0; | ||
| 591 | spinlock_t *lock_ptr; | 947 | spinlock_t *lock_ptr; |
| 948 | int ret = 0; | ||
| 592 | 949 | ||
| 593 | /* In the common case we don't take the spinlock, which is nice. */ | 950 | /* In the common case we don't take the spinlock, which is nice. */ |
| 594 | retry: | 951 | retry: |
| 595 | lock_ptr = q->lock_ptr; | 952 | lock_ptr = q->lock_ptr; |
| 953 | barrier(); | ||
| 596 | if (lock_ptr != 0) { | 954 | if (lock_ptr != 0) { |
| 597 | spin_lock(lock_ptr); | 955 | spin_lock(lock_ptr); |
| 598 | /* | 956 | /* |
| @@ -614,6 +972,9 @@ static int unqueue_me(struct futex_q *q) | |||
| 614 | } | 972 | } |
| 615 | WARN_ON(list_empty(&q->list)); | 973 | WARN_ON(list_empty(&q->list)); |
| 616 | list_del(&q->list); | 974 | list_del(&q->list); |
| 975 | |||
| 976 | BUG_ON(q->pi_state); | ||
| 977 | |||
| 617 | spin_unlock(lock_ptr); | 978 | spin_unlock(lock_ptr); |
| 618 | ret = 1; | 979 | ret = 1; |
| 619 | } | 980 | } |
| @@ -622,21 +983,42 @@ static int unqueue_me(struct futex_q *q) | |||
| 622 | return ret; | 983 | return ret; |
| 623 | } | 984 | } |
| 624 | 985 | ||
| 625 | static int futex_wait(unsigned long uaddr, int val, unsigned long time) | 986 | /* |
| 987 | * PI futexes cannot be requeued and must remove themselves from the | ||
| 988 | * hash bucket. The hash bucket lock is held on entry and dropped here. | ||
| 989 | */ | ||
| 990 | static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) | ||
| 991 | { | ||
| 992 | WARN_ON(list_empty(&q->list)); | ||
| 993 | list_del(&q->list); | ||
| 994 | |||
| 995 | BUG_ON(!q->pi_state); | ||
| 996 | free_pi_state(q->pi_state); | ||
| 997 | q->pi_state = NULL; | ||
| 998 | |||
| 999 | spin_unlock(&hb->lock); | ||
| 1000 | |||
| 1001 | drop_key_refs(&q->key); | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) | ||
| 626 | { | 1005 | { |
| 627 | DECLARE_WAITQUEUE(wait, current); | 1006 | struct task_struct *curr = current; |
| 628 | int ret, curval; | 1007 | DECLARE_WAITQUEUE(wait, curr); |
| 1008 | struct futex_hash_bucket *hb; | ||
| 629 | struct futex_q q; | 1009 | struct futex_q q; |
| 630 | struct futex_hash_bucket *bh; | 1010 | u32 uval; |
| 1011 | int ret; | ||
| 631 | 1012 | ||
| 1013 | q.pi_state = NULL; | ||
| 632 | retry: | 1014 | retry: |
| 633 | down_read(¤t->mm->mmap_sem); | 1015 | down_read(&curr->mm->mmap_sem); |
| 634 | 1016 | ||
| 635 | ret = get_futex_key(uaddr, &q.key); | 1017 | ret = get_futex_key(uaddr, &q.key); |
| 636 | if (unlikely(ret != 0)) | 1018 | if (unlikely(ret != 0)) |
| 637 | goto out_release_sem; | 1019 | goto out_release_sem; |
| 638 | 1020 | ||
| 639 | bh = queue_lock(&q, -1, NULL); | 1021 | hb = queue_lock(&q, -1, NULL); |
| 640 | 1022 | ||
| 641 | /* | 1023 | /* |
| 642 | * Access the page AFTER the futex is queued. | 1024 | * Access the page AFTER the futex is queued. |
| @@ -658,37 +1040,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
| 658 | * We hold the mmap semaphore, so the mapping cannot have changed | 1040 | * We hold the mmap semaphore, so the mapping cannot have changed |
| 659 | * since we looked it up in get_futex_key. | 1041 | * since we looked it up in get_futex_key. |
| 660 | */ | 1042 | */ |
| 661 | 1043 | ret = get_futex_value_locked(&uval, uaddr); | |
| 662 | ret = get_futex_value_locked(&curval, (int __user *)uaddr); | ||
| 663 | 1044 | ||
| 664 | if (unlikely(ret)) { | 1045 | if (unlikely(ret)) { |
| 665 | queue_unlock(&q, bh); | 1046 | queue_unlock(&q, hb); |
| 666 | 1047 | ||
| 667 | /* If we would have faulted, release mmap_sem, fault it in and | 1048 | /* |
| 1049 | * If we would have faulted, release mmap_sem, fault it in and | ||
| 668 | * start all over again. | 1050 | * start all over again. |
| 669 | */ | 1051 | */ |
| 670 | up_read(¤t->mm->mmap_sem); | 1052 | up_read(&curr->mm->mmap_sem); |
| 671 | 1053 | ||
| 672 | ret = get_user(curval, (int __user *)uaddr); | 1054 | ret = get_user(uval, uaddr); |
| 673 | 1055 | ||
| 674 | if (!ret) | 1056 | if (!ret) |
| 675 | goto retry; | 1057 | goto retry; |
| 676 | return ret; | 1058 | return ret; |
| 677 | } | 1059 | } |
| 678 | if (curval != val) { | 1060 | ret = -EWOULDBLOCK; |
| 679 | ret = -EWOULDBLOCK; | 1061 | if (uval != val) |
| 680 | queue_unlock(&q, bh); | 1062 | goto out_unlock_release_sem; |
| 681 | goto out_release_sem; | ||
| 682 | } | ||
| 683 | 1063 | ||
| 684 | /* Only actually queue if *uaddr contained val. */ | 1064 | /* Only actually queue if *uaddr contained val. */ |
| 685 | __queue_me(&q, bh); | 1065 | __queue_me(&q, hb); |
| 686 | 1066 | ||
| 687 | /* | 1067 | /* |
| 688 | * Now the futex is queued and we have checked the data, we | 1068 | * Now the futex is queued and we have checked the data, we |
| 689 | * don't want to hold mmap_sem while we sleep. | 1069 | * don't want to hold mmap_sem while we sleep. |
| 690 | */ | 1070 | */ |
| 691 | up_read(¤t->mm->mmap_sem); | 1071 | up_read(&curr->mm->mmap_sem); |
| 692 | 1072 | ||
| 693 | /* | 1073 | /* |
| 694 | * There might have been scheduling since the queue_me(), as we | 1074 | * There might have been scheduling since the queue_me(), as we |
| @@ -720,12 +1100,367 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
| 720 | return 0; | 1100 | return 0; |
| 721 | if (time == 0) | 1101 | if (time == 0) |
| 722 | return -ETIMEDOUT; | 1102 | return -ETIMEDOUT; |
| 723 | /* We expect signal_pending(current), but another thread may | 1103 | /* |
| 724 | * have handled it for us already. */ | 1104 | * We expect signal_pending(current), but another thread may |
| 1105 | * have handled it for us already. | ||
| 1106 | */ | ||
| 725 | return -EINTR; | 1107 | return -EINTR; |
| 726 | 1108 | ||
| 1109 | out_unlock_release_sem: | ||
| 1110 | queue_unlock(&q, hb); | ||
| 1111 | |||
| 727 | out_release_sem: | 1112 | out_release_sem: |
| 1113 | up_read(&curr->mm->mmap_sem); | ||
| 1114 | return ret; | ||
| 1115 | } | ||
| 1116 | |||
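futex_wait() only goes to sleep if the futex word still holds the expected value after the waiter has been queued, which is what lets userspace pair a lock-free fast path with a kernel sleep and never lose a wakeup. A minimal sketch of the classic wait/wake pairing on a plain (non-PI) futex word (illustrative helpers, no timeout or error handling):

    #define _GNU_SOURCE
    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Block until *word no longer equals 'expected'.  The kernel re-checks
     * the value after queueing, so a change racing with the syscall is
     * never missed. */
    static void futex_wait_on(_Atomic unsigned int *word, unsigned int expected)
    {
            while (atomic_load(word) == expected)
                    syscall(SYS_futex, word, FUTEX_WAIT, expected, NULL, NULL, 0);
    }

    /* Wake at most one task blocked in futex_wait_on(). */
    static void futex_wake_one(_Atomic unsigned int *word)
    {
            syscall(SYS_futex, word, FUTEX_WAKE, 1, NULL, NULL, 0);
    }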
| 1117 | /* | ||
| 1118 | * Userspace tried a 0 -> TID atomic transition of the futex value | ||
| 1119 | * and failed. The kernel side here does the whole locking operation: | ||
| 1120 | * if there are waiters then it will block, it does PI, etc. (Due to | ||
| 1121 | * races the kernel might see a 0 value of the futex too.) | ||
| 1122 | */ | ||
| 1123 | static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | ||
| 1124 | long nsec, int trylock) | ||
| 1125 | { | ||
| 1126 | struct hrtimer_sleeper timeout, *to = NULL; | ||
| 1127 | struct task_struct *curr = current; | ||
| 1128 | struct futex_hash_bucket *hb; | ||
| 1129 | u32 uval, newval, curval; | ||
| 1130 | struct futex_q q; | ||
| 1131 | int ret, attempt = 0; | ||
| 1132 | |||
| 1133 | if (refill_pi_state_cache()) | ||
| 1134 | return -ENOMEM; | ||
| 1135 | |||
| 1136 | if (sec != MAX_SCHEDULE_TIMEOUT) { | ||
| 1137 | to = &timeout; | ||
| 1138 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | ||
| 1139 | hrtimer_init_sleeper(to, current); | ||
| 1140 | to->timer.expires = ktime_set(sec, nsec); | ||
| 1141 | } | ||
| 1142 | |||
| 1143 | q.pi_state = NULL; | ||
| 1144 | retry: | ||
| 1145 | down_read(&curr->mm->mmap_sem); | ||
| 1146 | |||
| 1147 | ret = get_futex_key(uaddr, &q.key); | ||
| 1148 | if (unlikely(ret != 0)) | ||
| 1149 | goto out_release_sem; | ||
| 1150 | |||
| 1151 | hb = queue_lock(&q, -1, NULL); | ||
| 1152 | |||
| 1153 | retry_locked: | ||
| 1154 | /* | ||
| 1155 | * To avoid races, we attempt to take the lock here again | ||
| 1156 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
| 1157 | * the locks. It will most likely not succeed. | ||
| 1158 | */ | ||
| 1159 | newval = current->pid; | ||
| 1160 | |||
| 1161 | inc_preempt_count(); | ||
| 1162 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); | ||
| 1163 | dec_preempt_count(); | ||
| 1164 | |||
| 1165 | if (unlikely(curval == -EFAULT)) | ||
| 1166 | goto uaddr_faulted; | ||
| 1167 | |||
| 1168 | /* We own the lock already */ | ||
| 1169 | if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { | ||
| 1170 | if (!detect && 0) | ||
| 1171 | force_sig(SIGKILL, current); | ||
| 1172 | ret = -EDEADLK; | ||
| 1173 | goto out_unlock_release_sem; | ||
| 1174 | } | ||
| 1175 | |||
| 1176 | /* | ||
| 1177 | * Surprise - we got the lock. Just return | ||
| 1178 | * to userspace: | ||
| 1179 | */ | ||
| 1180 | if (unlikely(!curval)) | ||
| 1181 | goto out_unlock_release_sem; | ||
| 1182 | |||
| 1183 | uval = curval; | ||
| 1184 | newval = uval | FUTEX_WAITERS; | ||
| 1185 | |||
| 1186 | inc_preempt_count(); | ||
| 1187 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
| 1188 | dec_preempt_count(); | ||
| 1189 | |||
| 1190 | if (unlikely(curval == -EFAULT)) | ||
| 1191 | goto uaddr_faulted; | ||
| 1192 | if (unlikely(curval != uval)) | ||
| 1193 | goto retry_locked; | ||
| 1194 | |||
| 1195 | /* | ||
| 1196 | * We don't have the lock. Look up the PI state (or create it if | ||
| 1197 | * we are the first waiter): | ||
| 1198 | */ | ||
| 1199 | ret = lookup_pi_state(uval, hb, &q); | ||
| 1200 | |||
| 1201 | if (unlikely(ret)) { | ||
| 1202 | /* | ||
| 1203 | * There were no waiters and the owner task lookup | ||
| 1204 | * failed. When the OWNER_DIED bit is set, then we | ||
| 1205 | * know that this is a robust futex and we actually | ||
| 1206 | * take the lock. This is safe as we are protected by | ||
| 1207 | * the hash bucket lock. We also set the waiters bit | ||
| 1208 | * unconditionally here, to simplify glibc handling of | ||
| 1209 | * multiple tasks racing to acquire the lock and | ||
| 1210 | * cleanup the problems which were left by the dead | ||
| 1211 | * owner. | ||
| 1212 | */ | ||
| 1213 | if (curval & FUTEX_OWNER_DIED) { | ||
| 1214 | uval = newval; | ||
| 1215 | newval = current->pid | | ||
| 1216 | FUTEX_OWNER_DIED | FUTEX_WAITERS; | ||
| 1217 | |||
| 1218 | inc_preempt_count(); | ||
| 1219 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
| 1220 | uval, newval); | ||
| 1221 | dec_preempt_count(); | ||
| 1222 | |||
| 1223 | if (unlikely(curval == -EFAULT)) | ||
| 1224 | goto uaddr_faulted; | ||
| 1225 | if (unlikely(curval != uval)) | ||
| 1226 | goto retry_locked; | ||
| 1227 | ret = 0; | ||
| 1228 | } | ||
| 1229 | goto out_unlock_release_sem; | ||
| 1230 | } | ||
| 1231 | |||
| 1232 | /* | ||
| 1233 | * Only actually queue now that the atomic ops are done: | ||
| 1234 | */ | ||
| 1235 | __queue_me(&q, hb); | ||
| 1236 | |||
| 1237 | /* | ||
| 1238 | * Now the futex is queued and we have checked the data, we | ||
| 1239 | * don't want to hold mmap_sem while we sleep. | ||
| 1240 | */ | ||
| 1241 | up_read(&curr->mm->mmap_sem); | ||
| 1242 | |||
| 1243 | WARN_ON(!q.pi_state); | ||
| 1244 | /* | ||
| 1245 | * Block on the PI mutex: | ||
| 1246 | */ | ||
| 1247 | if (!trylock) | ||
| 1248 | ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); | ||
| 1249 | else { | ||
| 1250 | ret = rt_mutex_trylock(&q.pi_state->pi_mutex); | ||
| 1251 | /* Fixup the trylock return value: */ | ||
| 1252 | ret = ret ? 0 : -EWOULDBLOCK; | ||
| 1253 | } | ||
| 1254 | |||
| 1255 | down_read(&curr->mm->mmap_sem); | ||
| 1256 | spin_lock(q.lock_ptr); | ||
| 1257 | |||
| 1258 | /* | ||
| 1259 | * Got the lock. We might not be the anticipated owner if we | ||
| 1260 | * did a lock-steal - fix up the PI-state in that case. | ||
| 1261 | */ | ||
| 1262 | if (!ret && q.pi_state->owner != curr) { | ||
| 1263 | u32 newtid = current->pid | FUTEX_WAITERS; | ||
| 1264 | |||
| 1265 | /* Owner died? */ | ||
| 1266 | if (q.pi_state->owner != NULL) { | ||
| 1267 | spin_lock_irq(&q.pi_state->owner->pi_lock); | ||
| 1268 | WARN_ON(list_empty(&q.pi_state->list)); | ||
| 1269 | list_del_init(&q.pi_state->list); | ||
| 1270 | spin_unlock_irq(&q.pi_state->owner->pi_lock); | ||
| 1271 | } else | ||
| 1272 | newtid |= FUTEX_OWNER_DIED; | ||
| 1273 | |||
| 1274 | q.pi_state->owner = current; | ||
| 1275 | |||
| 1276 | spin_lock_irq(¤t->pi_lock); | ||
| 1277 | WARN_ON(!list_empty(&q.pi_state->list)); | ||
| 1278 | list_add(&q.pi_state->list, ¤t->pi_state_list); | ||
| 1279 | spin_unlock_irq(¤t->pi_lock); | ||
| 1280 | |||
| 1281 | /* Unqueue and drop the lock */ | ||
| 1282 | unqueue_me_pi(&q, hb); | ||
| 1283 | up_read(&curr->mm->mmap_sem); | ||
| 1284 | /* | ||
| 1285 | * We own it, so we have to replace the pending owner | ||
| 1286 | * TID. This must be atomic as we have to preserve the | ||
| 1287 | * owner died bit here. | ||
| 1288 | */ | ||
| 1289 | ret = get_user(uval, uaddr); | ||
| 1290 | while (!ret) { | ||
| 1291 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | ||
| 1292 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
| 1293 | uval, newval); | ||
| 1294 | if (curval == -EFAULT) | ||
| 1295 | ret = -EFAULT; | ||
| 1296 | if (curval == uval) | ||
| 1297 | break; | ||
| 1298 | uval = curval; | ||
| 1299 | } | ||
| 1300 | } else { | ||
| 1301 | /* | ||
| 1302 | * Catch the rare case, where the lock was released | ||
| 1303 | * when we were on the way back before we locked | ||
| 1304 | * the hash bucket. | ||
| 1305 | */ | ||
| 1306 | if (ret && q.pi_state->owner == curr) { | ||
| 1307 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
| 1308 | ret = 0; | ||
| 1309 | } | ||
| 1310 | /* Unqueue and drop the lock */ | ||
| 1311 | unqueue_me_pi(&q, hb); | ||
| 1312 | up_read(&curr->mm->mmap_sem); | ||
| 1313 | } | ||
| 1314 | |||
| 1315 | if (!detect && ret == -EDEADLK && 0) | ||
| 1316 | force_sig(SIGKILL, current); | ||
| 1317 | |||
| 1318 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | ||
| 1319 | |||
| 1320 | out_unlock_release_sem: | ||
| 1321 | queue_unlock(&q, hb); | ||
| 1322 | |||
| 1323 | out_release_sem: | ||
| 1324 | up_read(&curr->mm->mmap_sem); | ||
| 1325 | return ret; | ||
| 1326 | |||
| 1327 | uaddr_faulted: | ||
| 1328 | /* | ||
| 1329 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
| 1330 | * non-atomically. Therefore, if get_user below is not | ||
| 1331 | * enough, we need to handle the fault ourselves, while | ||
| 1332 | * still holding the mmap_sem. | ||
| 1333 | */ | ||
| 1334 | if (attempt++) { | ||
| 1335 | if (futex_handle_fault((unsigned long)uaddr, attempt)) { | ||
| 1336 | ret = -EFAULT; | ||
| 1337 | goto out_unlock_release_sem; | ||
| 1338 | } | ||
| 1339 | goto retry_locked; | ||
| 1340 | } | ||
| 1341 | |||
| 1342 | queue_unlock(&q, hb); | ||
| 1343 | up_read(&curr->mm->mmap_sem); | ||
| 1344 | |||
| 1345 | ret = get_user(uval, uaddr); | ||
| 1346 | if (!ret && (uval != -EFAULT)) | ||
| 1347 | goto retry; | ||
| 1348 | |||
| 1349 | return ret; | ||
| 1350 | } | ||
| 1351 | |||
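futex_lock_pi() is the slow path behind a PI mutex acquire: userspace first tries the uncontended 0 -> TID cmpxchg itself and only enters the kernel when that fails, at which point the code above repeats the cmpxchg under the hash-bucket lock, sets FUTEX_WAITERS and blocks on the rt_mutex. A hedged sketch of the userspace side (helper name illustrative, gettid obtained via syscall, no error handling):

    #define _GNU_SOURCE
    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Uncontended acquire: 0 -> TID in userspace.  On contention, let the
     * kernel do the priority-inheriting wait via FUTEX_LOCK_PI. */
    static void pi_lock_fast(_Atomic unsigned int *futex_word)
    {
            unsigned int expected = 0;
            unsigned int tid = (unsigned int)syscall(SYS_gettid);

            if (atomic_compare_exchange_strong(futex_word, &expected, tid))
                    return;         /* we own the lock, no syscall needed */

            /* Blocks until the lock is ours; the kernel sets FUTEX_WAITERS
             * and boosts the current owner while we wait. */
            syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
    }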
| 1352 | /* | ||
| 1353 | * Userspace attempted a TID -> 0 atomic transition, and failed. | ||
| 1354 | * This is the in-kernel slowpath: we look up the PI state (if any), | ||
| 1355 | * and do the rt-mutex unlock. | ||
| 1356 | */ | ||
| 1357 | static int futex_unlock_pi(u32 __user *uaddr) | ||
| 1358 | { | ||
| 1359 | struct futex_hash_bucket *hb; | ||
| 1360 | struct futex_q *this, *next; | ||
| 1361 | u32 uval; | ||
| 1362 | struct list_head *head; | ||
| 1363 | union futex_key key; | ||
| 1364 | int ret, attempt = 0; | ||
| 1365 | |||
| 1366 | retry: | ||
| 1367 | if (get_user(uval, uaddr)) | ||
| 1368 | return -EFAULT; | ||
| 1369 | /* | ||
| 1370 | * We release only a lock we actually own: | ||
| 1371 | */ | ||
| 1372 | if ((uval & FUTEX_TID_MASK) != current->pid) | ||
| 1373 | return -EPERM; | ||
| 1374 | /* | ||
| 1375 | * First take all the futex related locks: | ||
| 1376 | */ | ||
| 1377 | down_read(¤t->mm->mmap_sem); | ||
| 1378 | |||
| 1379 | ret = get_futex_key(uaddr, &key); | ||
| 1380 | if (unlikely(ret != 0)) | ||
| 1381 | goto out; | ||
| 1382 | |||
| 1383 | hb = hash_futex(&key); | ||
| 1384 | spin_lock(&hb->lock); | ||
| 1385 | |||
| 1386 | retry_locked: | ||
| 1387 | /* | ||
| 1388 | * To avoid races, try to do the TID -> 0 atomic transition | ||
| 1389 | * again. If it succeeds then we can return without waking | ||
| 1390 | * anyone else up: | ||
| 1391 | */ | ||
| 1392 | if (!(uval & FUTEX_OWNER_DIED)) { | ||
| 1393 | inc_preempt_count(); | ||
| 1394 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); | ||
| 1395 | dec_preempt_count(); | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | if (unlikely(uval == -EFAULT)) | ||
| 1399 | goto pi_faulted; | ||
| 1400 | /* | ||
| 1401 | * Rare case: we managed to release the lock atomically, | ||
| 1402 | * no need to wake anyone else up: | ||
| 1403 | */ | ||
| 1404 | if (unlikely(uval == current->pid)) | ||
| 1405 | goto out_unlock; | ||
| 1406 | |||
| 1407 | /* | ||
| 1408 | * Ok, other tasks may need to be woken up - check waiters | ||
| 1409 | * and do the wakeup if necessary: | ||
| 1410 | */ | ||
| 1411 | head = &hb->chain; | ||
| 1412 | |||
| 1413 | list_for_each_entry_safe(this, next, head, list) { | ||
| 1414 | if (!match_futex (&this->key, &key)) | ||
| 1415 | continue; | ||
| 1416 | ret = wake_futex_pi(uaddr, uval, this); | ||
| 1417 | /* | ||
| 1418 | * The atomic access to the futex value | ||
| 1419 | * generated a pagefault, so retry the | ||
| 1420 | * user-access and the wakeup: | ||
| 1421 | */ | ||
| 1422 | if (ret == -EFAULT) | ||
| 1423 | goto pi_faulted; | ||
| 1424 | goto out_unlock; | ||
| 1425 | } | ||
| 1426 | /* | ||
| 1427 | * No waiters - kernel unlocks the futex: | ||
| 1428 | */ | ||
| 1429 | if (!(uval & FUTEX_OWNER_DIED)) { | ||
| 1430 | ret = unlock_futex_pi(uaddr, uval); | ||
| 1431 | if (ret == -EFAULT) | ||
| 1432 | goto pi_faulted; | ||
| 1433 | } | ||
| 1434 | |||
| 1435 | out_unlock: | ||
| 1436 | spin_unlock(&hb->lock); | ||
| 1437 | out: | ||
| 728 | up_read(¤t->mm->mmap_sem); | 1438 | up_read(¤t->mm->mmap_sem); |
| 1439 | |||
| 1440 | return ret; | ||
| 1441 | |||
| 1442 | pi_faulted: | ||
| 1443 | /* | ||
| 1444 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
| 1445 | * non-atomically. Therefore, if get_user below is not | ||
| 1446 | * enough, we need to handle the fault ourselves, while | ||
| 1447 | * still holding the mmap_sem. | ||
| 1448 | */ | ||
| 1449 | if (attempt++) { | ||
| 1450 | if (futex_handle_fault((unsigned long)uaddr, attempt)) { | ||
| 1451 | ret = -EFAULT; | ||
| 1452 | goto out_unlock; | ||
| 1453 | } | ||
| 1454 | goto retry_locked; | ||
| 1455 | } | ||
| 1456 | |||
| 1457 | spin_unlock(&hb->lock); | ||
| 1458 | up_read(¤t->mm->mmap_sem); | ||
| 1459 | |||
| 1460 | ret = get_user(uval, uaddr); | ||
| 1461 | if (!ret && (uval != -EFAULT)) | ||
| 1462 | goto retry; | ||
| 1463 | |||
| 729 | return ret; | 1464 | return ret; |
| 730 | } | 1465 | } |
| 731 | 1466 | ||
| @@ -735,6 +1470,7 @@ static int futex_close(struct inode *inode, struct file *filp) | |||
| 735 | 1470 | ||
| 736 | unqueue_me(q); | 1471 | unqueue_me(q); |
| 737 | kfree(q); | 1472 | kfree(q); |
| 1473 | |||
| 738 | return 0; | 1474 | return 0; |
| 739 | } | 1475 | } |
| 740 | 1476 | ||
| @@ -766,7 +1502,7 @@ static struct file_operations futex_fops = { | |||
| 766 | * Signal allows caller to avoid the race which would occur if they | 1502 | * Signal allows caller to avoid the race which would occur if they |
| 767 | * set the sigio stuff up afterwards. | 1503 | * set the sigio stuff up afterwards. |
| 768 | */ | 1504 | */ |
| 769 | static int futex_fd(unsigned long uaddr, int signal) | 1505 | static int futex_fd(u32 __user *uaddr, int signal) |
| 770 | { | 1506 | { |
| 771 | struct futex_q *q; | 1507 | struct futex_q *q; |
| 772 | struct file *filp; | 1508 | struct file *filp; |
| @@ -803,6 +1539,7 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
| 803 | err = -ENOMEM; | 1539 | err = -ENOMEM; |
| 804 | goto error; | 1540 | goto error; |
| 805 | } | 1541 | } |
| 1542 | q->pi_state = NULL; | ||
| 806 | 1543 | ||
| 807 | down_read(¤t->mm->mmap_sem); | 1544 | down_read(¤t->mm->mmap_sem); |
| 808 | err = get_futex_key(uaddr, &q->key); | 1545 | err = get_futex_key(uaddr, &q->key); |
| @@ -840,7 +1577,7 @@ error: | |||
| 840 | * Implementation: user-space maintains a per-thread list of locks it | 1577 | * Implementation: user-space maintains a per-thread list of locks it |
| 841 | * is holding. Upon do_exit(), the kernel carefully walks this list, | 1578 | * is holding. Upon do_exit(), the kernel carefully walks this list, |
| 842 | * and marks all locks that are owned by this thread with the | 1579 | * and marks all locks that are owned by this thread with the |
| 843 | * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is | 1580 | * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is |
| 844 | * always manipulated with the lock held, so the list is private and | 1581 | * always manipulated with the lock held, so the list is private and |
| 845 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' | 1582 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' |
| 846 | * field, to allow the kernel to clean up if the thread dies after | 1583 | * field, to allow the kernel to clean up if the thread dies after |
| @@ -887,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, | |||
| 887 | struct task_struct *p; | 1624 | struct task_struct *p; |
| 888 | 1625 | ||
| 889 | ret = -ESRCH; | 1626 | ret = -ESRCH; |
| 890 | read_lock(&tasklist_lock); | 1627 | rcu_read_lock(); |
| 891 | p = find_task_by_pid(pid); | 1628 | p = find_task_by_pid(pid); |
| 892 | if (!p) | 1629 | if (!p) |
| 893 | goto err_unlock; | 1630 | goto err_unlock; |
| @@ -896,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, | |||
| 896 | !capable(CAP_SYS_PTRACE)) | 1633 | !capable(CAP_SYS_PTRACE)) |
| 897 | goto err_unlock; | 1634 | goto err_unlock; |
| 898 | head = p->robust_list; | 1635 | head = p->robust_list; |
| 899 | read_unlock(&tasklist_lock); | 1636 | rcu_read_unlock(); |
| 900 | } | 1637 | } |
| 901 | 1638 | ||
| 902 | if (put_user(sizeof(*head), len_ptr)) | 1639 | if (put_user(sizeof(*head), len_ptr)) |
| @@ -904,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, | |||
| 904 | return put_user(head, head_ptr); | 1641 | return put_user(head, head_ptr); |
| 905 | 1642 | ||
| 906 | err_unlock: | 1643 | err_unlock: |
| 907 | read_unlock(&tasklist_lock); | 1644 | rcu_read_unlock(); |
| 908 | 1645 | ||
| 909 | return ret; | 1646 | return ret; |
| 910 | } | 1647 | } |
| @@ -913,9 +1650,9 @@ err_unlock: | |||
| 913 | * Process a futex-list entry, check whether it's owned by the | 1650 | * Process a futex-list entry, check whether it's owned by the |
| 914 | * dying task, and do notification if so: | 1651 | * dying task, and do notification if so: |
| 915 | */ | 1652 | */ |
| 916 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) | 1653 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) |
| 917 | { | 1654 | { |
| 918 | u32 uval; | 1655 | u32 uval, nval, mval; |
| 919 | 1656 | ||
| 920 | retry: | 1657 | retry: |
| 921 | if (get_user(uval, uaddr)) | 1658 | if (get_user(uval, uaddr)) |
| @@ -932,17 +1669,45 @@ retry: | |||
| 932 | * thread-death.) The rest of the cleanup is done in | 1669 | * thread-death.) The rest of the cleanup is done in |
| 933 | * userspace. | 1670 | * userspace. |
| 934 | */ | 1671 | */ |
| 935 | if (futex_atomic_cmpxchg_inatomic(uaddr, uval, | 1672 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; |
| 936 | uval | FUTEX_OWNER_DIED) != uval) | 1673 | nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); |
| 1674 | |||
| 1675 | if (nval == -EFAULT) | ||
| 1676 | return -1; | ||
| 1677 | |||
| 1678 | if (nval != uval) | ||
| 937 | goto retry; | 1679 | goto retry; |
| 938 | 1680 | ||
| 939 | if (uval & FUTEX_WAITERS) | 1681 | /* |
| 940 | futex_wake((unsigned long)uaddr, 1); | 1682 | * Wake robust non-PI futexes here. The wakeup of |
| 1683 | * PI futexes happens in exit_pi_state(): | ||
| 1684 | */ | ||
| 1685 | if (!pi) { | ||
| 1686 | if (uval & FUTEX_WAITERS) | ||
| 1687 | futex_wake(uaddr, 1); | ||
| 1688 | } | ||
| 941 | } | 1689 | } |
| 942 | return 0; | 1690 | return 0; |
| 943 | } | 1691 | } |
| 944 | 1692 | ||
| 945 | /* | 1693 | /* |
| 1694 | * Fetch a robust-list pointer. Bit 0 signals PI futexes: | ||
| 1695 | */ | ||
| 1696 | static inline int fetch_robust_entry(struct robust_list __user **entry, | ||
| 1697 | struct robust_list __user **head, int *pi) | ||
| 1698 | { | ||
| 1699 | unsigned long uentry; | ||
| 1700 | |||
| 1701 | if (get_user(uentry, (unsigned long *)head)) | ||
| 1702 | return -EFAULT; | ||
| 1703 | |||
| 1704 | *entry = (void *)(uentry & ~1UL); | ||
| 1705 | *pi = uentry & 1; | ||
| 1706 | |||
| 1707 | return 0; | ||
| 1708 | } | ||
| 1709 | |||
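Because user pointers are at least word aligned, bit 0 of every robust-list entry is free and is used to flag PI futexes; fetch_robust_entry() strips that tag and reports it separately. The same tagged-pointer idea in isolation, as a sketch (names illustrative):

    #include <stdint.h>

    /* Pack an "is a PI futex" flag into bit 0 of an aligned pointer and
     * unpack it again - the encoding fetch_robust_entry() undoes. */
    static inline void *tag_pi(void *entry, int is_pi)
    {
            return (void *)((uintptr_t)entry | (is_pi ? 1u : 0u));
    }

    static inline void *untag_pi(void *tagged, int *is_pi)
    {
            uintptr_t v = (uintptr_t)tagged;

            *is_pi = (int)(v & 1);
            return (void *)(v & ~(uintptr_t)1);
    }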
| 1710 | /* | ||
| 946 | * Walk curr->robust_list (very carefully, it's a userspace list!) | 1711 | * Walk curr->robust_list (very carefully, it's a userspace list!) |
| 947 | * and mark any locks found there dead, and notify any waiters. | 1712 | * and mark any locks found there dead, and notify any waiters. |
| 948 | * | 1713 | * |
| @@ -952,14 +1717,14 @@ void exit_robust_list(struct task_struct *curr) | |||
| 952 | { | 1717 | { |
| 953 | struct robust_list_head __user *head = curr->robust_list; | 1718 | struct robust_list_head __user *head = curr->robust_list; |
| 954 | struct robust_list __user *entry, *pending; | 1719 | struct robust_list __user *entry, *pending; |
| 955 | unsigned int limit = ROBUST_LIST_LIMIT; | 1720 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; |
| 956 | unsigned long futex_offset; | 1721 | unsigned long futex_offset; |
| 957 | 1722 | ||
| 958 | /* | 1723 | /* |
| 959 | * Fetch the list head (which was registered earlier, via | 1724 | * Fetch the list head (which was registered earlier, via |
| 960 | * sys_set_robust_list()): | 1725 | * sys_set_robust_list()): |
| 961 | */ | 1726 | */ |
| 962 | if (get_user(entry, &head->list.next)) | 1727 | if (fetch_robust_entry(&entry, &head->list.next, &pi)) |
| 963 | return; | 1728 | return; |
| 964 | /* | 1729 | /* |
| 965 | * Fetch the relative futex offset: | 1730 | * Fetch the relative futex offset: |
| @@ -970,24 +1735,25 @@ void exit_robust_list(struct task_struct *curr) | |||
| 970 | * Fetch any possibly pending lock-add first, and handle it | 1735 | * Fetch any possibly pending lock-add first, and handle it |
| 971 | * if it exists: | 1736 | * if it exists: |
| 972 | */ | 1737 | */ |
| 973 | if (get_user(pending, &head->list_op_pending)) | 1738 | if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) |
| 974 | return; | 1739 | return; |
| 1740 | |||
| 975 | if (pending) | 1741 | if (pending) |
| 976 | handle_futex_death((void *)pending + futex_offset, curr); | 1742 | handle_futex_death((void *)pending + futex_offset, curr, pip); |
| 977 | 1743 | ||
| 978 | while (entry != &head->list) { | 1744 | while (entry != &head->list) { |
| 979 | /* | 1745 | /* |
| 980 | * A pending lock might already be on the list, so | 1746 | * A pending lock might already be on the list, so |
| 981 | * dont process it twice: | 1747 | * don't process it twice: |
| 982 | */ | 1748 | */ |
| 983 | if (entry != pending) | 1749 | if (entry != pending) |
| 984 | if (handle_futex_death((void *)entry + futex_offset, | 1750 | if (handle_futex_death((void *)entry + futex_offset, |
| 985 | curr)) | 1751 | curr, pi)) |
| 986 | return; | 1752 | return; |
| 987 | /* | 1753 | /* |
| 988 | * Fetch the next entry in the list: | 1754 | * Fetch the next entry in the list: |
| 989 | */ | 1755 | */ |
| 990 | if (get_user(entry, &entry->next)) | 1756 | if (fetch_robust_entry(&entry, &entry->next, &pi)) |
| 991 | return; | 1757 | return; |
| 992 | /* | 1758 | /* |
| 993 | * Avoid excessively long or circular lists: | 1759 | * Avoid excessively long or circular lists: |
| @@ -999,8 +1765,8 @@ void exit_robust_list(struct task_struct *curr) | |||
| 999 | } | 1765 | } |
| 1000 | } | 1766 | } |
| 1001 | 1767 | ||
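The walk above only does something if the exiting thread registered a list head beforehand with sys_set_robust_list(); glibc normally issues that call once per thread, but it can also be done directly. A hedged sketch of the registration (the head would really live in thread-local storage, futex_offset depends on the caller's lock layout, and error handling is omitted):

    #define _GNU_SOURCE
    #include <linux/futex.h>
    #include <stddef.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static struct robust_list_head robust_head;

    /* Tell the kernel where this thread's robust-futex list lives, so that
     * exit_robust_list() can mark any held locks FUTEX_OWNER_DIED and wake
     * a waiter if the thread dies without unlocking. */
    static void register_robust_list(void)
    {
            robust_head.list.next = &robust_head.list;  /* empty, circular list */
            robust_head.futex_offset = 0;               /* futex word at offset 0 of each entry */
            robust_head.list_op_pending = NULL;

            syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
    }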
| 1002 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | 1768 | long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, |
| 1003 | unsigned long uaddr2, int val2, int val3) | 1769 | u32 __user *uaddr2, u32 val2, u32 val3) |
| 1004 | { | 1770 | { |
| 1005 | int ret; | 1771 | int ret; |
| 1006 | 1772 | ||
| @@ -1024,6 +1790,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
| 1024 | case FUTEX_WAKE_OP: | 1790 | case FUTEX_WAKE_OP: |
| 1025 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); | 1791 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); |
| 1026 | break; | 1792 | break; |
| 1793 | case FUTEX_LOCK_PI: | ||
| 1794 | ret = futex_lock_pi(uaddr, val, timeout, val2, 0); | ||
| 1795 | break; | ||
| 1796 | case FUTEX_UNLOCK_PI: | ||
| 1797 | ret = futex_unlock_pi(uaddr); | ||
| 1798 | break; | ||
| 1799 | case FUTEX_TRYLOCK_PI: | ||
| 1800 | ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); | ||
| 1801 | break; | ||
| 1027 | default: | 1802 | default: |
| 1028 | ret = -ENOSYS; | 1803 | ret = -ENOSYS; |
| 1029 | } | 1804 | } |
| @@ -1031,29 +1806,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
| 1031 | } | 1806 | } |
| 1032 | 1807 | ||
| 1033 | 1808 | ||
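The three new opcodes dispatched above give userspace a complete PI lock API: FUTEX_LOCK_PI blocks, FUTEX_TRYLOCK_PI fails with EWOULDBLOCK instead of blocking, and FUTEX_UNLOCK_PI hands the lock on. A hedged sketch of a non-blocking acquire that combines the userspace cmpxchg with the kernel trylock (which can also pick up a lock whose owner died); names and error handling are illustrative:

    #define _GNU_SOURCE
    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Returns 0 when the lock was taken, -1 otherwise. */
    static int pi_trylock(_Atomic unsigned int *word)
    {
            unsigned int expected = 0;
            unsigned int tid = (unsigned int)syscall(SYS_gettid);

            if (atomic_compare_exchange_strong(word, &expected, tid))
                    return 0;       /* uncontended fast path */

            /* Kernel-side trylock: handles the FUTEX_WAITERS and
             * FUTEX_OWNER_DIED cases without ever sleeping. */
            return syscall(SYS_futex, word, FUTEX_TRYLOCK_PI, 0, NULL, NULL, 0) == 0 ? 0 : -1;
    }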
| 1034 | asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, | 1809 | asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, |
| 1035 | struct timespec __user *utime, u32 __user *uaddr2, | 1810 | struct timespec __user *utime, u32 __user *uaddr2, |
| 1036 | int val3) | 1811 | u32 val3) |
| 1037 | { | 1812 | { |
| 1038 | struct timespec t; | 1813 | struct timespec t; |
| 1039 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 1814 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
| 1040 | int val2 = 0; | 1815 | u32 val2 = 0; |
| 1041 | 1816 | ||
| 1042 | if (utime && (op == FUTEX_WAIT)) { | 1817 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { |
| 1043 | if (copy_from_user(&t, utime, sizeof(t)) != 0) | 1818 | if (copy_from_user(&t, utime, sizeof(t)) != 0) |
| 1044 | return -EFAULT; | 1819 | return -EFAULT; |
| 1045 | if (!timespec_valid(&t)) | 1820 | if (!timespec_valid(&t)) |
| 1046 | return -EINVAL; | 1821 | return -EINVAL; |
| 1047 | timeout = timespec_to_jiffies(&t) + 1; | 1822 | if (op == FUTEX_WAIT) |
| 1823 | timeout = timespec_to_jiffies(&t) + 1; | ||
| 1824 | else { | ||
| 1825 | timeout = t.tv_sec; | ||
| 1826 | val2 = t.tv_nsec; | ||
| 1827 | } | ||
| 1048 | } | 1828 | } |
| 1049 | /* | 1829 | /* |
| 1050 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. | 1830 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. |
| 1051 | */ | 1831 | */ |
| 1052 | if (op >= FUTEX_REQUEUE) | 1832 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) |
| 1053 | val2 = (int) (unsigned long) utime; | 1833 | val2 = (u32) (unsigned long) utime; |
| 1054 | 1834 | ||
| 1055 | return do_futex((unsigned long)uaddr, op, val, timeout, | 1835 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); |
| 1056 | (unsigned long)uaddr2, val2, val3); | ||
| 1057 | } | 1836 | } |
| 1058 | 1837 | ||
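Note the two different timeout conventions wired up here: FUTEX_WAIT keeps passing a relative timespec that is converted to jiffies, while FUTEX_LOCK_PI reuses the timeout/val2 pair to carry the seconds and nanoseconds of an absolute CLOCK_REALTIME expiry for the hrtimer-based rt_mutex wait. A hedged sketch of a timed PI lock attempt from userspace (deadline computed by the caller, error handling omitted):

    #define _GNU_SOURCE
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    /* Try to take a PI futex, giving up 'rel_ns' nanoseconds from now.
     * FUTEX_LOCK_PI interprets the timespec as an absolute CLOCK_REALTIME
     * deadline, so convert before the call. */
    static long pi_lock_timed(unsigned int *futex_word, long rel_ns)
    {
            struct timespec abs;

            clock_gettime(CLOCK_REALTIME, &abs);
            abs.tv_nsec += rel_ns;
            abs.tv_sec  += abs.tv_nsec / 1000000000L;
            abs.tv_nsec %= 1000000000L;

            return syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, &abs, NULL, 0);
    }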
| 1059 | static int futexfs_get_sb(struct file_system_type *fs_type, | 1838 | static int futexfs_get_sb(struct file_system_type *fs_type, |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 1ab6a0ea3d14..c5cca3f65cb7 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
| @@ -12,6 +12,23 @@ | |||
| 12 | 12 | ||
| 13 | #include <asm/uaccess.h> | 13 | #include <asm/uaccess.h> |
| 14 | 14 | ||
| 15 | |||
| 16 | /* | ||
| 17 | * Fetch a robust-list pointer. Bit 0 signals PI futexes: | ||
| 18 | */ | ||
| 19 | static inline int | ||
| 20 | fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, | ||
| 21 | compat_uptr_t *head, int *pi) | ||
| 22 | { | ||
| 23 | if (get_user(*uentry, head)) | ||
| 24 | return -EFAULT; | ||
| 25 | |||
| 26 | *entry = compat_ptr((*uentry) & ~1); | ||
| 27 | *pi = (unsigned int)(*uentry) & 1; | ||
| 28 | |||
| 29 | return 0; | ||
| 30 | } | ||
| 31 | |||
| 15 | /* | 32 | /* |
| 16 | * Walk curr->robust_list (very carefully, it's a userspace list!) | 33 | * Walk curr->robust_list (very carefully, it's a userspace list!) |
| 17 | * and mark any locks found there dead, and notify any waiters. | 34 | * and mark any locks found there dead, and notify any waiters. |
| @@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
| 22 | { | 39 | { |
| 23 | struct compat_robust_list_head __user *head = curr->compat_robust_list; | 40 | struct compat_robust_list_head __user *head = curr->compat_robust_list; |
| 24 | struct robust_list __user *entry, *pending; | 41 | struct robust_list __user *entry, *pending; |
| 42 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; | ||
| 25 | compat_uptr_t uentry, upending; | 43 | compat_uptr_t uentry, upending; |
| 26 | unsigned int limit = ROBUST_LIST_LIMIT; | ||
| 27 | compat_long_t futex_offset; | 44 | compat_long_t futex_offset; |
| 28 | 45 | ||
| 29 | /* | 46 | /* |
| 30 | * Fetch the list head (which was registered earlier, via | 47 | * Fetch the list head (which was registered earlier, via |
| 31 | * sys_set_robust_list()): | 48 | * sys_set_robust_list()): |
| 32 | */ | 49 | */ |
| 33 | if (get_user(uentry, &head->list.next)) | 50 | if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) |
| 34 | return; | 51 | return; |
| 35 | entry = compat_ptr(uentry); | ||
| 36 | /* | 52 | /* |
| 37 | * Fetch the relative futex offset: | 53 | * Fetch the relative futex offset: |
| 38 | */ | 54 | */ |
| @@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
| 42 | * Fetch any possibly pending lock-add first, and handle it | 58 | * Fetch any possibly pending lock-add first, and handle it |
| 43 | * if it exists: | 59 | * if it exists: |
| 44 | */ | 60 | */ |
| 45 | if (get_user(upending, &head->list_op_pending)) | 61 | if (fetch_robust_entry(&upending, &pending, |
| 62 | &head->list_op_pending, &pip)) | ||
| 46 | return; | 63 | return; |
| 47 | pending = compat_ptr(upending); | ||
| 48 | if (upending) | 64 | if (upending) |
| 49 | handle_futex_death((void *)pending + futex_offset, curr); | 65 | handle_futex_death((void *)pending + futex_offset, curr, pip); |
| 50 | 66 | ||
| 51 | while (compat_ptr(uentry) != &head->list) { | 67 | while (compat_ptr(uentry) != &head->list) { |
| 52 | /* | 68 | /* |
| @@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
| 55 | */ | 71 | */ |
| 56 | if (entry != pending) | 72 | if (entry != pending) |
| 57 | if (handle_futex_death((void *)entry + futex_offset, | 73 | if (handle_futex_death((void *)entry + futex_offset, |
| 58 | curr)) | 74 | curr, pi)) |
| 59 | return; | 75 | return; |
| 60 | 76 | ||
| 61 | /* | 77 | /* |
| 62 | * Fetch the next entry in the list: | 78 | * Fetch the next entry in the list: |
| 63 | */ | 79 | */ |
| 64 | if (get_user(uentry, (compat_uptr_t *)&entry->next)) | 80 | if (fetch_robust_entry(&uentry, &entry, |
| 81 | (compat_uptr_t *)&entry->next, &pi)) | ||
| 65 | return; | 82 | return; |
| 66 | entry = compat_ptr(uentry); | ||
| 67 | /* | 83 | /* |
| 68 | * Avoid excessively long or circular lists: | 84 | * Avoid excessively long or circular lists: |
| 69 | */ | 85 | */ |
| @@ -129,16 +145,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, | |||
| 129 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 145 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
| 130 | int val2 = 0; | 146 | int val2 = 0; |
| 131 | 147 | ||
| 132 | if (utime && (op == FUTEX_WAIT)) { | 148 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { |
| 133 | if (get_compat_timespec(&t, utime)) | 149 | if (get_compat_timespec(&t, utime)) |
| 134 | return -EFAULT; | 150 | return -EFAULT; |
| 135 | if (!timespec_valid(&t)) | 151 | if (!timespec_valid(&t)) |
| 136 | return -EINVAL; | 152 | return -EINVAL; |
| 137 | timeout = timespec_to_jiffies(&t) + 1; | 153 | if (op == FUTEX_WAIT) |
| 154 | timeout = timespec_to_jiffies(&t) + 1; | ||
| 155 | else { | ||
| 156 | timeout = t.tv_sec; | ||
| 157 | val2 = t.tv_nsec; | ||
| 158 | } | ||
| 138 | } | 159 | } |
| 139 | if (op >= FUTEX_REQUEUE) | 160 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) |
| 140 | val2 = (int) (unsigned long) utime; | 161 | val2 = (int) (unsigned long) utime; |
| 141 | 162 | ||
| 142 | return do_futex((unsigned long)uaddr, op, val, timeout, | 163 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); |
| 143 | (unsigned long)uaddr2, val2, val3); | ||
| 144 | } | 164 | } |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 18324305724a..d0ba190dfeb6 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = | |||
| 98 | 98 | ||
| 99 | /** | 99 | /** |
| 100 | * ktime_get_ts - get the monotonic clock in timespec format | 100 | * ktime_get_ts - get the monotonic clock in timespec format |
| 101 | * | ||
| 102 | * @ts: pointer to timespec variable | 101 | * @ts: pointer to timespec variable |
| 103 | * | 102 | * |
| 104 | * The function calculates the monotonic clock from the realtime | 103 | * The function calculates the monotonic clock from the realtime |
| @@ -188,7 +187,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) | |||
| 188 | { | 187 | { |
| 189 | struct hrtimer_base *new_base; | 188 | struct hrtimer_base *new_base; |
| 190 | 189 | ||
| 191 | new_base = &__get_cpu_var(hrtimer_bases[base->index]); | 190 | new_base = &__get_cpu_var(hrtimer_bases)[base->index]; |
| 192 | 191 | ||
| 193 | if (base != new_base) { | 192 | if (base != new_base) { |
| 194 | /* | 193 | /* |
| @@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
| 238 | # ifndef CONFIG_KTIME_SCALAR | 237 | # ifndef CONFIG_KTIME_SCALAR |
| 239 | /** | 238 | /** |
| 240 | * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable | 239 | * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable |
| 241 | * | ||
| 242 | * @kt: addend | 240 | * @kt: addend |
| 243 | * @nsec: the scalar nsec value to add | 241 | * @nsec: the scalar nsec value to add |
| 244 | * | 242 | * |
| @@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
| 299 | 297 | ||
| 300 | /** | 298 | /** |
| 301 | * hrtimer_forward - forward the timer expiry | 299 | * hrtimer_forward - forward the timer expiry |
| 302 | * | ||
| 303 | * @timer: hrtimer to forward | 300 | * @timer: hrtimer to forward |
| 304 | * @now: forward past this time | 301 | * @now: forward past this time |
| 305 | * @interval: the interval to forward | 302 | * @interval: the interval to forward |
| @@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
| 411 | 408 | ||
| 412 | /** | 409 | /** |
| 413 | * hrtimer_start - (re)start an relative timer on the current CPU | 410 | * hrtimer_start - (re)start an relative timer on the current CPU |
| 414 | * | ||
| 415 | * @timer: the timer to be added | 411 | * @timer: the timer to be added |
| 416 | * @tim: expiry time | 412 | * @tim: expiry time |
| 417 | * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) | 413 | * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) |
| @@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); | |||
| 460 | 456 | ||
| 461 | /** | 457 | /** |
| 462 | * hrtimer_try_to_cancel - try to deactivate a timer | 458 | * hrtimer_try_to_cancel - try to deactivate a timer |
| 463 | * | ||
| 464 | * @timer: hrtimer to stop | 459 | * @timer: hrtimer to stop |
| 465 | * | 460 | * |
| 466 | * Returns: | 461 | * Returns: |
| 467 | * 0 when the timer was not active | 462 | * 0 when the timer was not active |
| 468 | * 1 when the timer was active | 463 | * 1 when the timer was active |
| 469 | * -1 when the timer is currently executing the callback function and | 464 | * -1 when the timer is currently executing the callback function and

| 470 | * can not be stopped | 465 | * cannot be stopped |
| 471 | */ | 466 | */ |
| 472 | int hrtimer_try_to_cancel(struct hrtimer *timer) | 467 | int hrtimer_try_to_cancel(struct hrtimer *timer) |
| 473 | { | 468 | { |
| @@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); | |||
| 489 | 484 | ||
| 490 | /** | 485 | /** |
| 491 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. | 486 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. |
| 492 | * | ||
| 493 | * @timer: the timer to be cancelled | 487 | * @timer: the timer to be cancelled |
| 494 | * | 488 | * |
| 495 | * Returns: | 489 | * Returns: |
| @@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); | |||
| 510 | 504 | ||
| 511 | /** | 505 | /** |
| 512 | * hrtimer_get_remaining - get remaining time for the timer | 506 | * hrtimer_get_remaining - get remaining time for the timer |
| 513 | * | ||
| 514 | * @timer: the timer to read | 507 | * @timer: the timer to read |
| 515 | */ | 508 | */ |
| 516 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | 509 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) |
| @@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void) | |||
| 564 | 557 | ||
| 565 | /** | 558 | /** |
| 566 | * hrtimer_init - initialize a timer to the given clock | 559 | * hrtimer_init - initialize a timer to the given clock |
| 567 | * | ||
| 568 | * @timer: the timer to be initialized | 560 | * @timer: the timer to be initialized |
| 569 | * @clock_id: the clock to be used | 561 | * @clock_id: the clock to be used |
| 570 | * @mode: timer mode abs/rel | 562 | * @mode: timer mode abs/rel |
| @@ -576,7 +568,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
| 576 | 568 | ||
| 577 | memset(timer, 0, sizeof(struct hrtimer)); | 569 | memset(timer, 0, sizeof(struct hrtimer)); |
| 578 | 570 | ||
| 579 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | 571 | bases = __raw_get_cpu_var(hrtimer_bases); |
| 580 | 572 | ||
| 581 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) | 573 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) |
| 582 | clock_id = CLOCK_MONOTONIC; | 574 | clock_id = CLOCK_MONOTONIC; |
| @@ -588,7 +580,6 @@ EXPORT_SYMBOL_GPL(hrtimer_init); | |||
| 588 | 580 | ||
| 589 | /** | 581 | /** |
| 590 | * hrtimer_get_res - get the timer resolution for a clock | 582 | * hrtimer_get_res - get the timer resolution for a clock |
| 591 | * | ||
| 592 | * @which_clock: which clock to query | 583 | * @which_clock: which clock to query |
| 593 | * @tp: pointer to timespec variable to store the resolution | 584 | * @tp: pointer to timespec variable to store the resolution |
| 594 | * | 585 | * |
| @@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
| 599 | { | 590 | { |
| 600 | struct hrtimer_base *bases; | 591 | struct hrtimer_base *bases; |
| 601 | 592 | ||
| 602 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | 593 | bases = __raw_get_cpu_var(hrtimer_bases); |
| 603 | *tp = ktime_to_timespec(bases[which_clock].resolution); | 594 | *tp = ktime_to_timespec(bases[which_clock].resolution); |
| 604 | 595 | ||
| 605 | return 0; | 596 | return 0; |
| @@ -678,7 +669,7 @@ static int hrtimer_wakeup(struct hrtimer *timer) | |||
| 678 | return HRTIMER_NORESTART; | 669 | return HRTIMER_NORESTART; |
| 679 | } | 670 | } |
| 680 | 671 | ||
| 681 | void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task) | 672 | void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) |
| 682 | { | 673 | { |
| 683 | sl->timer.function = hrtimer_wakeup; | 674 | sl->timer.function = hrtimer_wakeup; |
| 684 | sl->task = task; | 675 | sl->task = task; |
| @@ -702,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
| 702 | return t->task == NULL; | 693 | return t->task == NULL; |
| 703 | } | 694 | } |
| 704 | 695 | ||
| 705 | static long __sched nanosleep_restart(struct restart_block *restart) | 696 | long __sched hrtimer_nanosleep_restart(struct restart_block *restart) |
| 706 | { | 697 | { |
| 707 | struct hrtimer_sleeper t; | 698 | struct hrtimer_sleeper t; |
| 708 | struct timespec __user *rmtp; | 699 | struct timespec __user *rmtp; |
| @@ -711,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart) | |||
| 711 | 702 | ||
| 712 | restart->fn = do_no_restart_syscall; | 703 | restart->fn = do_no_restart_syscall; |
| 713 | 704 | ||
| 714 | hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS); | 705 | hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS); |
| 715 | t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; | 706 | t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; |
| 716 | 707 | ||
| 717 | if (do_nanosleep(&t, HRTIMER_ABS)) | 708 | if (do_nanosleep(&t, HRTIMER_ABS)) |
| 718 | return 0; | 709 | return 0; |
| 719 | 710 | ||
| 720 | rmtp = (struct timespec __user *) restart->arg2; | 711 | rmtp = (struct timespec __user *) restart->arg1; |
| 721 | if (rmtp) { | 712 | if (rmtp) { |
| 722 | time = ktime_sub(t.timer.expires, t.timer.base->get_time()); | 713 | time = ktime_sub(t.timer.expires, t.timer.base->get_time()); |
| 723 | if (time.tv64 <= 0) | 714 | if (time.tv64 <= 0) |
| @@ -727,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart) | |||
| 727 | return -EFAULT; | 718 | return -EFAULT; |
| 728 | } | 719 | } |
| 729 | 720 | ||
| 730 | restart->fn = nanosleep_restart; | 721 | restart->fn = hrtimer_nanosleep_restart; |
| 731 | 722 | ||
| 732 | /* The other values in restart are already filled in */ | 723 | /* The other values in restart are already filled in */ |
| 733 | return -ERESTART_RESTARTBLOCK; | 724 | return -ERESTART_RESTARTBLOCK; |
| @@ -760,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
| 760 | } | 751 | } |
| 761 | 752 | ||
| 762 | restart = ¤t_thread_info()->restart_block; | 753 | restart = ¤t_thread_info()->restart_block; |
| 763 | restart->fn = nanosleep_restart; | 754 | restart->fn = hrtimer_nanosleep_restart; |
| 764 | restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF; | 755 | restart->arg0 = (unsigned long) t.timer.base->index; |
| 765 | restart->arg1 = t.timer.expires.tv64 >> 32; | 756 | restart->arg1 = (unsigned long) rmtp; |
| 766 | restart->arg2 = (unsigned long) rmtp; | 757 | restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; |
| 767 | restart->arg3 = (unsigned long) t.timer.base->index; | 758 | restart->arg3 = t.timer.expires.tv64 >> 32; |
| 768 | 759 | ||
| 769 | return -ERESTART_RESTARTBLOCK; | 760 | return -ERESTART_RESTARTBLOCK; |
| 770 | } | 761 | } |
| @@ -791,8 +782,10 @@ static void __devinit init_hrtimers_cpu(int cpu) | |||
| 791 | struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); | 782 | struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); |
| 792 | int i; | 783 | int i; |
| 793 | 784 | ||
| 794 | for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) | 785 | for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { |
| 795 | spin_lock_init(&base->lock); | 786 | spin_lock_init(&base->lock); |
| 787 | lockdep_set_class(&base->lock, &base->lock_key); | ||
| 788 | } | ||
| 796 | } | 789 | } |
| 797 | 790 | ||
| 798 | #ifdef CONFIG_HOTPLUG_CPU | 791 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -842,7 +835,7 @@ static void migrate_hrtimers(int cpu) | |||
| 842 | } | 835 | } |
| 843 | #endif /* CONFIG_HOTPLUG_CPU */ | 836 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 844 | 837 | ||
| 845 | static int hrtimer_cpu_notify(struct notifier_block *self, | 838 | static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, |
| 846 | unsigned long action, void *hcpu) | 839 | unsigned long action, void *hcpu) |
| 847 | { | 840 | { |
| 848 | long cpu = (long)hcpu; | 841 | long cpu = (long)hcpu; |
| @@ -866,7 +859,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self, | |||
| 866 | return NOTIFY_OK; | 859 | return NOTIFY_OK; |
| 867 | } | 860 | } |
| 868 | 861 | ||
| 869 | static struct notifier_block hrtimers_nb = { | 862 | static struct notifier_block __cpuinitdata hrtimers_nb = { |
| 870 | .notifier_call = hrtimer_cpu_notify, | 863 | .notifier_call = hrtimer_cpu_notify, |
| 871 | }; | 864 | }; |
| 872 | 865 | ||
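The restart-block rework in the hrtimer.c hunks above packs the 64-bit ktime expiry into two unsigned long slots (arg2/arg3) so it survives on 32-bit architectures, and hrtimer_nanosleep_restart() reassembles it. A standalone sketch of the same split-and-rejoin, with made-up struct and function names, purely to illustrate the encoding:

#include <stdint.h>
#include <assert.h>

struct fake_restart_block {
        unsigned long arg0, arg1, arg2, arg3;
};

static void pack_expiry(struct fake_restart_block *rb, uint64_t expires)
{
        rb->arg2 = expires & 0xFFFFFFFFUL;      /* low 32 bits */
        rb->arg3 = expires >> 32;               /* high 32 bits */
}

static uint64_t unpack_expiry(const struct fake_restart_block *rb)
{
        return ((uint64_t) rb->arg3 << 32) | (uint32_t) rb->arg2;
}

int main(void)
{
        struct fake_restart_block rb;

        pack_expiry(&rb, 0x123456789abcdef0ULL);
        assert(unpack_expiry(&rb) == 0x123456789abcdef0ULL);
        return 0;
}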
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 9f77f50d8143..1dab0ac3f797 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | 1 | ||
| 2 | obj-y := handle.o manage.o spurious.o | 2 | obj-y := handle.o manage.o spurious.o resend.o chip.o |
| 3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
| 4 | obj-$(CONFIG_PROC_FS) += proc.o | 4 | obj-$(CONFIG_PROC_FS) += proc.o |
| 5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 3467097ca61a..533068cfb607 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
| @@ -11,12 +11,14 @@ | |||
| 11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
| 12 | #include <linux/delay.h> | 12 | #include <linux/delay.h> |
| 13 | 13 | ||
| 14 | #include "internals.h" | ||
| 15 | |||
| 14 | /* | 16 | /* |
| 15 | * Autodetection depends on the fact that any interrupt that | 17 | * Autodetection depends on the fact that any interrupt that |
| 16 | * comes in on to an unassigned handler will get stuck with | 18 | * comes in on to an unassigned handler will get stuck with |
| 17 | * "IRQ_WAITING" cleared and the interrupt disabled. | 19 | * "IRQ_WAITING" cleared and the interrupt disabled. |
| 18 | */ | 20 | */ |
| 19 | static DECLARE_MUTEX(probe_sem); | 21 | static DEFINE_MUTEX(probing_active); |
| 20 | 22 | ||
| 21 | /** | 23 | /** |
| 22 | * probe_irq_on - begin an interrupt autodetect | 24 | * probe_irq_on - begin an interrupt autodetect |
| @@ -27,11 +29,11 @@ static DECLARE_MUTEX(probe_sem); | |||
| 27 | */ | 29 | */ |
| 28 | unsigned long probe_irq_on(void) | 30 | unsigned long probe_irq_on(void) |
| 29 | { | 31 | { |
| 30 | unsigned long val; | 32 | struct irq_desc *desc; |
| 31 | irq_desc_t *desc; | 33 | unsigned long mask; |
| 32 | unsigned int i; | 34 | unsigned int i; |
| 33 | 35 | ||
| 34 | down(&probe_sem); | 36 | mutex_lock(&probing_active); |
| 35 | /* | 37 | /* |
| 36 | * something may have generated an irq long ago and we want to | 38 | * something may have generated an irq long ago and we want to |
| 37 | * flush such a longstanding irq before considering it as spurious. | 39 | * flush such a longstanding irq before considering it as spurious. |
| @@ -40,8 +42,21 @@ unsigned long probe_irq_on(void) | |||
| 40 | desc = irq_desc + i; | 42 | desc = irq_desc + i; |
| 41 | 43 | ||
| 42 | spin_lock_irq(&desc->lock); | 44 | spin_lock_irq(&desc->lock); |
| 43 | if (!irq_desc[i].action) | 45 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { |
| 44 | irq_desc[i].handler->startup(i); | 46 | /* |
| 47 | * An old-style architecture might still have | ||
| 48 | * the handle_bad_irq handler there: | ||
| 49 | */ | ||
| 50 | compat_irq_chip_set_default_handler(desc); | ||
| 51 | |||
| 52 | /* | ||
| 53 | * Some chips need to know about probing in | ||
| 54 | * progress: | ||
| 55 | */ | ||
| 56 | if (desc->chip->set_type) | ||
| 57 | desc->chip->set_type(i, IRQ_TYPE_PROBE); | ||
| 58 | desc->chip->startup(i); | ||
| 59 | } | ||
| 45 | spin_unlock_irq(&desc->lock); | 60 | spin_unlock_irq(&desc->lock); |
| 46 | } | 61 | } |
| 47 | 62 | ||
| @@ -57,9 +72,9 @@ unsigned long probe_irq_on(void) | |||
| 57 | desc = irq_desc + i; | 72 | desc = irq_desc + i; |
| 58 | 73 | ||
| 59 | spin_lock_irq(&desc->lock); | 74 | spin_lock_irq(&desc->lock); |
| 60 | if (!desc->action) { | 75 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { |
| 61 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; | 76 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; |
| 62 | if (desc->handler->startup(i)) | 77 | if (desc->chip->startup(i)) |
| 63 | desc->status |= IRQ_PENDING; | 78 | desc->status |= IRQ_PENDING; |
| 64 | } | 79 | } |
| 65 | spin_unlock_irq(&desc->lock); | 80 | spin_unlock_irq(&desc->lock); |
| @@ -73,11 +88,11 @@ unsigned long probe_irq_on(void) | |||
| 73 | /* | 88 | /* |
| 74 | * Now filter out any obviously spurious interrupts | 89 | * Now filter out any obviously spurious interrupts |
| 75 | */ | 90 | */ |
| 76 | val = 0; | 91 | mask = 0; |
| 77 | for (i = 0; i < NR_IRQS; i++) { | 92 | for (i = 0; i < NR_IRQS; i++) { |
| 78 | irq_desc_t *desc = irq_desc + i; | ||
| 79 | unsigned int status; | 93 | unsigned int status; |
| 80 | 94 | ||
| 95 | desc = irq_desc + i; | ||
| 81 | spin_lock_irq(&desc->lock); | 96 | spin_lock_irq(&desc->lock); |
| 82 | status = desc->status; | 97 | status = desc->status; |
| 83 | 98 | ||
| @@ -85,17 +100,16 @@ unsigned long probe_irq_on(void) | |||
| 85 | /* It triggered already - consider it spurious. */ | 100 | /* It triggered already - consider it spurious. */ |
| 86 | if (!(status & IRQ_WAITING)) { | 101 | if (!(status & IRQ_WAITING)) { |
| 87 | desc->status = status & ~IRQ_AUTODETECT; | 102 | desc->status = status & ~IRQ_AUTODETECT; |
| 88 | desc->handler->shutdown(i); | 103 | desc->chip->shutdown(i); |
| 89 | } else | 104 | } else |
| 90 | if (i < 32) | 105 | if (i < 32) |
| 91 | val |= 1 << i; | 106 | mask |= 1 << i; |
| 92 | } | 107 | } |
| 93 | spin_unlock_irq(&desc->lock); | 108 | spin_unlock_irq(&desc->lock); |
| 94 | } | 109 | } |
| 95 | 110 | ||
| 96 | return val; | 111 | return mask; |
| 97 | } | 112 | } |
| 98 | |||
| 99 | EXPORT_SYMBOL(probe_irq_on); | 113 | EXPORT_SYMBOL(probe_irq_on); |
| 100 | 114 | ||
| 101 | /** | 115 | /** |
| @@ -117,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val) | |||
| 117 | 131 | ||
| 118 | mask = 0; | 132 | mask = 0; |
| 119 | for (i = 0; i < NR_IRQS; i++) { | 133 | for (i = 0; i < NR_IRQS; i++) { |
| 120 | irq_desc_t *desc = irq_desc + i; | 134 | struct irq_desc *desc = irq_desc + i; |
| 121 | unsigned int status; | 135 | unsigned int status; |
| 122 | 136 | ||
| 123 | spin_lock_irq(&desc->lock); | 137 | spin_lock_irq(&desc->lock); |
| @@ -128,11 +142,11 @@ unsigned int probe_irq_mask(unsigned long val) | |||
| 128 | mask |= 1 << i; | 142 | mask |= 1 << i; |
| 129 | 143 | ||
| 130 | desc->status = status & ~IRQ_AUTODETECT; | 144 | desc->status = status & ~IRQ_AUTODETECT; |
| 131 | desc->handler->shutdown(i); | 145 | desc->chip->shutdown(i); |
| 132 | } | 146 | } |
| 133 | spin_unlock_irq(&desc->lock); | 147 | spin_unlock_irq(&desc->lock); |
| 134 | } | 148 | } |
| 135 | up(&probe_sem); | 149 | mutex_unlock(&probing_active); |
| 136 | 150 | ||
| 137 | return mask & val; | 151 | return mask & val; |
| 138 | } | 152 | } |
| @@ -160,7 +174,7 @@ int probe_irq_off(unsigned long val) | |||
| 160 | int i, irq_found = 0, nr_irqs = 0; | 174 | int i, irq_found = 0, nr_irqs = 0; |
| 161 | 175 | ||
| 162 | for (i = 0; i < NR_IRQS; i++) { | 176 | for (i = 0; i < NR_IRQS; i++) { |
| 163 | irq_desc_t *desc = irq_desc + i; | 177 | struct irq_desc *desc = irq_desc + i; |
| 164 | unsigned int status; | 178 | unsigned int status; |
| 165 | 179 | ||
| 166 | spin_lock_irq(&desc->lock); | 180 | spin_lock_irq(&desc->lock); |
| @@ -173,16 +187,16 @@ int probe_irq_off(unsigned long val) | |||
| 173 | nr_irqs++; | 187 | nr_irqs++; |
| 174 | } | 188 | } |
| 175 | desc->status = status & ~IRQ_AUTODETECT; | 189 | desc->status = status & ~IRQ_AUTODETECT; |
| 176 | desc->handler->shutdown(i); | 190 | desc->chip->shutdown(i); |
| 177 | } | 191 | } |
| 178 | spin_unlock_irq(&desc->lock); | 192 | spin_unlock_irq(&desc->lock); |
| 179 | } | 193 | } |
| 180 | up(&probe_sem); | 194 | mutex_unlock(&probing_active); |
| 181 | 195 | ||
| 182 | if (nr_irqs > 1) | 196 | if (nr_irqs > 1) |
| 183 | irq_found = -irq_found; | 197 | irq_found = -irq_found; |
| 198 | |||
| 184 | return irq_found; | 199 | return irq_found; |
| 185 | } | 200 | } |
| 186 | |||
| 187 | EXPORT_SYMBOL(probe_irq_off); | 201 | EXPORT_SYMBOL(probe_irq_off); |
| 188 | 202 | ||
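From a driver's point of view the probing API above is unchanged by the mutex conversion. A hedged sketch of the classic usage pattern for an old ISA-style device (the device-poke helper and names are hypothetical):

#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/errno.h>

static void mydev_trigger_interrupt(void);      /* hypothetical: make the card raise its IRQ */

static int mydev_find_irq(void)
{
        unsigned long mask;
        int irq;

        mask = probe_irq_on();          /* start autodetection */
        mydev_trigger_interrupt();
        mdelay(10);                     /* give the interrupt time to arrive */
        irq = probe_irq_off(mask);      /* 0: none seen, <0: more than one seen */

        if (irq <= 0)
                return -ENODEV;
        return irq;
}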
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c new file mode 100644 index 000000000000..736cb0bd498f --- /dev/null +++ b/kernel/irq/chip.c | |||
| @@ -0,0 +1,533 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/irq/chip.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
| 5 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
| 6 | * | ||
| 7 | * This file contains the core interrupt handling code, for irq-chip | ||
| 8 | * based architectures. | ||
| 9 | * | ||
| 10 | * Detailed information is available in Documentation/DocBook/genericirq | ||
| 11 | */ | ||
| 12 | |||
| 13 | #include <linux/irq.h> | ||
| 14 | #include <linux/module.h> | ||
| 15 | #include <linux/interrupt.h> | ||
| 16 | #include <linux/kernel_stat.h> | ||
| 17 | |||
| 18 | #include "internals.h" | ||
| 19 | |||
| 20 | /** | ||
| 21 | * set_irq_chip - set the irq chip for an irq | ||
| 22 | * @irq: irq number | ||
| 23 | * @chip: pointer to irq chip description structure | ||
| 24 | */ | ||
| 25 | int set_irq_chip(unsigned int irq, struct irq_chip *chip) | ||
| 26 | { | ||
| 27 | struct irq_desc *desc; | ||
| 28 | unsigned long flags; | ||
| 29 | |||
| 30 | if (irq >= NR_IRQS) { | ||
| 31 | printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); | ||
| 32 | WARN_ON(1); | ||
| 33 | return -EINVAL; | ||
| 34 | } | ||
| 35 | |||
| 36 | if (!chip) | ||
| 37 | chip = &no_irq_chip; | ||
| 38 | |||
| 39 | desc = irq_desc + irq; | ||
| 40 | spin_lock_irqsave(&desc->lock, flags); | ||
| 41 | irq_chip_set_defaults(chip); | ||
| 42 | desc->chip = chip; | ||
| 43 | spin_unlock_irqrestore(&desc->lock, flags); | ||
| 44 | |||
| 45 | return 0; | ||
| 46 | } | ||
| 47 | EXPORT_SYMBOL(set_irq_chip); | ||
| 48 | |||
| 49 | /** | ||
| 50 | * set_irq_type - set the irq type for an irq | ||
| 51 | * @irq: irq number | ||
| 52 | * @type: interrupt type - see include/linux/interrupt.h | ||
| 53 | */ | ||
| 54 | int set_irq_type(unsigned int irq, unsigned int type) | ||
| 55 | { | ||
| 56 | struct irq_desc *desc; | ||
| 57 | unsigned long flags; | ||
| 58 | int ret = -ENXIO; | ||
| 59 | |||
| 60 | if (irq >= NR_IRQS) { | ||
| 61 | printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); | ||
| 62 | return -ENODEV; | ||
| 63 | } | ||
| 64 | |||
| 65 | desc = irq_desc + irq; | ||
| 66 | if (desc->chip->set_type) { | ||
| 67 | spin_lock_irqsave(&desc->lock, flags); | ||
| 68 | ret = desc->chip->set_type(irq, type); | ||
| 69 | spin_unlock_irqrestore(&desc->lock, flags); | ||
| 70 | } | ||
| 71 | return ret; | ||
| 72 | } | ||
| 73 | EXPORT_SYMBOL(set_irq_type); | ||
| 74 | |||
| 75 | /** | ||
| 76 | * set_irq_data - set irq type data for an irq | ||
| 77 | * @irq: Interrupt number | ||
| 78 | * @data: Pointer to interrupt specific data | ||
| 79 | * | ||
| 80 | * Set the hardware irq controller data for an irq | ||
| 81 | */ | ||
| 82 | int set_irq_data(unsigned int irq, void *data) | ||
| 83 | { | ||
| 84 | struct irq_desc *desc; | ||
| 85 | unsigned long flags; | ||
| 86 | |||
| 87 | if (irq >= NR_IRQS) { | ||
| 88 | printk(KERN_ERR | ||
| 89 | "Trying to install controller data for IRQ%d\n", irq); | ||
| 90 | return -EINVAL; | ||
| 91 | } | ||
| 92 | |||
| 93 | desc = irq_desc + irq; | ||
| 94 | spin_lock_irqsave(&desc->lock, flags); | ||
| 95 | desc->handler_data = data; | ||
| 96 | spin_unlock_irqrestore(&desc->lock, flags); | ||
| 97 | return 0; | ||
| 98 | } | ||
| 99 | EXPORT_SYMBOL(set_irq_data); | ||
| 100 | |||
| 101 | /** | ||
| 102 | * set_irq_chip_data - set irq chip data for an irq | ||
| 103 | * @irq: Interrupt number | ||
| 104 | * @data: Pointer to chip specific data | ||
| 105 | * | ||
| 106 | * Set the hardware irq chip data for an irq | ||
| 107 | */ | ||
| 108 | int set_irq_chip_data(unsigned int irq, void *data) | ||
| 109 | { | ||
| 110 | struct irq_desc *desc = irq_desc + irq; | ||
| 111 | unsigned long flags; | ||
| 112 | |||
| 113 | if (irq >= NR_IRQS || !desc->chip) { | ||
| 114 | printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); | ||
| 115 | return -EINVAL; | ||
| 116 | } | ||
| 117 | |||
| 118 | spin_lock_irqsave(&desc->lock, flags); | ||
| 119 | desc->chip_data = data; | ||
| 120 | spin_unlock_irqrestore(&desc->lock, flags); | ||
| 121 | |||
| 122 | return 0; | ||
| 123 | } | ||
| 124 | EXPORT_SYMBOL(set_irq_chip_data); | ||
| 125 | |||
| 126 | /* | ||
| 127 | * default enable function | ||
| 128 | */ | ||
| 129 | static void default_enable(unsigned int irq) | ||
| 130 | { | ||
| 131 | struct irq_desc *desc = irq_desc + irq; | ||
| 132 | |||
| 133 | desc->chip->unmask(irq); | ||
| 134 | desc->status &= ~IRQ_MASKED; | ||
| 135 | } | ||
| 136 | |||
| 137 | /* | ||
| 138 | * default disable function | ||
| 139 | */ | ||
| 140 | static void default_disable(unsigned int irq) | ||
| 141 | { | ||
| 142 | struct irq_desc *desc = irq_desc + irq; | ||
| 143 | |||
| 144 | if (!(desc->status & IRQ_DELAYED_DISABLE)) | ||
| 145 | desc->chip->mask(irq); | ||
| 146 | } | ||
| 147 | |||
| 148 | /* | ||
| 149 | * default startup function | ||
| 150 | */ | ||
| 151 | static unsigned int default_startup(unsigned int irq) | ||
| 152 | { | ||
| 153 | irq_desc[irq].chip->enable(irq); | ||
| 154 | |||
| 155 | return 0; | ||
| 156 | } | ||
| 157 | |||
| 158 | /* | ||
| 159 | * Fixup enable/disable function pointers | ||
| 160 | */ | ||
| 161 | void irq_chip_set_defaults(struct irq_chip *chip) | ||
| 162 | { | ||
| 163 | if (!chip->enable) | ||
| 164 | chip->enable = default_enable; | ||
| 165 | if (!chip->disable) | ||
| 166 | chip->disable = default_disable; | ||
| 167 | if (!chip->startup) | ||
| 168 | chip->startup = default_startup; | ||
| 169 | if (!chip->shutdown) | ||
| 170 | chip->shutdown = chip->disable; | ||
| 171 | if (!chip->name) | ||
| 172 | chip->name = chip->typename; | ||
| 173 | } | ||
| 174 | |||
| 175 | static inline void mask_ack_irq(struct irq_desc *desc, int irq) | ||
| 176 | { | ||
| 177 | if (desc->chip->mask_ack) | ||
| 178 | desc->chip->mask_ack(irq); | ||
| 179 | else { | ||
| 180 | desc->chip->mask(irq); | ||
| 181 | desc->chip->ack(irq); | ||
| 182 | } | ||
| 183 | } | ||
| 184 | |||
| 185 | /** | ||
| 186 | * handle_simple_irq - Simple and software-decoded IRQs. | ||
| 187 | * @irq: the interrupt number | ||
| 188 | * @desc: the interrupt description structure for this irq | ||
| 189 | * @regs: pointer to a register structure | ||
| 190 | * | ||
| 191 | * Simple interrupts are either sent from a demultiplexing interrupt | ||
| 192 | * handler or come from hardware, where no interrupt hardware control | ||
| 193 | * is necessary. | ||
| 194 | * | ||
| 195 | * Note: The caller is expected to handle the ack, clear, mask and | ||
| 196 | * unmask issues if necessary. | ||
| 197 | */ | ||
| 198 | void fastcall | ||
| 199 | handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
| 200 | { | ||
| 201 | struct irqaction *action; | ||
| 202 | irqreturn_t action_ret; | ||
| 203 | const unsigned int cpu = smp_processor_id(); | ||
| 204 | |||
| 205 | spin_lock(&desc->lock); | ||
| 206 | |||
| 207 | if (unlikely(desc->status & IRQ_INPROGRESS)) | ||
| 208 | goto out_unlock; | ||
| 209 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
| 210 | kstat_cpu(cpu).irqs[irq]++; | ||
| 211 | |||
| 212 | action = desc->action; | ||
| 213 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | ||
| 214 | goto out_unlock; | ||
| 215 | |||
| 216 | desc->status |= IRQ_INPROGRESS; | ||
| 217 | spin_unlock(&desc->lock); | ||
| 218 | |||
| 219 | action_ret = handle_IRQ_event(irq, regs, action); | ||
| 220 | if (!noirqdebug) | ||
| 221 | note_interrupt(irq, desc, action_ret, regs); | ||
| 222 | |||
| 223 | spin_lock(&desc->lock); | ||
| 224 | desc->status &= ~IRQ_INPROGRESS; | ||
| 225 | out_unlock: | ||
| 226 | spin_unlock(&desc->lock); | ||
| 227 | } | ||
| 228 | |||
| 229 | /** | ||
| 230 | * handle_level_irq - Level type irq handler | ||
| 231 | * @irq: the interrupt number | ||
| 232 | * @desc: the interrupt description structure for this irq | ||
| 233 | * @regs: pointer to a register structure | ||
| 234 | * | ||
| 235 | * Level type interrupts are active as long as the hardware line has | ||
| 236 | * the active level. This may require masking the interrupt and unmasking | ||
| 237 | * it after the associated handler has acknowledged the device, so the | ||
| 238 | * interrupt line is back to inactive. | ||
| 239 | */ | ||
| 240 | void fastcall | ||
| 241 | handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
| 242 | { | ||
| 243 | unsigned int cpu = smp_processor_id(); | ||
| 244 | struct irqaction *action; | ||
| 245 | irqreturn_t action_ret; | ||
| 246 | |||
| 247 | spin_lock(&desc->lock); | ||
| 248 | mask_ack_irq(desc, irq); | ||
| 249 | |||
| 250 | if (unlikely(desc->status & IRQ_INPROGRESS)) | ||
| 251 | goto out_unlock; | ||
| 252 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
| 253 | kstat_cpu(cpu).irqs[irq]++; | ||
| 254 | |||
| 255 | /* | ||
| 256 | * If it's disabled or no action is available | ||
| 257 | * keep it masked and get out of here | ||
| 258 | */ | ||
| 259 | action = desc->action; | ||
| 260 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | ||
| 261 | desc->status |= IRQ_PENDING; | ||
| 262 | goto out_unlock; | ||
| 263 | } | ||
| 264 | |||
| 265 | desc->status |= IRQ_INPROGRESS; | ||
| 266 | desc->status &= ~IRQ_PENDING; | ||
| 267 | spin_unlock(&desc->lock); | ||
| 268 | |||
| 269 | action_ret = handle_IRQ_event(irq, regs, action); | ||
| 270 | if (!noirqdebug) | ||
| 271 | note_interrupt(irq, desc, action_ret, regs); | ||
| 272 | |||
| 273 | spin_lock(&desc->lock); | ||
| 274 | desc->status &= ~IRQ_INPROGRESS; | ||
| 275 | if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) | ||
| 276 | desc->chip->unmask(irq); | ||
| 277 | out_unlock: | ||
| 278 | spin_unlock(&desc->lock); | ||
| 279 | } | ||
| 280 | |||
| 281 | /** | ||
| 282 | * handle_fasteoi_irq - irq handler for transparent controllers | ||
| 283 | * @irq: the interrupt number | ||
| 284 | * @desc: the interrupt description structure for this irq | ||
| 285 | * @regs: pointer to a register structure | ||
| 286 | * | ||
| 287 | * Only a single callback will be issued to the chip: an ->eoi() | ||
| 288 | * call when the interrupt has been serviced. This enables support | ||
| 289 | * for modern forms of interrupt handlers, which handle the flow | ||
| 290 | * details in hardware, transparently. | ||
| 291 | */ | ||
| 292 | void fastcall | ||
| 293 | handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc, | ||
| 294 | struct pt_regs *regs) | ||
| 295 | { | ||
| 296 | unsigned int cpu = smp_processor_id(); | ||
| 297 | struct irqaction *action; | ||
| 298 | irqreturn_t action_ret; | ||
| 299 | |||
| 300 | spin_lock(&desc->lock); | ||
| 301 | |||
| 302 | if (unlikely(desc->status & IRQ_INPROGRESS)) | ||
| 303 | goto out; | ||
| 304 | |||
| 305 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
| 306 | kstat_cpu(cpu).irqs[irq]++; | ||
| 307 | |||
| 308 | /* | ||
| 309 | * If it's disabled or no action is available | ||
| 310 | * keep it masked and get out of here | ||
| 311 | */ | ||
| 312 | action = desc->action; | ||
| 313 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | ||
| 314 | desc->status |= IRQ_PENDING; | ||
| 315 | goto out; | ||
| 316 | } | ||
| 317 | |||
| 318 | desc->status |= IRQ_INPROGRESS; | ||
| 319 | desc->status &= ~IRQ_PENDING; | ||
| 320 | spin_unlock(&desc->lock); | ||
| 321 | |||
| 322 | action_ret = handle_IRQ_event(irq, regs, action); | ||
| 323 | if (!noirqdebug) | ||
| 324 | note_interrupt(irq, desc, action_ret, regs); | ||
| 325 | |||
| 326 | spin_lock(&desc->lock); | ||
| 327 | desc->status &= ~IRQ_INPROGRESS; | ||
| 328 | out: | ||
| 329 | desc->chip->eoi(irq); | ||
| 330 | |||
| 331 | spin_unlock(&desc->lock); | ||
| 332 | } | ||
| 333 | |||
| 334 | /** | ||
| 335 | * handle_edge_irq - edge type IRQ handler | ||
| 336 | * @irq: the interrupt number | ||
| 337 | * @desc: the interrupt description structure for this irq | ||
| 338 | * @regs: pointer to a register structure | ||
| 339 | * | ||
| 340 | * An interrupt occurs on the falling and/or rising edge of a hardware | ||
| 341 | * signal. The occurrence is latched into the irq controller hardware | ||
| 342 | * and must be acked in order to be reenabled. After the ack another | ||
| 343 | * interrupt can happen on the same source even before the first one | ||
| 344 | * is handled by the associated event handler. If this happens it | ||
| 345 | * might be necessary to disable (mask) the interrupt depending on the | ||
| 346 | * controller hardware. This requires re-enabling the interrupt inside | ||
| 347 | * the loop which handles the interrupts which have arrived while | ||
| 348 | * the handler was running. If all pending interrupts are handled, the | ||
| 349 | * loop is left. | ||
| 350 | */ | ||
| 351 | void fastcall | ||
| 352 | handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
| 353 | { | ||
| 354 | const unsigned int cpu = smp_processor_id(); | ||
| 355 | |||
| 356 | spin_lock(&desc->lock); | ||
| 357 | |||
| 358 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
| 359 | |||
| 360 | /* | ||
| 361 | * If we're currently running this IRQ, or it's disabled, | ||
| 362 | * we shouldn't process the IRQ. Mark it pending, handle | ||
| 363 | * the necessary masking and go out | ||
| 364 | */ | ||
| 365 | if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || | ||
| 366 | !desc->action)) { | ||
| 367 | desc->status |= (IRQ_PENDING | IRQ_MASKED); | ||
| 368 | mask_ack_irq(desc, irq); | ||
| 369 | goto out_unlock; | ||
| 370 | } | ||
| 371 | |||
| 372 | kstat_cpu(cpu).irqs[irq]++; | ||
| 373 | |||
| 374 | /* Start handling the irq */ | ||
| 375 | desc->chip->ack(irq); | ||
| 376 | |||
| 377 | /* Mark the IRQ currently in progress.*/ | ||
| 378 | desc->status |= IRQ_INPROGRESS; | ||
| 379 | |||
| 380 | do { | ||
| 381 | struct irqaction *action = desc->action; | ||
| 382 | irqreturn_t action_ret; | ||
| 383 | |||
| 384 | if (unlikely(!action)) { | ||
| 385 | desc->chip->mask(irq); | ||
| 386 | goto out_unlock; | ||
| 387 | } | ||
| 388 | |||
| 389 | /* | ||
| 390 | * When another irq arrived while we were handling | ||
| 391 | * one, we could have masked the irq. | ||
| 392 | * Re-enable it if it was not disabled in the meantime. | ||
| 393 | */ | ||
| 394 | if (unlikely((desc->status & | ||
| 395 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == | ||
| 396 | (IRQ_PENDING | IRQ_MASKED))) { | ||
| 397 | desc->chip->unmask(irq); | ||
| 398 | desc->status &= ~IRQ_MASKED; | ||
| 399 | } | ||
| 400 | |||
| 401 | desc->status &= ~IRQ_PENDING; | ||
| 402 | spin_unlock(&desc->lock); | ||
| 403 | action_ret = handle_IRQ_event(irq, regs, action); | ||
| 404 | if (!noirqdebug) | ||
| 405 | note_interrupt(irq, desc, action_ret, regs); | ||
| 406 | spin_lock(&desc->lock); | ||
| 407 | |||
| 408 | } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); | ||
| 409 | |||
| 410 | desc->status &= ~IRQ_INPROGRESS; | ||
| 411 | out_unlock: | ||
| 412 | spin_unlock(&desc->lock); | ||
| 413 | } | ||
| 414 | |||
| 415 | #ifdef CONFIG_SMP | ||
| 416 | /** | ||
| 417 | * handle_percpu_irq - Per CPU local irq handler | ||
| 418 | * @irq: the interrupt number | ||
| 419 | * @desc: the interrupt description structure for this irq | ||
| 420 | * @regs: pointer to a register structure | ||
| 421 | * | ||
| 422 | * Per CPU interrupts on SMP machines without locking requirements | ||
| 423 | */ | ||
| 424 | void fastcall | ||
| 425 | handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
| 426 | { | ||
| 427 | irqreturn_t action_ret; | ||
| 428 | |||
| 429 | kstat_this_cpu.irqs[irq]++; | ||
| 430 | |||
| 431 | if (desc->chip->ack) | ||
| 432 | desc->chip->ack(irq); | ||
| 433 | |||
| 434 | action_ret = handle_IRQ_event(irq, regs, desc->action); | ||
| 435 | if (!noirqdebug) | ||
| 436 | note_interrupt(irq, desc, action_ret, regs); | ||
| 437 | |||
| 438 | if (desc->chip->eoi) | ||
| 439 | desc->chip->eoi(irq); | ||
| 440 | } | ||
| 441 | |||
| 442 | #endif /* CONFIG_SMP */ | ||
| 443 | |||
| 444 | void | ||
| 445 | __set_irq_handler(unsigned int irq, | ||
| 446 | void fastcall (*handle)(unsigned int, irq_desc_t *, | ||
| 447 | struct pt_regs *), | ||
| 448 | int is_chained) | ||
| 449 | { | ||
| 450 | struct irq_desc *desc; | ||
| 451 | unsigned long flags; | ||
| 452 | |||
| 453 | if (irq >= NR_IRQS) { | ||
| 454 | printk(KERN_ERR | ||
| 455 | "Trying to install type control for IRQ%d\n", irq); | ||
| 456 | return; | ||
| 457 | } | ||
| 458 | |||
| 459 | desc = irq_desc + irq; | ||
| 460 | |||
| 461 | if (!handle) | ||
| 462 | handle = handle_bad_irq; | ||
| 463 | |||
| 464 | if (desc->chip == &no_irq_chip) { | ||
| 465 | printk(KERN_WARNING "Trying to install %sinterrupt handler " | ||
| 466 | "for IRQ%d\n", is_chained ? "chained " : " ", irq); | ||
| 467 | /* | ||
| 468 | * Some ARM implementations install a handler for really dumb | ||
| 469 | * interrupt hardware without setting an irq_chip. This worked | ||
| 470 | * with the ARM no_irq_chip but the check in setup_irq would | ||
| 471 | * prevent us to setup the interrupt at all. Switch it to | ||
| 472 | * dummy_irq_chip for easy transition. | ||
| 473 | */ | ||
| 474 | desc->chip = &dummy_irq_chip; | ||
| 475 | } | ||
| 476 | |||
| 477 | spin_lock_irqsave(&desc->lock, flags); | ||
| 478 | |||
| 479 | /* Uninstall? */ | ||
| 480 | if (handle == handle_bad_irq) { | ||
| 481 | if (desc->chip != &no_irq_chip) { | ||
| 482 | desc->chip->mask(irq); | ||
| 483 | desc->chip->ack(irq); | ||
| 484 | } | ||
| 485 | desc->status |= IRQ_DISABLED; | ||
| 486 | desc->depth = 1; | ||
| 487 | } | ||
| 488 | desc->handle_irq = handle; | ||
| 489 | |||
| 490 | if (handle != handle_bad_irq && is_chained) { | ||
| 491 | desc->status &= ~IRQ_DISABLED; | ||
| 492 | desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; | ||
| 493 | desc->depth = 0; | ||
| 494 | desc->chip->unmask(irq); | ||
| 495 | } | ||
| 496 | spin_unlock_irqrestore(&desc->lock, flags); | ||
| 497 | } | ||
| 498 | |||
| 499 | void | ||
| 500 | set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, | ||
| 501 | void fastcall (*handle)(unsigned int, | ||
| 502 | struct irq_desc *, | ||
| 503 | struct pt_regs *)) | ||
| 504 | { | ||
| 505 | set_irq_chip(irq, chip); | ||
| 506 | __set_irq_handler(irq, handle, 0); | ||
| 507 | } | ||
| 508 | |||
| 509 | /* | ||
| 510 | * Get a descriptive string for the highlevel handler, for | ||
| 511 | * /proc/interrupts output: | ||
| 512 | */ | ||
| 513 | const char * | ||
| 514 | handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *, | ||
| 515 | struct pt_regs *)) | ||
| 516 | { | ||
| 517 | if (handle == handle_level_irq) | ||
| 518 | return "level "; | ||
| 519 | if (handle == handle_fasteoi_irq) | ||
| 520 | return "fasteoi"; | ||
| 521 | if (handle == handle_edge_irq) | ||
| 522 | return "edge "; | ||
| 523 | if (handle == handle_simple_irq) | ||
| 524 | return "simple "; | ||
| 525 | #ifdef CONFIG_SMP | ||
| 526 | if (handle == handle_percpu_irq) | ||
| 527 | return "percpu "; | ||
| 528 | #endif | ||
| 529 | if (handle == handle_bad_irq) | ||
| 530 | return "bad "; | ||
| 531 | |||
| 532 | return NULL; | ||
| 533 | } | ||
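A sketch of how platform code of this era would hook interrupt lines up to the new flow handlers introduced in this file (the chip structure, register writes and IRQ range are made up; the calls are the ones added by this commit). Note that set_irq_chip() runs irq_chip_set_defaults(), so the missing enable/startup/shutdown methods are filled in from mask/unmask:

#include <linux/init.h>
#include <linux/irq.h>

/* Hypothetical cascaded controller with simple mask/unmask/ack registers */
static void mychip_mask(unsigned int irq)   { /* write the mask register */ }
static void mychip_unmask(unsigned int irq) { /* write the unmask register */ }
static void mychip_ack(unsigned int irq)    { /* clear the latched status bit */ }

static struct irq_chip mychip_irq_chip = {
        .name   = "mychip",
        .ack    = mychip_ack,
        .mask   = mychip_mask,
        .unmask = mychip_unmask,
};

static void __init mychip_init_irqs(unsigned int first_irq, unsigned int nr_irqs)
{
        unsigned int irq;

        for (irq = first_irq; irq < first_irq + nr_irqs; irq++) {
                /* level-triggered inputs: mask+ack before, unmask after */
                set_irq_chip_and_handler(irq, &mychip_irq_chip, handle_level_irq);
        }
}

handle_level_irq() masks and acks before running the action chain and unmasks afterwards, which matches hardware that keeps the line asserted until the device has been serviced; edge-latched inputs would use handle_edge_irq() instead.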
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 0f6530117105..4c6cdbaed661 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -1,9 +1,13 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * linux/kernel/irq/handle.c | 2 | * linux/kernel/irq/handle.c |
| 3 | * | 3 | * |
| 4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | 4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar |
| 5 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
| 5 | * | 6 | * |
| 6 | * This file contains the core interrupt handling code. | 7 | * This file contains the core interrupt handling code. |
| 8 | * | ||
| 9 | * Detailed information is available in Documentation/DocBook/genericirq | ||
| 10 | * | ||
| 7 | */ | 11 | */ |
| 8 | 12 | ||
| 9 | #include <linux/irq.h> | 13 | #include <linux/irq.h> |
| @@ -14,11 +18,27 @@ | |||
| 14 | 18 | ||
| 15 | #include "internals.h" | 19 | #include "internals.h" |
| 16 | 20 | ||
| 21 | /** | ||
| 22 | * handle_bad_irq - handle spurious and unhandled irqs | ||
| 23 | * @irq: the interrupt number | ||
| 24 | * @desc: description of the interrupt | ||
| 25 | * @regs: pointer to a register structure | ||
| 26 | * | ||
| 27 | * Handles spurious and unhandled IRQs. It also prints a debug message. | ||
| 28 | */ | ||
| 29 | void fastcall | ||
| 30 | handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
| 31 | { | ||
| 32 | print_irq_desc(irq, desc); | ||
| 33 | kstat_this_cpu.irqs[irq]++; | ||
| 34 | ack_bad_irq(irq); | ||
| 35 | } | ||
| 36 | |||
| 17 | /* | 37 | /* |
| 18 | * Linux has a controller-independent interrupt architecture. | 38 | * Linux has a controller-independent interrupt architecture. |
| 19 | * Every controller has a 'controller-template', that is used | 39 | * Every controller has a 'controller-template', that is used |
| 20 | * by the main code to do the right thing. Each driver-visible | 40 | * by the main code to do the right thing. Each driver-visible |
| 21 | * interrupt source is transparently wired to the apropriate | 41 | * interrupt source is transparently wired to the appropriate |
| 22 | * controller. Thus drivers need not be aware of the | 42 | * controller. Thus drivers need not be aware of the |
| 23 | * interrupt-controller. | 43 | * interrupt-controller. |
| 24 | * | 44 | * |
| @@ -28,41 +48,68 @@ | |||
| 28 | * | 48 | * |
| 29 | * Controller mappings for all interrupt sources: | 49 | * Controller mappings for all interrupt sources: |
| 30 | */ | 50 | */ |
| 31 | irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { | 51 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { |
| 32 | [0 ... NR_IRQS-1] = { | 52 | [0 ... NR_IRQS-1] = { |
| 33 | .status = IRQ_DISABLED, | 53 | .status = IRQ_DISABLED, |
| 34 | .handler = &no_irq_type, | 54 | .chip = &no_irq_chip, |
| 35 | .lock = SPIN_LOCK_UNLOCKED | 55 | .handle_irq = handle_bad_irq, |
| 56 | .depth = 1, | ||
| 57 | .lock = SPIN_LOCK_UNLOCKED, | ||
| 58 | #ifdef CONFIG_SMP | ||
| 59 | .affinity = CPU_MASK_ALL | ||
| 60 | #endif | ||
| 36 | } | 61 | } |
| 37 | }; | 62 | }; |
| 38 | 63 | ||
| 39 | /* | 64 | /* |
| 40 | * Generic 'no controller' code | 65 | * What should we do if we get a hw irq event on an illegal vector? |
| 66 | * Each architecture has to answer this itself. | ||
| 41 | */ | 67 | */ |
| 42 | static void end_none(unsigned int irq) { } | 68 | static void ack_bad(unsigned int irq) |
| 43 | static void enable_none(unsigned int irq) { } | ||
| 44 | static void disable_none(unsigned int irq) { } | ||
| 45 | static void shutdown_none(unsigned int irq) { } | ||
| 46 | static unsigned int startup_none(unsigned int irq) { return 0; } | ||
| 47 | |||
| 48 | static void ack_none(unsigned int irq) | ||
| 49 | { | 69 | { |
| 50 | /* | 70 | print_irq_desc(irq, irq_desc + irq); |
| 51 | * 'what should we do if we get a hw irq event on an illegal vector'. | ||
| 52 | * each architecture has to answer this themself. | ||
| 53 | */ | ||
| 54 | ack_bad_irq(irq); | 71 | ack_bad_irq(irq); |
| 55 | } | 72 | } |
| 56 | 73 | ||
| 57 | struct hw_interrupt_type no_irq_type = { | 74 | /* |
| 58 | .typename = "none", | 75 | * NOP functions |
| 59 | .startup = startup_none, | 76 | */ |
| 60 | .shutdown = shutdown_none, | 77 | static void noop(unsigned int irq) |
| 61 | .enable = enable_none, | 78 | { |
| 62 | .disable = disable_none, | 79 | } |
| 63 | .ack = ack_none, | 80 | |
| 64 | .end = end_none, | 81 | static unsigned int noop_ret(unsigned int irq) |
| 65 | .set_affinity = NULL | 82 | { |
| 83 | return 0; | ||
| 84 | } | ||
| 85 | |||
| 86 | /* | ||
| 87 | * Generic no controller implementation | ||
| 88 | */ | ||
| 89 | struct irq_chip no_irq_chip = { | ||
| 90 | .name = "none", | ||
| 91 | .startup = noop_ret, | ||
| 92 | .shutdown = noop, | ||
| 93 | .enable = noop, | ||
| 94 | .disable = noop, | ||
| 95 | .ack = ack_bad, | ||
| 96 | .end = noop, | ||
| 97 | }; | ||
| 98 | |||
| 99 | /* | ||
| 100 | * Generic dummy implementation which can be used for | ||
| 101 | * real dumb interrupt sources | ||
| 102 | */ | ||
| 103 | struct irq_chip dummy_irq_chip = { | ||
| 104 | .name = "dummy", | ||
| 105 | .startup = noop_ret, | ||
| 106 | .shutdown = noop, | ||
| 107 | .enable = noop, | ||
| 108 | .disable = noop, | ||
| 109 | .ack = noop, | ||
| 110 | .mask = noop, | ||
| 111 | .unmask = noop, | ||
| 112 | .end = noop, | ||
| 66 | }; | 113 | }; |
| 67 | 114 | ||
| 68 | /* | 115 | /* |
| @@ -73,17 +120,24 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) | |||
| 73 | return IRQ_NONE; | 120 | return IRQ_NONE; |
| 74 | } | 121 | } |
| 75 | 122 | ||
| 76 | /* | 123 | /** |
| 77 | * Have got an event to handle: | 124 | * handle_IRQ_event - irq action chain handler |
| 125 | * @irq: the interrupt number | ||
| 126 | * @regs: pointer to a register structure | ||
| 127 | * @action: the interrupt action chain for this irq | ||
| 128 | * | ||
| 129 | * Handles the action chain of an irq event | ||
| 78 | */ | 130 | */ |
| 79 | fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, | 131 | irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, |
| 80 | struct irqaction *action) | 132 | struct irqaction *action) |
| 81 | { | 133 | { |
| 82 | irqreturn_t ret, retval = IRQ_NONE; | 134 | irqreturn_t ret, retval = IRQ_NONE; |
| 83 | unsigned int status = 0; | 135 | unsigned int status = 0; |
| 84 | 136 | ||
| 85 | if (!(action->flags & SA_INTERRUPT)) | 137 | handle_dynamic_tick(action); |
| 86 | local_irq_enable(); | 138 | |
| 139 | if (!(action->flags & IRQF_DISABLED)) | ||
| 140 | local_irq_enable_in_hardirq(); | ||
| 87 | 141 | ||
| 88 | do { | 142 | do { |
| 89 | ret = action->handler(irq, action->dev_id, regs); | 143 | ret = action->handler(irq, action->dev_id, regs); |
| @@ -93,22 +147,30 @@ fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, | |||
| 93 | action = action->next; | 147 | action = action->next; |
| 94 | } while (action); | 148 | } while (action); |
| 95 | 149 | ||
| 96 | if (status & SA_SAMPLE_RANDOM) | 150 | if (status & IRQF_SAMPLE_RANDOM) |
| 97 | add_interrupt_randomness(irq); | 151 | add_interrupt_randomness(irq); |
| 98 | local_irq_disable(); | 152 | local_irq_disable(); |
| 99 | 153 | ||
| 100 | return retval; | 154 | return retval; |
| 101 | } | 155 | } |
| 102 | 156 | ||
| 103 | /* | 157 | #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ |
| 104 | * do_IRQ handles all normal device IRQ's (the special | 158 | /** |
| 159 | * __do_IRQ - original all in one highlevel IRQ handler | ||
| 160 | * @irq: the interrupt number | ||
| 161 | * @regs: pointer to a register structure | ||
| 162 | * | ||
| 163 | * __do_IRQ handles all normal device IRQ's (the special | ||
| 105 | * SMP cross-CPU interrupts have their own specific | 164 | * SMP cross-CPU interrupts have their own specific |
| 106 | * handlers). | 165 | * handlers). |
| 166 | * | ||
| 167 | * This is the original x86 implementation which is used for every | ||
| 168 | * interrupt type. | ||
| 107 | */ | 169 | */ |
| 108 | fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | 170 | fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) |
| 109 | { | 171 | { |
| 110 | irq_desc_t *desc = irq_desc + irq; | 172 | struct irq_desc *desc = irq_desc + irq; |
| 111 | struct irqaction * action; | 173 | struct irqaction *action; |
| 112 | unsigned int status; | 174 | unsigned int status; |
| 113 | 175 | ||
| 114 | kstat_this_cpu.irqs[irq]++; | 176 | kstat_this_cpu.irqs[irq]++; |
| @@ -118,16 +180,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | |||
| 118 | /* | 180 | /* |
| 119 | * No locking required for CPU-local interrupts: | 181 | * No locking required for CPU-local interrupts: |
| 120 | */ | 182 | */ |
| 121 | if (desc->handler->ack) | 183 | if (desc->chip->ack) |
| 122 | desc->handler->ack(irq); | 184 | desc->chip->ack(irq); |
| 123 | action_ret = handle_IRQ_event(irq, regs, desc->action); | 185 | action_ret = handle_IRQ_event(irq, regs, desc->action); |
| 124 | desc->handler->end(irq); | 186 | desc->chip->end(irq); |
| 125 | return 1; | 187 | return 1; |
| 126 | } | 188 | } |
| 127 | 189 | ||
| 128 | spin_lock(&desc->lock); | 190 | spin_lock(&desc->lock); |
| 129 | if (desc->handler->ack) | 191 | if (desc->chip->ack) |
| 130 | desc->handler->ack(irq); | 192 | desc->chip->ack(irq); |
| 131 | /* | 193 | /* |
| 132 | * REPLAY is when Linux resends an IRQ that was dropped earlier | 194 | * REPLAY is when Linux resends an IRQ that was dropped earlier |
| 133 | * WAITING is used by probe to mark irqs that are being tested | 195 | * WAITING is used by probe to mark irqs that are being tested |
| @@ -187,9 +249,26 @@ out: | |||
| 187 | * The ->end() handler has to deal with interrupts which got | 249 | * The ->end() handler has to deal with interrupts which got |
| 188 | * disabled while the handler was running. | 250 | * disabled while the handler was running. |
| 189 | */ | 251 | */ |
| 190 | desc->handler->end(irq); | 252 | desc->chip->end(irq); |
| 191 | spin_unlock(&desc->lock); | 253 | spin_unlock(&desc->lock); |
| 192 | 254 | ||
| 193 | return 1; | 255 | return 1; |
| 194 | } | 256 | } |
| 257 | #endif | ||
| 258 | |||
| 259 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 260 | |||
| 261 | /* | ||
| 262 | * lockdep: we want to handle all irq_desc locks as a single lock-class: | ||
| 263 | */ | ||
| 264 | static struct lock_class_key irq_desc_lock_class; | ||
| 265 | |||
| 266 | void early_init_irq_lock_class(void) | ||
| 267 | { | ||
| 268 | int i; | ||
| 269 | |||
| 270 | for (i = 0; i < NR_IRQS; i++) | ||
| 271 | lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); | ||
| 272 | } | ||
| 195 | 273 | ||
| 274 | #endif | ||
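handle_IRQ_event() above walks the shared action chain and ORs the handlers' return values together, so every handler on a shared line must report whether the interrupt was really its own. A hedged sketch of a driver handler written against the three-argument prototype of this era and the IRQF_* flag names introduced here (the device and its register layout are invented):

#include <linux/interrupt.h>
#include <linux/types.h>
#include <linux/io.h>

#define MYDEV_IRQ_STATUS 0x04           /* hypothetical status register offset */

struct mydev {
        void __iomem *regs;
};

static irqreturn_t mydev_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
        struct mydev *dev = dev_id;
        u32 status = readl(dev->regs + MYDEV_IRQ_STATUS);

        if (!status)
                return IRQ_NONE;        /* not ours - let the rest of the chain run */

        writel(status, dev->regs + MYDEV_IRQ_STATUS);   /* ack in the device */
        return IRQ_HANDLED;
}

/* registration: request_irq(irq, mydev_interrupt, IRQF_SHARED, "mydev", dev); */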
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 46feba630266..08a849a22447 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -4,6 +4,12 @@ | |||
| 4 | 4 | ||
| 5 | extern int noirqdebug; | 5 | extern int noirqdebug; |
| 6 | 6 | ||
| 7 | /* Set default functions for irq_chip structures: */ | ||
| 8 | extern void irq_chip_set_defaults(struct irq_chip *chip); | ||
| 9 | |||
| 10 | /* Set default handler: */ | ||
| 11 | extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); | ||
| 12 | |||
| 7 | #ifdef CONFIG_PROC_FS | 13 | #ifdef CONFIG_PROC_FS |
| 8 | extern void register_irq_proc(unsigned int irq); | 14 | extern void register_irq_proc(unsigned int irq); |
| 9 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); | 15 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); |
| @@ -16,3 +22,43 @@ static inline void unregister_handler_proc(unsigned int irq, | |||
| 16 | struct irqaction *action) { } | 22 | struct irqaction *action) { } |
| 17 | #endif | 23 | #endif |
| 18 | 24 | ||
| 25 | /* | ||
| 26 | * Debugging printout: | ||
| 27 | */ | ||
| 28 | |||
| 29 | #include <linux/kallsyms.h> | ||
| 30 | |||
| 31 | #define P(f) if (desc->status & f) printk("%14s set\n", #f) | ||
| 32 | |||
| 33 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
| 34 | { | ||
| 35 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | ||
| 36 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | ||
| 37 | printk("->handle_irq(): %p, ", desc->handle_irq); | ||
| 38 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | ||
| 39 | printk("->chip(): %p, ", desc->chip); | ||
| 40 | print_symbol("%s\n", (unsigned long)desc->chip); | ||
| 41 | printk("->action(): %p\n", desc->action); | ||
| 42 | if (desc->action) { | ||
| 43 | printk("->action->handler(): %p, ", desc->action->handler); | ||
| 44 | print_symbol("%s\n", (unsigned long)desc->action->handler); | ||
| 45 | } | ||
| 46 | |||
| 47 | P(IRQ_INPROGRESS); | ||
| 48 | P(IRQ_DISABLED); | ||
| 49 | P(IRQ_PENDING); | ||
| 50 | P(IRQ_REPLAY); | ||
| 51 | P(IRQ_AUTODETECT); | ||
| 52 | P(IRQ_WAITING); | ||
| 53 | P(IRQ_LEVEL); | ||
| 54 | P(IRQ_MASKED); | ||
| 55 | #ifdef CONFIG_IRQ_PER_CPU | ||
| 56 | P(IRQ_PER_CPU); | ||
| 57 | #endif | ||
| 58 | P(IRQ_NOPROBE); | ||
| 59 | P(IRQ_NOREQUEST); | ||
| 60 | P(IRQ_NOAUTOEN); | ||
| 61 | } | ||
| 62 | |||
| 63 | #undef P | ||
| 64 | |||
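The P() macro in print_irq_desc() leans on the preprocessor stringizing operator: the same flag token is tested as a value and printed as its own name. A tiny standalone illustration of the trick, nothing kernel-specific:

#include <stdio.h>

#define FLAG_A 0x01
#define FLAG_B 0x02
#define FLAG_C 0x04

#define P(f) do { if (status & (f)) printf("%14s set\n", #f); } while (0)

int main(void)
{
        unsigned int status = FLAG_A | FLAG_C;

        P(FLAG_A);      /* prints "        FLAG_A set" */
        P(FLAG_B);      /* silent */
        P(FLAG_C);      /* prints "        FLAG_C set" */
        return 0;
}

#undef P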
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1279e3499534..92be519eff26 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -1,12 +1,12 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * linux/kernel/irq/manage.c | 2 | * linux/kernel/irq/manage.c |
| 3 | * | 3 | * |
| 4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | 4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar |
| 5 | * Copyright (C) 2005-2006 Thomas Gleixner | ||
| 5 | * | 6 | * |
| 6 | * This file contains driver APIs to the irq subsystem. | 7 | * This file contains driver APIs to the irq subsystem. |
| 7 | */ | 8 | */ |
| 8 | 9 | ||
| 9 | #include <linux/config.h> | ||
| 10 | #include <linux/irq.h> | 10 | #include <linux/irq.h> |
| 11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 12 | #include <linux/random.h> | 12 | #include <linux/random.h> |
| @@ -16,12 +16,6 @@ | |||
| 16 | 16 | ||
| 17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
| 18 | 18 | ||
| 19 | cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; | ||
| 20 | |||
| 21 | #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) | ||
| 22 | cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; | ||
| 23 | #endif | ||
| 24 | |||
| 25 | /** | 19 | /** |
| 26 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | 20 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) |
| 27 | * @irq: interrupt number to wait for | 21 | * @irq: interrupt number to wait for |
| @@ -42,7 +36,6 @@ void synchronize_irq(unsigned int irq) | |||
| 42 | while (desc->status & IRQ_INPROGRESS) | 36 | while (desc->status & IRQ_INPROGRESS) |
| 43 | cpu_relax(); | 37 | cpu_relax(); |
| 44 | } | 38 | } |
| 45 | |||
| 46 | EXPORT_SYMBOL(synchronize_irq); | 39 | EXPORT_SYMBOL(synchronize_irq); |
| 47 | 40 | ||
| 48 | #endif | 41 | #endif |
| @@ -60,7 +53,7 @@ EXPORT_SYMBOL(synchronize_irq); | |||
| 60 | */ | 53 | */ |
| 61 | void disable_irq_nosync(unsigned int irq) | 54 | void disable_irq_nosync(unsigned int irq) |
| 62 | { | 55 | { |
| 63 | irq_desc_t *desc = irq_desc + irq; | 56 | struct irq_desc *desc = irq_desc + irq; |
| 64 | unsigned long flags; | 57 | unsigned long flags; |
| 65 | 58 | ||
| 66 | if (irq >= NR_IRQS) | 59 | if (irq >= NR_IRQS) |
| @@ -69,11 +62,10 @@ void disable_irq_nosync(unsigned int irq) | |||
| 69 | spin_lock_irqsave(&desc->lock, flags); | 62 | spin_lock_irqsave(&desc->lock, flags); |
| 70 | if (!desc->depth++) { | 63 | if (!desc->depth++) { |
| 71 | desc->status |= IRQ_DISABLED; | 64 | desc->status |= IRQ_DISABLED; |
| 72 | desc->handler->disable(irq); | 65 | desc->chip->disable(irq); |
| 73 | } | 66 | } |
| 74 | spin_unlock_irqrestore(&desc->lock, flags); | 67 | spin_unlock_irqrestore(&desc->lock, flags); |
| 75 | } | 68 | } |
| 76 | |||
| 77 | EXPORT_SYMBOL(disable_irq_nosync); | 69 | EXPORT_SYMBOL(disable_irq_nosync); |
| 78 | 70 | ||
| 79 | /** | 71 | /** |
| @@ -90,7 +82,7 @@ EXPORT_SYMBOL(disable_irq_nosync); | |||
| 90 | */ | 82 | */ |
| 91 | void disable_irq(unsigned int irq) | 83 | void disable_irq(unsigned int irq) |
| 92 | { | 84 | { |
| 93 | irq_desc_t *desc = irq_desc + irq; | 85 | struct irq_desc *desc = irq_desc + irq; |
| 94 | 86 | ||
| 95 | if (irq >= NR_IRQS) | 87 | if (irq >= NR_IRQS) |
| 96 | return; | 88 | return; |
| @@ -99,7 +91,6 @@ void disable_irq(unsigned int irq) | |||
| 99 | if (desc->action) | 91 | if (desc->action) |
| 100 | synchronize_irq(irq); | 92 | synchronize_irq(irq); |
| 101 | } | 93 | } |
| 102 | |||
| 103 | EXPORT_SYMBOL(disable_irq); | 94 | EXPORT_SYMBOL(disable_irq); |
| 104 | 95 | ||
| 105 | /** | 96 | /** |
| @@ -114,7 +105,7 @@ EXPORT_SYMBOL(disable_irq); | |||
| 114 | */ | 105 | */ |
| 115 | void enable_irq(unsigned int irq) | 106 | void enable_irq(unsigned int irq) |
| 116 | { | 107 | { |
| 117 | irq_desc_t *desc = irq_desc + irq; | 108 | struct irq_desc *desc = irq_desc + irq; |
| 118 | unsigned long flags; | 109 | unsigned long flags; |
| 119 | 110 | ||
| 120 | if (irq >= NR_IRQS) | 111 | if (irq >= NR_IRQS) |
| @@ -123,17 +114,15 @@ void enable_irq(unsigned int irq) | |||
| 123 | spin_lock_irqsave(&desc->lock, flags); | 114 | spin_lock_irqsave(&desc->lock, flags); |
| 124 | switch (desc->depth) { | 115 | switch (desc->depth) { |
| 125 | case 0: | 116 | case 0: |
| 117 | printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); | ||
| 126 | WARN_ON(1); | 118 | WARN_ON(1); |
| 127 | break; | 119 | break; |
| 128 | case 1: { | 120 | case 1: { |
| 129 | unsigned int status = desc->status & ~IRQ_DISABLED; | 121 | unsigned int status = desc->status & ~IRQ_DISABLED; |
| 130 | 122 | ||
| 131 | desc->status = status; | 123 | /* Prevent probing on this irq: */ |
| 132 | if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | 124 | desc->status = status | IRQ_NOPROBE; |
| 133 | desc->status = status | IRQ_REPLAY; | 125 | check_irq_resend(desc, irq); |
| 134 | hw_resend_irq(desc->handler,irq); | ||
| 135 | } | ||
| 136 | desc->handler->enable(irq); | ||
| 137 | /* fall-through */ | 126 | /* fall-through */ |
| 138 | } | 127 | } |
| 139 | default: | 128 | default: |
| @@ -141,9 +130,53 @@ void enable_irq(unsigned int irq) | |||
| 141 | } | 130 | } |
| 142 | spin_unlock_irqrestore(&desc->lock, flags); | 131 | spin_unlock_irqrestore(&desc->lock, flags); |
| 143 | } | 132 | } |
| 144 | |||
| 145 | EXPORT_SYMBOL(enable_irq); | 133 | EXPORT_SYMBOL(enable_irq); |
| 146 | 134 | ||
| 135 | /** | ||
| 136 | * set_irq_wake - control irq power management wakeup | ||
| 137 | * @irq: interrupt to control | ||
| 138 | * @on: enable/disable power management wakeup | ||
| 139 | * | ||
| 140 | * Enable/disable power management wakeup mode, which is | ||
| 141 | * disabled by default. Enables and disables must match, | ||
| 142 | * just as they match for non-wakeup mode support. | ||
| 143 | * | ||
| 144 | * Wakeup mode lets this IRQ wake the system from sleep | ||
| 145 | * states like "suspend to RAM". | ||
| 146 | */ | ||
| 147 | int set_irq_wake(unsigned int irq, unsigned int on) | ||
| 148 | { | ||
| 149 | struct irq_desc *desc = irq_desc + irq; | ||
| 150 | unsigned long flags; | ||
| 151 | int ret = -ENXIO; | ||
| 152 | int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake; | ||
| 153 | |||
| 154 | /* wakeup-capable irqs can be shared between drivers that | ||
| 155 | * don't need to have the same sleep mode behaviors. | ||
| 156 | */ | ||
| 157 | spin_lock_irqsave(&desc->lock, flags); | ||
| 158 | if (on) { | ||
| 159 | if (desc->wake_depth++ == 0) | ||
| 160 | desc->status |= IRQ_WAKEUP; | ||
| 161 | else | ||
| 162 | set_wake = NULL; | ||
| 163 | } else { | ||
| 164 | if (desc->wake_depth == 0) { | ||
| 165 | printk(KERN_WARNING "Unbalanced IRQ %d " | ||
| 166 | "wake disable\n", irq); | ||
| 167 | WARN_ON(1); | ||
| 168 | } else if (--desc->wake_depth == 0) | ||
| 169 | desc->status &= ~IRQ_WAKEUP; | ||
| 170 | else | ||
| 171 | set_wake = NULL; | ||
| 172 | } | ||
| 173 | if (set_wake) | ||
| 174 | ret = desc->chip->set_wake(irq, on); | ||
| 175 | spin_unlock_irqrestore(&desc->lock, flags); | ||
| 176 | return ret; | ||
| 177 | } | ||
| 178 | EXPORT_SYMBOL(set_irq_wake); | ||
| 179 | |||
| 147 | /* | 180 | /* |
| 148 | * Internal function that tells the architecture code whether a | 181 | * Internal function that tells the architecture code whether a |
| 149 | * particular irq has been exclusively allocated or is available | 182 | * particular irq has been exclusively allocated or is available |
| @@ -153,22 +186,33 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) | |||
| 153 | { | 186 | { |
| 154 | struct irqaction *action; | 187 | struct irqaction *action; |
| 155 | 188 | ||
| 156 | if (irq >= NR_IRQS) | 189 | if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) |
| 157 | return 0; | 190 | return 0; |
| 158 | 191 | ||
| 159 | action = irq_desc[irq].action; | 192 | action = irq_desc[irq].action; |
| 160 | if (action) | 193 | if (action) |
| 161 | if (irqflags & action->flags & SA_SHIRQ) | 194 | if (irqflags & action->flags & IRQF_SHARED) |
| 162 | action = NULL; | 195 | action = NULL; |
| 163 | 196 | ||
| 164 | return !action; | 197 | return !action; |
| 165 | } | 198 | } |
| 166 | 199 | ||
| 200 | void compat_irq_chip_set_default_handler(struct irq_desc *desc) | ||
| 201 | { | ||
| 202 | /* | ||
| 203 | * If the architecture still has not overridden | ||
| 204 | * the flow handler then zap the default. This | ||
| 205 | * should catch incorrect flow-type setting. | ||
| 206 | */ | ||
| 207 | if (desc->handle_irq == &handle_bad_irq) | ||
| 208 | desc->handle_irq = NULL; | ||
| 209 | } | ||
| 210 | |||
| 167 | /* | 211 | /* |
| 168 | * Internal function to register an irqaction - typically used to | 212 | * Internal function to register an irqaction - typically used to |
| 169 | * allocate special interrupts that are part of the architecture. | 213 | * allocate special interrupts that are part of the architecture. |
| 170 | */ | 214 | */ |
| 171 | int setup_irq(unsigned int irq, struct irqaction * new) | 215 | int setup_irq(unsigned int irq, struct irqaction *new) |
| 172 | { | 216 | { |
| 173 | struct irq_desc *desc = irq_desc + irq; | 217 | struct irq_desc *desc = irq_desc + irq; |
| 174 | struct irqaction *old, **p; | 218 | struct irqaction *old, **p; |
| @@ -178,14 +222,14 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
| 178 | if (irq >= NR_IRQS) | 222 | if (irq >= NR_IRQS) |
| 179 | return -EINVAL; | 223 | return -EINVAL; |
| 180 | 224 | ||
| 181 | if (desc->handler == &no_irq_type) | 225 | if (desc->chip == &no_irq_chip) |
| 182 | return -ENOSYS; | 226 | return -ENOSYS; |
| 183 | /* | 227 | /* |
| 184 | * Some drivers like serial.c use request_irq() heavily, | 228 | * Some drivers like serial.c use request_irq() heavily, |
| 185 | * so we have to be careful not to interfere with a | 229 | * so we have to be careful not to interfere with a |
| 186 | * running system. | 230 | * running system. |
| 187 | */ | 231 | */ |
| 188 | if (new->flags & SA_SAMPLE_RANDOM) { | 232 | if (new->flags & IRQF_SAMPLE_RANDOM) { |
| 189 | /* | 233 | /* |
| 190 | * This function might sleep, we want to call it first, | 234 | * This function might sleep, we want to call it first, |
| 191 | * outside of the atomic block. | 235 | * outside of the atomic block. |
| @@ -200,16 +244,24 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
| 200 | /* | 244 | /* |
| 201 | * The following block of code has to be executed atomically | 245 | * The following block of code has to be executed atomically |
| 202 | */ | 246 | */ |
| 203 | spin_lock_irqsave(&desc->lock,flags); | 247 | spin_lock_irqsave(&desc->lock, flags); |
| 204 | p = &desc->action; | 248 | p = &desc->action; |
| 205 | if ((old = *p) != NULL) { | 249 | old = *p; |
| 206 | /* Can't share interrupts unless both agree to */ | 250 | if (old) { |
| 207 | if (!(old->flags & new->flags & SA_SHIRQ)) | 251 | /* |
| 252 | * Can't share interrupts unless both agree to and are | ||
| 253 | * the same type (level, edge, polarity). So both flag | ||
| 254 | * fields must have IRQF_SHARED set and the bits which | ||
| 255 | * set the trigger type must match. | ||
| 256 | */ | ||
| 257 | if (!((old->flags & new->flags) & IRQF_SHARED) || | ||
| 258 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) | ||
| 208 | goto mismatch; | 259 | goto mismatch; |
| 209 | 260 | ||
| 210 | #if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) | 261 | #if defined(CONFIG_IRQ_PER_CPU) |
| 211 | /* All handlers must agree on per-cpuness */ | 262 | /* All handlers must agree on per-cpuness */ |
| 212 | if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) | 263 | if ((old->flags & IRQF_PERCPU) != |
| 264 | (new->flags & IRQF_PERCPU)) | ||
| 213 | goto mismatch; | 265 | goto mismatch; |
| 214 | #endif | 266 | #endif |
| 215 | 267 | ||
| @@ -222,20 +274,45 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
| 222 | } | 274 | } |
| 223 | 275 | ||
| 224 | *p = new; | 276 | *p = new; |
| 225 | #if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) | 277 | #if defined(CONFIG_IRQ_PER_CPU) |
| 226 | if (new->flags & SA_PERCPU_IRQ) | 278 | if (new->flags & IRQF_PERCPU) |
| 227 | desc->status |= IRQ_PER_CPU; | 279 | desc->status |= IRQ_PER_CPU; |
| 228 | #endif | 280 | #endif |
| 229 | if (!shared) { | 281 | if (!shared) { |
| 230 | desc->depth = 0; | 282 | irq_chip_set_defaults(desc->chip); |
| 231 | desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | | 283 | |
| 232 | IRQ_WAITING | IRQ_INPROGRESS); | 284 | /* Setup the type (level, edge polarity) if configured: */ |
| 233 | if (desc->handler->startup) | 285 | if (new->flags & IRQF_TRIGGER_MASK) { |
| 234 | desc->handler->startup(irq); | 286 | if (desc->chip && desc->chip->set_type) |
| 235 | else | 287 | desc->chip->set_type(irq, |
| 236 | desc->handler->enable(irq); | 288 | new->flags & IRQF_TRIGGER_MASK); |
| 289 | else | ||
| 290 | /* | ||
| 291 | * IRQF_TRIGGER_* but the PIC does not support | ||
| 292 | * multiple flow-types? | ||
| 293 | */ | ||
| 294 | printk(KERN_WARNING "No IRQF_TRIGGER set_type " | ||
| 295 | "function for IRQ %d (%s)\n", irq, | ||
| 296 | desc->chip ? desc->chip->name : | ||
| 297 | "unknown"); | ||
| 298 | } else | ||
| 299 | compat_irq_chip_set_default_handler(desc); | ||
| 300 | |||
| 301 | desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | | ||
| 302 | IRQ_INPROGRESS); | ||
| 303 | |||
| 304 | if (!(desc->status & IRQ_NOAUTOEN)) { | ||
| 305 | desc->depth = 0; | ||
| 306 | desc->status &= ~IRQ_DISABLED; | ||
| 307 | if (desc->chip->startup) | ||
| 308 | desc->chip->startup(irq); | ||
| 309 | else | ||
| 310 | desc->chip->enable(irq); | ||
| 311 | } else | ||
| 312 | /* Undo nested disables: */ | ||
| 313 | desc->depth = 1; | ||
| 237 | } | 314 | } |
| 238 | spin_unlock_irqrestore(&desc->lock,flags); | 315 | spin_unlock_irqrestore(&desc->lock, flags); |
| 239 | 316 | ||
| 240 | new->irq = irq; | 317 | new->irq = irq; |
| 241 | register_irq_proc(irq); | 318 | register_irq_proc(irq); |
| @@ -246,8 +323,8 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
| 246 | 323 | ||
| 247 | mismatch: | 324 | mismatch: |
| 248 | spin_unlock_irqrestore(&desc->lock, flags); | 325 | spin_unlock_irqrestore(&desc->lock, flags); |
| 249 | if (!(new->flags & SA_PROBEIRQ)) { | 326 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
| 250 | printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); | 327 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); |
| 251 | dump_stack(); | 328 | dump_stack(); |
| 252 | } | 329 | } |
| 253 | return -EBUSY; | 330 | return -EBUSY; |
| @@ -278,10 +355,10 @@ void free_irq(unsigned int irq, void *dev_id) | |||
| 278 | return; | 355 | return; |
| 279 | 356 | ||
| 280 | desc = irq_desc + irq; | 357 | desc = irq_desc + irq; |
| 281 | spin_lock_irqsave(&desc->lock,flags); | 358 | spin_lock_irqsave(&desc->lock, flags); |
| 282 | p = &desc->action; | 359 | p = &desc->action; |
| 283 | for (;;) { | 360 | for (;;) { |
| 284 | struct irqaction * action = *p; | 361 | struct irqaction *action = *p; |
| 285 | 362 | ||
| 286 | if (action) { | 363 | if (action) { |
| 287 | struct irqaction **pp = p; | 364 | struct irqaction **pp = p; |
| @@ -295,18 +372,18 @@ void free_irq(unsigned int irq, void *dev_id) | |||
| 295 | 372 | ||
| 296 | /* Currently used only by UML, might disappear one day.*/ | 373 | /* Currently used only by UML, might disappear one day.*/ |
| 297 | #ifdef CONFIG_IRQ_RELEASE_METHOD | 374 | #ifdef CONFIG_IRQ_RELEASE_METHOD |
| 298 | if (desc->handler->release) | 375 | if (desc->chip->release) |
| 299 | desc->handler->release(irq, dev_id); | 376 | desc->chip->release(irq, dev_id); |
| 300 | #endif | 377 | #endif |
| 301 | 378 | ||
| 302 | if (!desc->action) { | 379 | if (!desc->action) { |
| 303 | desc->status |= IRQ_DISABLED; | 380 | desc->status |= IRQ_DISABLED; |
| 304 | if (desc->handler->shutdown) | 381 | if (desc->chip->shutdown) |
| 305 | desc->handler->shutdown(irq); | 382 | desc->chip->shutdown(irq); |
| 306 | else | 383 | else |
| 307 | desc->handler->disable(irq); | 384 | desc->chip->disable(irq); |
| 308 | } | 385 | } |
| 309 | spin_unlock_irqrestore(&desc->lock,flags); | 386 | spin_unlock_irqrestore(&desc->lock, flags); |
| 310 | unregister_handler_proc(irq, action); | 387 | unregister_handler_proc(irq, action); |
| 311 | 388 | ||
| 312 | /* Make sure it's not being used on another CPU */ | 389 | /* Make sure it's not being used on another CPU */ |
| @@ -314,12 +391,11 @@ void free_irq(unsigned int irq, void *dev_id) | |||
| 314 | kfree(action); | 391 | kfree(action); |
| 315 | return; | 392 | return; |
| 316 | } | 393 | } |
| 317 | printk(KERN_ERR "Trying to free free IRQ%d\n",irq); | 394 | printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); |
| 318 | spin_unlock_irqrestore(&desc->lock,flags); | 395 | spin_unlock_irqrestore(&desc->lock, flags); |
| 319 | return; | 396 | return; |
| 320 | } | 397 | } |
| 321 | } | 398 | } |
| 322 | |||
| 323 | EXPORT_SYMBOL(free_irq); | 399 | EXPORT_SYMBOL(free_irq); |
| 324 | 400 | ||
| 325 | /** | 401 | /** |
| @@ -346,28 +422,36 @@ EXPORT_SYMBOL(free_irq); | |||
| 346 | * | 422 | * |
| 347 | * Flags: | 423 | * Flags: |
| 348 | * | 424 | * |
| 349 | * SA_SHIRQ Interrupt is shared | 425 | * IRQF_SHARED Interrupt is shared |
| 350 | * SA_INTERRUPT Disable local interrupts while processing | 426 | * IRQF_DISABLED Disable local interrupts while processing |
| 351 | * SA_SAMPLE_RANDOM The interrupt can be used for entropy | 427 | * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy |
| 352 | * | 428 | * |
| 353 | */ | 429 | */ |
| 354 | int request_irq(unsigned int irq, | 430 | int request_irq(unsigned int irq, |
| 355 | irqreturn_t (*handler)(int, void *, struct pt_regs *), | 431 | irqreturn_t (*handler)(int, void *, struct pt_regs *), |
| 356 | unsigned long irqflags, const char * devname, void *dev_id) | 432 | unsigned long irqflags, const char *devname, void *dev_id) |
| 357 | { | 433 | { |
| 358 | struct irqaction * action; | 434 | struct irqaction *action; |
| 359 | int retval; | 435 | int retval; |
| 360 | 436 | ||
| 437 | #ifdef CONFIG_LOCKDEP | ||
| 438 | /* | ||
| 439 | * Lockdep wants atomic interrupt handlers: | ||
| 440 | */ | ||
| 441 | irqflags |= SA_INTERRUPT; | ||
| 442 | #endif | ||
| 361 | /* | 443 | /* |
| 362 | * Sanity-check: shared interrupts must pass in a real dev-ID, | 444 | * Sanity-check: shared interrupts must pass in a real dev-ID, |
| 363 | * otherwise we'll have trouble later trying to figure out | 445 | * otherwise we'll have trouble later trying to figure out |
| 364 | * which interrupt is which (messes up the interrupt freeing | 446 | * which interrupt is which (messes up the interrupt freeing |
| 365 | * logic etc). | 447 | * logic etc). |
| 366 | */ | 448 | */ |
| 367 | if ((irqflags & SA_SHIRQ) && !dev_id) | 449 | if ((irqflags & IRQF_SHARED) && !dev_id) |
| 368 | return -EINVAL; | 450 | return -EINVAL; |
| 369 | if (irq >= NR_IRQS) | 451 | if (irq >= NR_IRQS) |
| 370 | return -EINVAL; | 452 | return -EINVAL; |
| 453 | if (irq_desc[irq].status & IRQ_NOREQUEST) | ||
| 454 | return -EINVAL; | ||
| 371 | if (!handler) | 455 | if (!handler) |
| 372 | return -EINVAL; | 456 | return -EINVAL; |
| 373 | 457 | ||
| @@ -390,6 +474,5 @@ int request_irq(unsigned int irq, | |||
| 390 | 474 | ||
| 391 | return retval; | 475 | return retval; |
| 392 | } | 476 | } |
| 393 | |||
| 394 | EXPORT_SYMBOL(request_irq); | 477 | EXPORT_SYMBOL(request_irq); |
| 395 | 478 | ||
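To tie the flag documentation above to a call site: a sketch of requesting a shared, level-triggered line with the three-argument handler signature this tree still uses. The device, helper and IRQ number are hypothetical; note that every sharer of the line must pass IRQF_SHARED, the same IRQF_TRIGGER_* bits and a non-NULL dev_id, or setup_irq() above takes the mismatch path.

static irqreturn_t mydev_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
        struct mydev *dev = dev_id;

        if (!mydev_irq_pending(dev))    /* hypothetical "is it ours?" check */
                return IRQ_NONE;        /* shared line: not our device */
        /* ... acknowledge and handle the device ... */
        return IRQ_HANDLED;
}

        err = request_irq(dev->irq, mydev_interrupt,
                          IRQF_SHARED | IRQF_TRIGGER_LOW, "mydev", dev);
        if (err)
                goto fail;
        /* ... */
        free_irq(dev->irq, dev);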
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index a12d00eb5e7c..a57ebe9fa6f6 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
| @@ -3,19 +3,19 @@ | |||
| 3 | 3 | ||
| 4 | void set_pending_irq(unsigned int irq, cpumask_t mask) | 4 | void set_pending_irq(unsigned int irq, cpumask_t mask) |
| 5 | { | 5 | { |
| 6 | irq_desc_t *desc = irq_desc + irq; | 6 | struct irq_desc *desc = irq_desc + irq; |
| 7 | unsigned long flags; | 7 | unsigned long flags; |
| 8 | 8 | ||
| 9 | spin_lock_irqsave(&desc->lock, flags); | 9 | spin_lock_irqsave(&desc->lock, flags); |
| 10 | desc->move_irq = 1; | 10 | desc->move_irq = 1; |
| 11 | pending_irq_cpumask[irq] = mask; | 11 | irq_desc[irq].pending_mask = mask; |
| 12 | spin_unlock_irqrestore(&desc->lock, flags); | 12 | spin_unlock_irqrestore(&desc->lock, flags); |
| 13 | } | 13 | } |
| 14 | 14 | ||
| 15 | void move_native_irq(int irq) | 15 | void move_native_irq(int irq) |
| 16 | { | 16 | { |
| 17 | struct irq_desc *desc = irq_desc + irq; | ||
| 17 | cpumask_t tmp; | 18 | cpumask_t tmp; |
| 18 | irq_desc_t *desc = irq_descp(irq); | ||
| 19 | 19 | ||
| 20 | if (likely(!desc->move_irq)) | 20 | if (likely(!desc->move_irq)) |
| 21 | return; | 21 | return; |
| @@ -30,15 +30,15 @@ void move_native_irq(int irq) | |||
| 30 | 30 | ||
| 31 | desc->move_irq = 0; | 31 | desc->move_irq = 0; |
| 32 | 32 | ||
| 33 | if (unlikely(cpus_empty(pending_irq_cpumask[irq]))) | 33 | if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) |
| 34 | return; | 34 | return; |
| 35 | 35 | ||
| 36 | if (!desc->handler->set_affinity) | 36 | if (!desc->chip->set_affinity) |
| 37 | return; | 37 | return; |
| 38 | 38 | ||
| 39 | assert_spin_locked(&desc->lock); | 39 | assert_spin_locked(&desc->lock); |
| 40 | 40 | ||
| 41 | cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); | 41 | cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); |
| 42 | 42 | ||
| 43 | /* | 43 | /* |
| 44 | * If there was a valid mask to work with, please | 44 | * If there was a valid mask to work with, please |
| @@ -51,12 +51,12 @@ void move_native_irq(int irq) | |||
| 51 | */ | 51 | */ |
| 52 | if (likely(!cpus_empty(tmp))) { | 52 | if (likely(!cpus_empty(tmp))) { |
| 53 | if (likely(!(desc->status & IRQ_DISABLED))) | 53 | if (likely(!(desc->status & IRQ_DISABLED))) |
| 54 | desc->handler->disable(irq); | 54 | desc->chip->disable(irq); |
| 55 | 55 | ||
| 56 | desc->handler->set_affinity(irq,tmp); | 56 | desc->chip->set_affinity(irq,tmp); |
| 57 | 57 | ||
| 58 | if (likely(!(desc->status & IRQ_DISABLED))) | 58 | if (likely(!(desc->status & IRQ_DISABLED))) |
| 59 | desc->handler->enable(irq); | 59 | desc->chip->enable(irq); |
| 60 | } | 60 | } |
| 61 | cpus_clear(pending_irq_cpumask[irq]); | 61 | cpus_clear(irq_desc[irq].pending_mask); |
| 62 | } | 62 | } |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index afacd6f585fa..607c7809ad01 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -12,15 +12,10 @@ | |||
| 12 | 12 | ||
| 13 | #include "internals.h" | 13 | #include "internals.h" |
| 14 | 14 | ||
| 15 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | 15 | static struct proc_dir_entry *root_irq_dir; |
| 16 | 16 | ||
| 17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
| 18 | 18 | ||
| 19 | /* | ||
| 20 | * The /proc/irq/<irq>/smp_affinity values: | ||
| 21 | */ | ||
| 22 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; | ||
| 23 | |||
| 24 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 19 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
| 25 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 20 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
| 26 | { | 21 | { |
| @@ -36,15 +31,15 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | |||
| 36 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 31 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
| 37 | { | 32 | { |
| 38 | set_balance_irq_affinity(irq, mask_val); | 33 | set_balance_irq_affinity(irq, mask_val); |
| 39 | irq_affinity[irq] = mask_val; | 34 | irq_desc[irq].affinity = mask_val; |
| 40 | irq_desc[irq].handler->set_affinity(irq, mask_val); | 35 | irq_desc[irq].chip->set_affinity(irq, mask_val); |
| 41 | } | 36 | } |
| 42 | #endif | 37 | #endif |
| 43 | 38 | ||
| 44 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 39 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
| 45 | int count, int *eof, void *data) | 40 | int count, int *eof, void *data) |
| 46 | { | 41 | { |
| 47 | int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); | 42 | int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); |
| 48 | 43 | ||
| 49 | if (count - len < 2) | 44 | if (count - len < 2) |
| 50 | return -EINVAL; | 45 | return -EINVAL; |
| @@ -59,7 +54,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
| 59 | unsigned int irq = (int)(long)data, full_count = count, err; | 54 | unsigned int irq = (int)(long)data, full_count = count, err; |
| 60 | cpumask_t new_value, tmp; | 55 | cpumask_t new_value, tmp; |
| 61 | 56 | ||
| 62 | if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) | 57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) |
| 63 | return -EIO; | 58 | return -EIO; |
| 64 | 59 | ||
| 65 | err = cpumask_parse(buffer, count, new_value); | 60 | err = cpumask_parse(buffer, count, new_value); |
| @@ -102,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
| 102 | { | 97 | { |
| 103 | char name [MAX_NAMELEN]; | 98 | char name [MAX_NAMELEN]; |
| 104 | 99 | ||
| 105 | if (!irq_dir[irq] || action->dir || !action->name || | 100 | if (!irq_desc[irq].dir || action->dir || !action->name || |
| 106 | !name_unique(irq, action)) | 101 | !name_unique(irq, action)) |
| 107 | return; | 102 | return; |
| 108 | 103 | ||
| @@ -110,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
| 110 | snprintf(name, MAX_NAMELEN, "%s", action->name); | 105 | snprintf(name, MAX_NAMELEN, "%s", action->name); |
| 111 | 106 | ||
| 112 | /* create /proc/irq/1234/handler/ */ | 107 | /* create /proc/irq/1234/handler/ */ |
| 113 | action->dir = proc_mkdir(name, irq_dir[irq]); | 108 | action->dir = proc_mkdir(name, irq_desc[irq].dir); |
| 114 | } | 109 | } |
| 115 | 110 | ||
| 116 | #undef MAX_NAMELEN | 111 | #undef MAX_NAMELEN |
| @@ -122,22 +117,22 @@ void register_irq_proc(unsigned int irq) | |||
| 122 | char name [MAX_NAMELEN]; | 117 | char name [MAX_NAMELEN]; |
| 123 | 118 | ||
| 124 | if (!root_irq_dir || | 119 | if (!root_irq_dir || |
| 125 | (irq_desc[irq].handler == &no_irq_type) || | 120 | (irq_desc[irq].chip == &no_irq_chip) || |
| 126 | irq_dir[irq]) | 121 | irq_desc[irq].dir) |
| 127 | return; | 122 | return; |
| 128 | 123 | ||
| 129 | memset(name, 0, MAX_NAMELEN); | 124 | memset(name, 0, MAX_NAMELEN); |
| 130 | sprintf(name, "%d", irq); | 125 | sprintf(name, "%d", irq); |
| 131 | 126 | ||
| 132 | /* create /proc/irq/1234 */ | 127 | /* create /proc/irq/1234 */ |
| 133 | irq_dir[irq] = proc_mkdir(name, root_irq_dir); | 128 | irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); |
| 134 | 129 | ||
| 135 | #ifdef CONFIG_SMP | 130 | #ifdef CONFIG_SMP |
| 136 | { | 131 | { |
| 137 | struct proc_dir_entry *entry; | 132 | struct proc_dir_entry *entry; |
| 138 | 133 | ||
| 139 | /* create /proc/irq/<irq>/smp_affinity */ | 134 | /* create /proc/irq/<irq>/smp_affinity */ |
| 140 | entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); | 135 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); |
| 141 | 136 | ||
| 142 | if (entry) { | 137 | if (entry) { |
| 143 | entry->nlink = 1; | 138 | entry->nlink = 1; |
| @@ -145,7 +140,6 @@ void register_irq_proc(unsigned int irq) | |||
| 145 | entry->read_proc = irq_affinity_read_proc; | 140 | entry->read_proc = irq_affinity_read_proc; |
| 146 | entry->write_proc = irq_affinity_write_proc; | 141 | entry->write_proc = irq_affinity_write_proc; |
| 147 | } | 142 | } |
| 148 | smp_affinity_entry[irq] = entry; | ||
| 149 | } | 143 | } |
| 150 | #endif | 144 | #endif |
| 151 | } | 145 | } |
| @@ -155,7 +149,7 @@ void register_irq_proc(unsigned int irq) | |||
| 155 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) | 149 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) |
| 156 | { | 150 | { |
| 157 | if (action->dir) | 151 | if (action->dir) |
| 158 | remove_proc_entry(action->dir->name, irq_dir[irq]); | 152 | remove_proc_entry(action->dir->name, irq_desc[irq].dir); |
| 159 | } | 153 | } |
| 160 | 154 | ||
| 161 | void init_irq_proc(void) | 155 | void init_irq_proc(void) |
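For context, the read/write handlers above back /proc/irq/<irq>/smp_affinity, which userspace (typically an irqbalance daemon or an administrator) drives with a hex CPU mask. A small userspace sketch, with the IRQ number and mask purely illustrative; the write fails with EIO if the chip has no set_affinity hook:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/irq/19/smp_affinity", "w");      /* example IRQ */

        if (!f) {
                perror("smp_affinity");
                return 1;
        }
        fprintf(f, "1\n");      /* mask 0x1: steer the IRQ to CPU 0 */
        return fclose(f) ? 1 : 0;
}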
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c new file mode 100644 index 000000000000..35f10f7ff94a --- /dev/null +++ b/kernel/irq/resend.c | |||
| @@ -0,0 +1,77 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/irq/resend.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
| 5 | * Copyright (C) 2005-2006, Thomas Gleixner | ||
| 6 | * | ||
| 7 | * This file contains the IRQ-resend code | ||
| 8 | * | ||
| 9 | * If the interrupt is waiting to be processed, we try to re-run it. | ||
| 10 | * We can't directly run it from here since the caller might be in an | ||
| 11 | * interrupt-protected region. Not all irq controller chips can | ||
| 12 | * retrigger interrupts at the hardware level, so in those cases | ||
| 13 | * we allow the resending of IRQs via a tasklet. | ||
| 14 | */ | ||
| 15 | |||
| 16 | #include <linux/irq.h> | ||
| 17 | #include <linux/module.h> | ||
| 18 | #include <linux/random.h> | ||
| 19 | #include <linux/interrupt.h> | ||
| 20 | |||
| 21 | #include "internals.h" | ||
| 22 | |||
| 23 | #ifdef CONFIG_HARDIRQS_SW_RESEND | ||
| 24 | |||
| 25 | /* Bitmap to handle software resend of interrupts: */ | ||
| 26 | static DECLARE_BITMAP(irqs_resend, NR_IRQS); | ||
| 27 | |||
| 28 | /* | ||
| 29 | * Run software resends of IRQs | ||
| 30 | */ | ||
| 31 | static void resend_irqs(unsigned long arg) | ||
| 32 | { | ||
| 33 | struct irq_desc *desc; | ||
| 34 | int irq; | ||
| 35 | |||
| 36 | while (!bitmap_empty(irqs_resend, NR_IRQS)) { | ||
| 37 | irq = find_first_bit(irqs_resend, NR_IRQS); | ||
| 38 | clear_bit(irq, irqs_resend); | ||
| 39 | desc = irq_desc + irq; | ||
| 40 | local_irq_disable(); | ||
| 41 | desc->handle_irq(irq, desc, NULL); | ||
| 42 | local_irq_enable(); | ||
| 43 | } | ||
| 44 | } | ||
| 45 | |||
| 46 | /* Tasklet to handle resend: */ | ||
| 47 | static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); | ||
| 48 | |||
| 49 | #endif | ||
| 50 | |||
| 51 | /* | ||
| 52 | * IRQ resend | ||
| 53 | * | ||
| 54 | * Is called with interrupts disabled and desc->lock held. | ||
| 55 | */ | ||
| 56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) | ||
| 57 | { | ||
| 58 | unsigned int status = desc->status; | ||
| 59 | |||
| 60 | /* | ||
| 61 | * Make sure the interrupt is enabled, before resending it: | ||
| 62 | */ | ||
| 63 | desc->chip->enable(irq); | ||
| 64 | |||
| 65 | if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | ||
| 66 | desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; | ||
| 67 | |||
| 68 | if (!desc->chip || !desc->chip->retrigger || | ||
| 69 | !desc->chip->retrigger(irq)) { | ||
| 70 | #ifdef CONFIG_HARDIRQS_SW_RESEND | ||
| 71 | /* Set it pending and activate the softirq: */ | ||
| 72 | set_bit(irq, irqs_resend); | ||
| 73 | tasklet_schedule(&resend_tasklet); | ||
| 74 | #endif | ||
| 75 | } | ||
| 76 | } | ||
| 77 | } | ||
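check_irq_resend() prefers a hardware retrigger and only falls back to the tasklet when the chip cannot provide one. Purely as an illustration of what such a hook looks like, here is a hypothetical irq_chip whose controller has a software set-pending register (the register, base and chip name are invented):

static int mychip_retrigger(unsigned int irq)
{
        /* Hypothetical controller: setting the bit re-asserts the IRQ. */
        writel(1 << (irq - MYCHIP_IRQ_BASE), MYCHIP_SET_PENDING_REG);
        return 1;       /* non-zero: resend handled in hardware */
}

static struct irq_chip mychip = {
        .name           = "mychip",
        /* .ack, .mask, .unmask, .set_type, ... */
        .retrigger      = mychip_retrigger,
};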
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index b2fb3c18d06b..417e98092cf2 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
| @@ -16,39 +16,39 @@ static int irqfixup __read_mostly; | |||
| 16 | /* | 16 | /* |
| 17 | * Recovery handler for misrouted interrupts. | 17 | * Recovery handler for misrouted interrupts. |
| 18 | */ | 18 | */ |
| 19 | |||
| 20 | static int misrouted_irq(int irq, struct pt_regs *regs) | 19 | static int misrouted_irq(int irq, struct pt_regs *regs) |
| 21 | { | 20 | { |
| 22 | int i; | 21 | int i; |
| 23 | irq_desc_t *desc; | ||
| 24 | int ok = 0; | 22 | int ok = 0; |
| 25 | int work = 0; /* Did we do work for a real IRQ */ | 23 | int work = 0; /* Did we do work for a real IRQ */ |
| 26 | 24 | ||
| 27 | for(i = 1; i < NR_IRQS; i++) { | 25 | for (i = 1; i < NR_IRQS; i++) { |
| 26 | struct irq_desc *desc = irq_desc + i; | ||
| 28 | struct irqaction *action; | 27 | struct irqaction *action; |
| 29 | 28 | ||
| 30 | if (i == irq) /* Already tried */ | 29 | if (i == irq) /* Already tried */ |
| 31 | continue; | 30 | continue; |
| 32 | desc = &irq_desc[i]; | 31 | |
| 33 | spin_lock(&desc->lock); | 32 | spin_lock(&desc->lock); |
| 34 | action = desc->action; | ||
| 35 | /* Already running on another processor */ | 33 | /* Already running on another processor */ |
| 36 | if (desc->status & IRQ_INPROGRESS) { | 34 | if (desc->status & IRQ_INPROGRESS) { |
| 37 | /* | 35 | /* |
| 38 | * Already running: If it is shared get the other | 36 | * Already running: If it is shared get the other |
| 39 | * CPU to go looking for our mystery interrupt too | 37 | * CPU to go looking for our mystery interrupt too |
| 40 | */ | 38 | */ |
| 41 | if (desc->action && (desc->action->flags & SA_SHIRQ)) | 39 | if (desc->action && (desc->action->flags & IRQF_SHARED)) |
| 42 | desc->status |= IRQ_PENDING; | 40 | desc->status |= IRQ_PENDING; |
| 43 | spin_unlock(&desc->lock); | 41 | spin_unlock(&desc->lock); |
| 44 | continue; | 42 | continue; |
| 45 | } | 43 | } |
| 46 | /* Honour the normal IRQ locking */ | 44 | /* Honour the normal IRQ locking */ |
| 47 | desc->status |= IRQ_INPROGRESS; | 45 | desc->status |= IRQ_INPROGRESS; |
| 46 | action = desc->action; | ||
| 48 | spin_unlock(&desc->lock); | 47 | spin_unlock(&desc->lock); |
| 48 | |||
| 49 | while (action) { | 49 | while (action) { |
| 50 | /* Only shared IRQ handlers are safe to call */ | 50 | /* Only shared IRQ handlers are safe to call */ |
| 51 | if (action->flags & SA_SHIRQ) { | 51 | if (action->flags & IRQF_SHARED) { |
| 52 | if (action->handler(i, action->dev_id, regs) == | 52 | if (action->handler(i, action->dev_id, regs) == |
| 53 | IRQ_HANDLED) | 53 | IRQ_HANDLED) |
| 54 | ok = 1; | 54 | ok = 1; |
| @@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) | |||
| 62 | 62 | ||
| 63 | /* | 63 | /* |
| 64 | * While we were looking for a fixup someone queued a real | 64 | * While we were looking for a fixup someone queued a real |
| 65 | * IRQ clashing with our walk | 65 | * IRQ clashing with our walk: |
| 66 | */ | 66 | */ |
| 67 | |||
| 68 | while ((desc->status & IRQ_PENDING) && action) { | 67 | while ((desc->status & IRQ_PENDING) && action) { |
| 69 | /* | 68 | /* |
| 70 | * Perform real IRQ processing for the IRQ we deferred | 69 | * Perform real IRQ processing for the IRQ we deferred |
| @@ -80,8 +79,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) | |||
| 80 | * If we did actual work for the real IRQ line we must let the | 79 | * If we did actual work for the real IRQ line we must let the |
| 81 | * IRQ controller clean up too | 80 | * IRQ controller clean up too |
| 82 | */ | 81 | */ |
| 83 | if(work) | 82 | if (work && desc->chip && desc->chip->end) |
| 84 | desc->handler->end(i); | 83 | desc->chip->end(i); |
| 85 | spin_unlock(&desc->lock); | 84 | spin_unlock(&desc->lock); |
| 86 | } | 85 | } |
| 87 | /* So the caller can adjust the irq error counts */ | 86 | /* So the caller can adjust the irq error counts */ |
| @@ -100,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) | |||
| 100 | */ | 99 | */ |
| 101 | 100 | ||
| 102 | static void | 101 | static void |
| 103 | __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | 102 | __report_bad_irq(unsigned int irq, struct irq_desc *desc, |
| 103 | irqreturn_t action_ret) | ||
| 104 | { | 104 | { |
| 105 | struct irqaction *action; | 105 | struct irqaction *action; |
| 106 | 106 | ||
| @@ -113,6 +113,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
| 113 | } | 113 | } |
| 114 | dump_stack(); | 114 | dump_stack(); |
| 115 | printk(KERN_ERR "handlers:\n"); | 115 | printk(KERN_ERR "handlers:\n"); |
| 116 | |||
| 116 | action = desc->action; | 117 | action = desc->action; |
| 117 | while (action) { | 118 | while (action) { |
| 118 | printk(KERN_ERR "[<%p>]", action->handler); | 119 | printk(KERN_ERR "[<%p>]", action->handler); |
| @@ -123,7 +124,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
| 123 | } | 124 | } |
| 124 | } | 125 | } |
| 125 | 126 | ||
| 126 | static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | 127 | static void |
| 128 | report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) | ||
| 127 | { | 129 | { |
| 128 | static int count = 100; | 130 | static int count = 100; |
| 129 | 131 | ||
| @@ -133,8 +135,8 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio | |||
| 133 | } | 135 | } |
| 134 | } | 136 | } |
| 135 | 137 | ||
| 136 | void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, | 138 | void note_interrupt(unsigned int irq, struct irq_desc *desc, |
| 137 | struct pt_regs *regs) | 139 | irqreturn_t action_ret, struct pt_regs *regs) |
| 138 | { | 140 | { |
| 139 | if (unlikely(action_ret != IRQ_HANDLED)) { | 141 | if (unlikely(action_ret != IRQ_HANDLED)) { |
| 140 | desc->irqs_unhandled++; | 142 | desc->irqs_unhandled++; |
| @@ -166,7 +168,8 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, | |||
| 166 | */ | 168 | */ |
| 167 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); | 169 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); |
| 168 | desc->status |= IRQ_DISABLED; | 170 | desc->status |= IRQ_DISABLED; |
| 169 | desc->handler->disable(irq); | 171 | desc->depth = 1; |
| 172 | desc->chip->disable(irq); | ||
| 170 | } | 173 | } |
| 171 | desc->irqs_unhandled = 0; | 174 | desc->irqs_unhandled = 0; |
| 172 | } | 175 | } |
| @@ -177,6 +180,7 @@ int __init noirqdebug_setup(char *str) | |||
| 177 | { | 180 | { |
| 178 | noirqdebug = 1; | 181 | noirqdebug = 1; |
| 179 | printk(KERN_INFO "IRQ lockup detection disabled\n"); | 182 | printk(KERN_INFO "IRQ lockup detection disabled\n"); |
| 183 | |||
| 180 | return 1; | 184 | return 1; |
| 181 | } | 185 | } |
| 182 | 186 | ||
| @@ -187,6 +191,7 @@ static int __init irqfixup_setup(char *str) | |||
| 187 | irqfixup = 1; | 191 | irqfixup = 1; |
| 188 | printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); | 192 | printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); |
| 189 | printk(KERN_WARNING "This may impact system performance.\n"); | 193 | printk(KERN_WARNING "This may impact system performance.\n"); |
| 194 | |||
| 190 | return 1; | 195 | return 1; |
| 191 | } | 196 | } |
| 192 | 197 | ||
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 39277dd6bf90..ab16a5a4cfe9 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter) | |||
| 275 | static int get_ksymbol_mod(struct kallsym_iter *iter) | 275 | static int get_ksymbol_mod(struct kallsym_iter *iter) |
| 276 | { | 276 | { |
| 277 | iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, | 277 | iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, |
| 278 | &iter->value, | 278 | &iter->value, &iter->type, |
| 279 | &iter->type, iter->name); | 279 | iter->name, sizeof(iter->name)); |
| 280 | if (iter->owner == NULL) | 280 | if (iter->owner == NULL) |
| 281 | return 0; | 281 | return 0; |
| 282 | 282 | ||
diff --git a/kernel/kexec.c b/kernel/kexec.c index 58f0f382597c..fcdd5d2bc3f4 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -40,7 +40,7 @@ struct resource crashk_res = { | |||
| 40 | 40 | ||
| 41 | int kexec_should_crash(struct task_struct *p) | 41 | int kexec_should_crash(struct task_struct *p) |
| 42 | { | 42 | { |
| 43 | if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) | 43 | if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops) |
| 44 | return 1; | 44 | return 1; |
| 45 | return 0; | 45 | return 0; |
| 46 | } | 46 | } |
| @@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, | |||
| 995 | image = xchg(dest_image, image); | 995 | image = xchg(dest_image, image); |
| 996 | 996 | ||
| 997 | out: | 997 | out: |
| 998 | xchg(&kexec_lock, 0); /* Release the mutex */ | 998 | locked = xchg(&kexec_lock, 0); /* Release the mutex */ |
| 999 | BUG_ON(!locked); | ||
| 999 | kimage_free(image); | 1000 | kimage_free(image); |
| 1000 | 1001 | ||
| 1001 | return result; | 1002 | return result; |
| @@ -1042,7 +1043,6 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, | |||
| 1042 | 1043 | ||
| 1043 | void crash_kexec(struct pt_regs *regs) | 1044 | void crash_kexec(struct pt_regs *regs) |
| 1044 | { | 1045 | { |
| 1045 | struct kimage *image; | ||
| 1046 | int locked; | 1046 | int locked; |
| 1047 | 1047 | ||
| 1048 | 1048 | ||
| @@ -1056,14 +1056,14 @@ void crash_kexec(struct pt_regs *regs) | |||
| 1056 | */ | 1056 | */ |
| 1057 | locked = xchg(&kexec_lock, 1); | 1057 | locked = xchg(&kexec_lock, 1); |
| 1058 | if (!locked) { | 1058 | if (!locked) { |
| 1059 | image = xchg(&kexec_crash_image, NULL); | 1059 | if (kexec_crash_image) { |
| 1060 | if (image) { | ||
| 1061 | struct pt_regs fixed_regs; | 1060 | struct pt_regs fixed_regs; |
| 1062 | crash_setup_regs(&fixed_regs, regs); | 1061 | crash_setup_regs(&fixed_regs, regs); |
| 1063 | machine_crash_shutdown(&fixed_regs); | 1062 | machine_crash_shutdown(&fixed_regs); |
| 1064 | machine_kexec(image); | 1063 | machine_kexec(kexec_crash_image); |
| 1065 | } | 1064 | } |
| 1066 | xchg(&kexec_lock, 0); | 1065 | locked = xchg(&kexec_lock, 0); |
| 1066 | BUG_ON(!locked); | ||
| 1067 | } | 1067 | } |
| 1068 | } | 1068 | } |
| 1069 | 1069 | ||
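The kexec changes keep the existing xchg()-based trylock for kexec_lock but now BUG() if the release finds the lock was not actually held. Stripped of the kexec specifics, the pattern is a sketch like this:

static int example_lock;        /* 0 = free, 1 = held */

static void example_critical_section(void)
{
        int locked = xchg(&example_lock, 1);

        if (locked)
                return;         /* already held elsewhere: bail out, never spin */

        /* ... work that must not run concurrently or recursively ... */

        locked = xchg(&example_lock, 0);
        BUG_ON(!locked);        /* catches an unbalanced release */
}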
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 64ab045c3d9d..5d1d907378a2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
| @@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo, | |||
| 122 | 122 | ||
| 123 | len = min(len, fifo->size - fifo->in + fifo->out); | 123 | len = min(len, fifo->size - fifo->in + fifo->out); |
| 124 | 124 | ||
| 125 | /* | ||
| 126 | * Ensure that we sample the fifo->out index -before- we | ||
| 127 | * start putting bytes into the kfifo. | ||
| 128 | */ | ||
| 129 | |||
| 130 | smp_mb(); | ||
| 131 | |||
| 125 | /* first put the data starting from fifo->in to buffer end */ | 132 | /* first put the data starting from fifo->in to buffer end */ |
| 126 | l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); | 133 | l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); |
| 127 | memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); | 134 | memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); |
| @@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo, | |||
| 129 | /* then put the rest (if any) at the beginning of the buffer */ | 136 | /* then put the rest (if any) at the beginning of the buffer */ |
| 130 | memcpy(fifo->buffer, buffer + l, len - l); | 137 | memcpy(fifo->buffer, buffer + l, len - l); |
| 131 | 138 | ||
| 139 | /* | ||
| 140 | * Ensure that we add the bytes to the kfifo -before- | ||
| 141 | * we update the fifo->in index. | ||
| 142 | */ | ||
| 143 | |||
| 144 | smp_wmb(); | ||
| 145 | |||
| 132 | fifo->in += len; | 146 | fifo->in += len; |
| 133 | 147 | ||
| 134 | return len; | 148 | return len; |
| @@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo, | |||
| 154 | 168 | ||
| 155 | len = min(len, fifo->in - fifo->out); | 169 | len = min(len, fifo->in - fifo->out); |
| 156 | 170 | ||
| 171 | /* | ||
| 172 | * Ensure that we sample the fifo->in index -before- we | ||
| 173 | * start removing bytes from the kfifo. | ||
| 174 | */ | ||
| 175 | |||
| 176 | smp_rmb(); | ||
| 177 | |||
| 157 | /* first get the data from fifo->out until the end of the buffer */ | 178 | /* first get the data from fifo->out until the end of the buffer */ |
| 158 | l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); | 179 | l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); |
| 159 | memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); | 180 | memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); |
| @@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo, | |||
| 161 | /* then get the rest (if any) from the beginning of the buffer */ | 182 | /* then get the rest (if any) from the beginning of the buffer */ |
| 162 | memcpy(buffer + l, fifo->buffer, len - l); | 183 | memcpy(buffer + l, fifo->buffer, len - l); |
| 163 | 184 | ||
| 185 | /* | ||
| 186 | * Ensure that we remove the bytes from the kfifo -before- | ||
| 187 | * we update the fifo->out index. | ||
| 188 | */ | ||
| 189 | |||
| 190 | smp_mb(); | ||
| 191 | |||
| 164 | fifo->out += len; | 192 | fifo->out += len; |
| 165 | 193 | ||
| 166 | return len; | 194 | return len; |
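The barriers above are what make the unlocked __kfifo_put()/__kfifo_get() pair safe for exactly one producer and one consumer running concurrently. A minimal sketch with the kfifo API of this tree; the sizes, buffers (data, len, buf, got) and call sites are placeholders, and the spinlock is only consumed by the locked kfifo_put()/kfifo_get() wrappers:

#include <linux/kfifo.h>

static DEFINE_SPINLOCK(fifo_lock);
static struct kfifo *fifo;

        fifo = kfifo_alloc(4096, GFP_KERNEL, &fifo_lock);
        if (IS_ERR(fifo))
                return PTR_ERR(fifo);

        /* single producer, e.g. an interrupt handler: */
        __kfifo_put(fifo, data, len);

        /* single consumer, e.g. a workqueue function: */
        got = __kfifo_get(fifo, buf, sizeof(buf));

        kfifo_free(fifo);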
diff --git a/kernel/kmod.c b/kernel/kmod.c index 20a997c73c3d..842f8015d7fd 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -20,7 +20,6 @@ | |||
| 20 | */ | 20 | */ |
| 21 | #define __KERNEL_SYSCALLS__ | 21 | #define __KERNEL_SYSCALLS__ |
| 22 | 22 | ||
| 23 | #include <linux/config.h> | ||
| 24 | #include <linux/module.h> | 23 | #include <linux/module.h> |
| 25 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
| 26 | #include <linux/syscalls.h> | 25 | #include <linux/syscalls.h> |
| @@ -177,6 +176,8 @@ static int wait_for_helper(void *data) | |||
| 177 | if (pid < 0) { | 176 | if (pid < 0) { |
| 178 | sub_info->retval = pid; | 177 | sub_info->retval = pid; |
| 179 | } else { | 178 | } else { |
| 179 | int ret; | ||
| 180 | |||
| 180 | /* | 181 | /* |
| 181 | * Normally it is bogus to call wait4() from in-kernel because | 182 | * Normally it is bogus to call wait4() from in-kernel because |
| 182 | * wait4() wants to write the exit code to a userspace address. | 183 | * wait4() wants to write the exit code to a userspace address. |
| @@ -186,7 +187,15 @@ static int wait_for_helper(void *data) | |||
| 186 | * | 187 | * |
| 187 | * Thus the __user pointer cast is valid here. | 188 | * Thus the __user pointer cast is valid here. |
| 188 | */ | 189 | */ |
| 189 | sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL); | 190 | sys_wait4(pid, (int __user *)&ret, 0, NULL); |
| 191 | |||
| 192 | /* | ||
| 193 | * If ret is 0, either ____call_usermodehelper failed and the | ||
| 194 | * real error code is already in sub_info->retval or | ||
| 195 | * sub_info->retval is 0 anyway, so don't mess with it then. | ||
| 196 | */ | ||
| 197 | if (ret) | ||
| 198 | sub_info->retval = ret; | ||
| 190 | } | 199 | } |
| 191 | 200 | ||
| 192 | complete(sub_info->complete); | 201 | complete(sub_info->complete); |
| @@ -198,11 +207,12 @@ static void __call_usermodehelper(void *data) | |||
| 198 | { | 207 | { |
| 199 | struct subprocess_info *sub_info = data; | 208 | struct subprocess_info *sub_info = data; |
| 200 | pid_t pid; | 209 | pid_t pid; |
| 210 | int wait = sub_info->wait; | ||
| 201 | 211 | ||
| 202 | /* CLONE_VFORK: wait until the usermode helper has execve'd | 212 | /* CLONE_VFORK: wait until the usermode helper has execve'd |
| 203 | * successfully. We need the data structures to stay around | 213 | * successfully. We need the data structures to stay around |
| 204 | * until that is done. */ | 214 | * until that is done. */ |
| 205 | if (sub_info->wait) | 215 | if (wait) |
| 206 | pid = kernel_thread(wait_for_helper, sub_info, | 216 | pid = kernel_thread(wait_for_helper, sub_info, |
| 207 | CLONE_FS | CLONE_FILES | SIGCHLD); | 217 | CLONE_FS | CLONE_FILES | SIGCHLD); |
| 208 | else | 218 | else |
| @@ -212,7 +222,7 @@ static void __call_usermodehelper(void *data) | |||
| 212 | if (pid < 0) { | 222 | if (pid < 0) { |
| 213 | sub_info->retval = pid; | 223 | sub_info->retval = pid; |
| 214 | complete(sub_info->complete); | 224 | complete(sub_info->complete); |
| 215 | } else if (!sub_info->wait) | 225 | } else if (!wait) |
| 216 | complete(sub_info->complete); | 226 | complete(sub_info->complete); |
| 217 | } | 227 | } |
| 218 | 228 | ||
| @@ -234,7 +244,7 @@ static void __call_usermodehelper(void *data) | |||
| 234 | int call_usermodehelper_keys(char *path, char **argv, char **envp, | 244 | int call_usermodehelper_keys(char *path, char **argv, char **envp, |
| 235 | struct key *session_keyring, int wait) | 245 | struct key *session_keyring, int wait) |
| 236 | { | 246 | { |
| 237 | DECLARE_COMPLETION(done); | 247 | DECLARE_COMPLETION_ONSTACK(done); |
| 238 | struct subprocess_info sub_info = { | 248 | struct subprocess_info sub_info = { |
| 239 | .complete = &done, | 249 | .complete = &done, |
| 240 | .path = path, | 250 | .path = path, |
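call_usermodehelper_keys(), and the call_usermodehelper() wrapper most callers use, is how kernel code launches a userspace helper and (with wait set) collects its exit status, which the wait_for_helper() change above now propagates correctly. A hedged sketch with a hypothetical helper path and arguments:

        char *argv[] = { "/sbin/my_helper", "--event", "added", NULL };
        char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
        int ret;

        ret = call_usermodehelper(argv[0], argv, envp, 1 /* wait */);
        if (ret)
                printk(KERN_WARNING "my_helper failed: %d\n", ret);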
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fbf466a29aa..3f57dfdc8f92 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -47,11 +47,17 @@ | |||
| 47 | 47 | ||
| 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
| 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
| 50 | static atomic_t kprobe_count; | ||
| 50 | 51 | ||
| 51 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 52 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
| 52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 53 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
| 53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 54 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
| 54 | 55 | ||
| 56 | static struct notifier_block kprobe_page_fault_nb = { | ||
| 57 | .notifier_call = kprobe_exceptions_notify, | ||
| 58 | .priority = 0x7fffffff /* we need to be notified first */ | ||
| 59 | }; | ||
| 60 | |||
| 55 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT | 61 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT |
| 56 | /* | 62 | /* |
| 57 | * kprobe->ainsn.insn points to the copy of the instruction to be | 63 | * kprobe->ainsn.insn points to the copy of the instruction to be |
| @@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
| 368 | */ | 374 | */ |
| 369 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | 375 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) |
| 370 | { | 376 | { |
| 371 | struct kprobe *kp; | ||
| 372 | |||
| 373 | if (p->break_handler) { | 377 | if (p->break_handler) { |
| 374 | list_for_each_entry_rcu(kp, &old_p->list, list) { | 378 | if (old_p->break_handler) |
| 375 | if (kp->break_handler) | 379 | return -EEXIST; |
| 376 | return -EEXIST; | ||
| 377 | } | ||
| 378 | list_add_tail_rcu(&p->list, &old_p->list); | 380 | list_add_tail_rcu(&p->list, &old_p->list); |
| 381 | old_p->break_handler = aggr_break_handler; | ||
| 379 | } else | 382 | } else |
| 380 | list_add_rcu(&p->list, &old_p->list); | 383 | list_add_rcu(&p->list, &old_p->list); |
| 384 | if (p->post_handler && !old_p->post_handler) | ||
| 385 | old_p->post_handler = aggr_post_handler; | ||
| 381 | return 0; | 386 | return 0; |
| 382 | } | 387 | } |
| 383 | 388 | ||
| @@ -388,11 +393,14 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
| 388 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | 393 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) |
| 389 | { | 394 | { |
| 390 | copy_kprobe(p, ap); | 395 | copy_kprobe(p, ap); |
| 396 | flush_insn_slot(ap); | ||
| 391 | ap->addr = p->addr; | 397 | ap->addr = p->addr; |
| 392 | ap->pre_handler = aggr_pre_handler; | 398 | ap->pre_handler = aggr_pre_handler; |
| 393 | ap->post_handler = aggr_post_handler; | ||
| 394 | ap->fault_handler = aggr_fault_handler; | 399 | ap->fault_handler = aggr_fault_handler; |
| 395 | ap->break_handler = aggr_break_handler; | 400 | if (p->post_handler) |
| 401 | ap->post_handler = aggr_post_handler; | ||
| 402 | if (p->break_handler) | ||
| 403 | ap->break_handler = aggr_break_handler; | ||
| 396 | 404 | ||
| 397 | INIT_LIST_HEAD(&ap->list); | 405 | INIT_LIST_HEAD(&ap->list); |
| 398 | list_add_rcu(&p->list, &ap->list); | 406 | list_add_rcu(&p->list, &ap->list); |
| @@ -464,6 +472,8 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
| 464 | old_p = get_kprobe(p->addr); | 472 | old_p = get_kprobe(p->addr); |
| 465 | if (old_p) { | 473 | if (old_p) { |
| 466 | ret = register_aggr_kprobe(old_p, p); | 474 | ret = register_aggr_kprobe(old_p, p); |
| 475 | if (!ret) | ||
| 476 | atomic_inc(&kprobe_count); | ||
| 467 | goto out; | 477 | goto out; |
| 468 | } | 478 | } |
| 469 | 479 | ||
| @@ -474,6 +484,10 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
| 474 | hlist_add_head_rcu(&p->hlist, | 484 | hlist_add_head_rcu(&p->hlist, |
| 475 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 485 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
| 476 | 486 | ||
| 487 | if (atomic_add_return(1, &kprobe_count) == \ | ||
| 488 | (ARCH_INACTIVE_KPROBE_COUNT + 1)) | ||
| 489 | register_page_fault_notifier(&kprobe_page_fault_nb); | ||
| 490 | |||
| 477 | arch_arm_kprobe(p); | 491 | arch_arm_kprobe(p); |
| 478 | 492 | ||
| 479 | out: | 493 | out: |
| @@ -536,14 +550,40 @@ valid_p: | |||
| 536 | kfree(old_p); | 550 | kfree(old_p); |
| 537 | } | 551 | } |
| 538 | arch_remove_kprobe(p); | 552 | arch_remove_kprobe(p); |
| 553 | } else { | ||
| 554 | mutex_lock(&kprobe_mutex); | ||
| 555 | if (p->break_handler) | ||
| 556 | old_p->break_handler = NULL; | ||
| 557 | if (p->post_handler){ | ||
| 558 | list_for_each_entry_rcu(list_p, &old_p->list, list){ | ||
| 559 | if (list_p->post_handler){ | ||
| 560 | cleanup_p = 2; | ||
| 561 | break; | ||
| 562 | } | ||
| 563 | } | ||
| 564 | if (cleanup_p == 0) | ||
| 565 | old_p->post_handler = NULL; | ||
| 566 | } | ||
| 567 | mutex_unlock(&kprobe_mutex); | ||
| 539 | } | 568 | } |
| 569 | |||
| 570 | /* Call unregister_page_fault_notifier() | ||
| 571 | * if no probes are active | ||
| 572 | */ | ||
| 573 | mutex_lock(&kprobe_mutex); | ||
| 574 | if (atomic_add_return(-1, &kprobe_count) == \ | ||
| 575 | ARCH_INACTIVE_KPROBE_COUNT) | ||
| 576 | unregister_page_fault_notifier(&kprobe_page_fault_nb); | ||
| 577 | mutex_unlock(&kprobe_mutex); | ||
| 578 | return; | ||
| 540 | } | 579 | } |
| 541 | 580 | ||
| 542 | static struct notifier_block kprobe_exceptions_nb = { | 581 | static struct notifier_block kprobe_exceptions_nb = { |
| 543 | .notifier_call = kprobe_exceptions_notify, | 582 | .notifier_call = kprobe_exceptions_notify, |
| 544 | .priority = 0x7fffffff /* we need to notified first */ | 583 | .priority = 0x7fffffff /* we need to be notified first */ |
| 545 | }; | 584 | }; |
| 546 | 585 | ||
| 586 | |||
| 547 | int __kprobes register_jprobe(struct jprobe *jp) | 587 | int __kprobes register_jprobe(struct jprobe *jp) |
| 548 | { | 588 | { |
| 549 | /* Todo: Verify probepoint is a function entry point */ | 589 | /* Todo: Verify probepoint is a function entry point */ |
| @@ -652,6 +692,7 @@ static int __init init_kprobes(void) | |||
| 652 | INIT_HLIST_HEAD(&kprobe_table[i]); | 692 | INIT_HLIST_HEAD(&kprobe_table[i]); |
| 653 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | 693 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); |
| 654 | } | 694 | } |
| 695 | atomic_set(&kprobe_count, 0); | ||
| 655 | 696 | ||
| 656 | err = arch_init_kprobes(); | 697 | err = arch_init_kprobes(); |
| 657 | if (!err) | 698 | if (!err) |
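For reference, a probe that supplies only a pre_handler (the case the aggregation changes above now support without forcing aggregate post/break handlers) is registered roughly as follows; resolving the probed address is left to the caller, and PROBE_ADDRESS is a stand-in:

#include <linux/kprobes.h>

static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        printk(KERN_INFO "kprobe hit at %p\n", p->addr);
        return 0;
}

static struct kprobe my_probe = {
        .pre_handler = my_pre_handler,
};

static int __init my_probe_init(void)
{
        my_probe.addr = (kprobe_opcode_t *)PROBE_ADDRESS;       /* assumed address lookup */
        return register_kprobe(&my_probe);
}

static void __exit my_probe_exit(void)
{
        unregister_kprobe(&my_probe);
}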
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 9e28478a17a5..e0ffe4ab0917 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -8,7 +8,6 @@ | |||
| 8 | * | 8 | * |
| 9 | */ | 9 | */ |
| 10 | 10 | ||
| 11 | #include <linux/config.h> | ||
| 12 | #include <linux/kobject.h> | 11 | #include <linux/kobject.h> |
| 13 | #include <linux/string.h> | 12 | #include <linux/string.h> |
| 14 | #include <linux/sysfs.h> | 13 | #include <linux/sysfs.h> |
diff --git a/kernel/kthread.c b/kernel/kthread.c index c5f3c6613b6d..4f9c60ef95e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -45,6 +45,13 @@ struct kthread_stop_info | |||
| 45 | static DEFINE_MUTEX(kthread_stop_lock); | 45 | static DEFINE_MUTEX(kthread_stop_lock); |
| 46 | static struct kthread_stop_info kthread_stop_info; | 46 | static struct kthread_stop_info kthread_stop_info; |
| 47 | 47 | ||
| 48 | /** | ||
| 49 | * kthread_should_stop - should this kthread return now? | ||
| 50 | * | ||
| 51 | * When someone calls kthread_stop on your kthread, it will be woken | ||
| 52 | * and this will return true. You should then return, and your return | ||
| 53 | * value will be passed through to kthread_stop(). | ||
| 54 | */ | ||
| 48 | int kthread_should_stop(void) | 55 | int kthread_should_stop(void) |
| 49 | { | 56 | { |
| 50 | return (kthread_stop_info.k == current); | 57 | return (kthread_stop_info.k == current); |
| @@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create) | |||
| 122 | complete(&create->done); | 129 | complete(&create->done); |
| 123 | } | 130 | } |
| 124 | 131 | ||
| 132 | /** | ||
| 133 | * kthread_create - create a kthread. | ||
| 134 | * @threadfn: the function to run until signal_pending(current). | ||
| 135 | * @data: data ptr for @threadfn. | ||
| 136 | * @namefmt: printf-style name for the thread. | ||
| 137 | * | ||
| 138 | * Description: This helper function creates and names a kernel | ||
| 139 | * thread. The thread will be stopped: use wake_up_process() to start | ||
| 140 | * it. See also kthread_run(), kthread_create_on_cpu(). | ||
| 141 | * | ||
| 142 | * When woken, the thread will run @threadfn() with @data as its | ||
| 143 | * argument. @threadfn can either call do_exit() directly if it is a | ||
| 144 | * standalone thread for which no one will call kthread_stop(), or | ||
| 145 | * return when 'kthread_should_stop()' is true (which means | ||
| 146 | * kthread_stop() has been called). The return value should be zero | ||
| 147 | * or a negative error number; it will be passed to kthread_stop(). | ||
| 148 | * | ||
| 149 | * Returns a task_struct or ERR_PTR(-ENOMEM). | ||
| 150 | */ | ||
| 125 | struct task_struct *kthread_create(int (*threadfn)(void *data), | 151 | struct task_struct *kthread_create(int (*threadfn)(void *data), |
| 126 | void *data, | 152 | void *data, |
| 127 | const char namefmt[], | 153 | const char namefmt[], |
| @@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
| 156 | } | 182 | } |
| 157 | EXPORT_SYMBOL(kthread_create); | 183 | EXPORT_SYMBOL(kthread_create); |
| 158 | 184 | ||
| 185 | /** | ||
| 186 | * kthread_bind - bind a just-created kthread to a cpu. | ||
| 187 | * @k: thread created by kthread_create(). | ||
| 188 | * @cpu: cpu (might not be online, must be possible) for @k to run on. | ||
| 189 | * | ||
| 190 | * Description: This function is equivalent to set_cpus_allowed(), | ||
| 191 | * except that @cpu doesn't need to be online, and the thread must be | ||
| 192 | * stopped (i.e., just returned from kthread_create()). | ||
| 193 | */ | ||
| 159 | void kthread_bind(struct task_struct *k, unsigned int cpu) | 194 | void kthread_bind(struct task_struct *k, unsigned int cpu) |
| 160 | { | 195 | { |
| 161 | BUG_ON(k->state != TASK_INTERRUPTIBLE); | 196 | BUG_ON(k->state != TASK_INTERRUPTIBLE); |
| @@ -166,14 +201,21 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) | |||
| 166 | } | 201 | } |
| 167 | EXPORT_SYMBOL(kthread_bind); | 202 | EXPORT_SYMBOL(kthread_bind); |
| 168 | 203 | ||
| 204 | /** | ||
| 205 | * kthread_stop - stop a thread created by kthread_create(). | ||
| 206 | * @k: thread created by kthread_create(). | ||
| 207 | * | ||
| 208 | * Sets kthread_should_stop() for @k to return true, wakes it, and | ||
| 209 | * waits for it to exit. Your threadfn() must not call do_exit() | ||
| 210 | * itself if you use this function! This can also be called after | ||
| 211 | * kthread_create() instead of calling wake_up_process(): the thread | ||
| 212 | * will exit without calling threadfn(). | ||
| 213 | * | ||
| 214 | * Returns the result of threadfn(), or %-EINTR if wake_up_process() | ||
| 215 | * was never called. | ||
| 216 | */ | ||
| 169 | int kthread_stop(struct task_struct *k) | 217 | int kthread_stop(struct task_struct *k) |
| 170 | { | 218 | { |
| 171 | return kthread_stop_sem(k, NULL); | ||
| 172 | } | ||
| 173 | EXPORT_SYMBOL(kthread_stop); | ||
| 174 | |||
| 175 | int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | ||
| 176 | { | ||
| 177 | int ret; | 219 | int ret; |
| 178 | 220 | ||
| 179 | mutex_lock(&kthread_stop_lock); | 221 | mutex_lock(&kthread_stop_lock); |
| @@ -187,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | |||
| 187 | 229 | ||
| 188 | /* Now set kthread_should_stop() to true, and wake it up. */ | 230 | /* Now set kthread_should_stop() to true, and wake it up. */ |
| 189 | kthread_stop_info.k = k; | 231 | kthread_stop_info.k = k; |
| 190 | if (s) | 232 | wake_up_process(k); |
| 191 | up(s); | ||
| 192 | else | ||
| 193 | wake_up_process(k); | ||
| 194 | put_task_struct(k); | 233 | put_task_struct(k); |
| 195 | 234 | ||
| 196 | /* Once it dies, reset stop ptr, gather result and we're done. */ | 235 | /* Once it dies, reset stop ptr, gather result and we're done. */ |
| @@ -201,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | |||
| 201 | 240 | ||
| 202 | return ret; | 241 | return ret; |
| 203 | } | 242 | } |
| 204 | EXPORT_SYMBOL(kthread_stop_sem); | 243 | EXPORT_SYMBOL(kthread_stop); |
| 205 | 244 | ||
| 206 | static __init int helper_init(void) | 245 | static __init int helper_init(void) |
| 207 | { | 246 | { |
| @@ -210,5 +249,5 @@ static __init int helper_init(void) | |||
| 210 | 249 | ||
| 211 | return 0; | 250 | return 0; |
| 212 | } | 251 | } |
| 213 | core_initcall(helper_init); | ||
| 214 | 252 | ||
| 253 | core_initcall(helper_init); | ||
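Putting the kerneldoc above together, the usual lifecycle is: create (or kthread_run() to create and wake in one step), loop until kthread_should_stop(), then let kthread_stop() collect the return value. A sketch with placeholder thread name and work:

static int my_thread_fn(void *data)
{
        while (!kthread_should_stop()) {
                /* ... one unit of work ... */
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(HZ);
        }
        return 0;       /* becomes the return value of kthread_stop() */
}

        struct task_struct *tsk = kthread_run(my_thread_fn, NULL, "my_thread");

        if (IS_ERR(tsk))
                return PTR_ERR(tsk);
        /* ... */
        kthread_stop(tsk);      /* sets kthread_should_stop(), wakes it, waits */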
diff --git a/kernel/lockdep.c b/kernel/lockdep.c new file mode 100644 index 000000000000..e596525669ed --- /dev/null +++ b/kernel/lockdep.c | |||
| @@ -0,0 +1,2724 @@ | |||
| 1 | /* | ||
| 2 | * kernel/lockdep.c | ||
| 3 | * | ||
| 4 | * Runtime locking correctness validator | ||
| 5 | * | ||
| 6 | * Started by Ingo Molnar: | ||
| 7 | * | ||
| 8 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 9 | * | ||
| 10 | * this code maps all the lock dependencies as they occur in a live kernel | ||
| 11 | * and will warn about the following classes of locking bugs: | ||
| 12 | * | ||
| 13 | * - lock inversion scenarios | ||
| 14 | * - circular lock dependencies | ||
| 15 | * - hardirq/softirq safe/unsafe locking bugs | ||
| 16 | * | ||
| 17 | * Bugs are reported even if the current locking scenario does not cause | ||
| 18 | * any deadlock at this point. | ||
| 19 | * | ||
| 20 | * I.e. if anytime in the past two locks were taken in a different order, | ||
| 21 | * even if it happened for another task, even if those were different | ||
| 22 | * locks (but of the same class as this lock), this code will detect it. | ||
| 23 | * | ||
| 24 | * Thanks to Arjan van de Ven for coming up with the initial idea of | ||
| 25 | * mapping lock dependencies runtime. | ||
| 26 | */ | ||
| 27 | #include <linux/mutex.h> | ||
| 28 | #include <linux/sched.h> | ||
| 29 | #include <linux/delay.h> | ||
| 30 | #include <linux/module.h> | ||
| 31 | #include <linux/proc_fs.h> | ||
| 32 | #include <linux/seq_file.h> | ||
| 33 | #include <linux/spinlock.h> | ||
| 34 | #include <linux/kallsyms.h> | ||
| 35 | #include <linux/interrupt.h> | ||
| 36 | #include <linux/stacktrace.h> | ||
| 37 | #include <linux/debug_locks.h> | ||
| 38 | #include <linux/irqflags.h> | ||
| 39 | #include <linux/utsname.h> | ||
| 40 | |||
| 41 | #include <asm/sections.h> | ||
| 42 | |||
| 43 | #include "lockdep_internals.h" | ||
| 44 | |||
| 45 | /* | ||
| 46 | * hash_lock: protects the lockdep hashes and class/list/hash allocators. | ||
| 47 | * | ||
| 48 | * This is one of the rare exceptions where it's justified | ||
| 49 | * to use a raw spinlock - we really don't want the spinlock | ||
| 50 | * code to recurse back into the lockdep code. | ||
| 51 | */ | ||
| 52 | static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; | ||
| 53 | |||
| 54 | static int lockdep_initialized; | ||
| 55 | |||
| 56 | unsigned long nr_list_entries; | ||
| 57 | static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; | ||
| 58 | |||
| 59 | /* | ||
| 60 | * Allocate a lockdep entry. (assumes hash_lock held, returns | ||
| 61 | * with NULL on failure) | ||
| 62 | */ | ||
| 63 | static struct lock_list *alloc_list_entry(void) | ||
| 64 | { | ||
| 65 | if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { | ||
| 66 | __raw_spin_unlock(&hash_lock); | ||
| 67 | debug_locks_off(); | ||
| 68 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); | ||
| 69 | printk("turning off the locking correctness validator.\n"); | ||
| 70 | return NULL; | ||
| 71 | } | ||
| 72 | return list_entries + nr_list_entries++; | ||
| 73 | } | ||
| 74 | |||
| 75 | /* | ||
| 76 | * All data structures here are protected by the global debug_lock. | ||
| 77 | * | ||
| 78 | * Mutex key structs only get allocated, once during bootup, and never | ||
| 79 | * get freed - this significantly simplifies the debugging code. | ||
| 80 | */ | ||
| 81 | unsigned long nr_lock_classes; | ||
| 82 | static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; | ||
| 83 | |||
| 84 | /* | ||
| 85 | * We keep a global list of all lock classes. The list only grows, | ||
| 86 | * never shrinks. The list is only accessed with the lockdep | ||
| 87 | * spinlock lock held. | ||
| 88 | */ | ||
| 89 | LIST_HEAD(all_lock_classes); | ||
| 90 | |||
| 91 | /* | ||
| 92 | * The lockdep classes are in a hash-table as well, for fast lookup: | ||
| 93 | */ | ||
| 94 | #define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) | ||
| 95 | #define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) | ||
| 96 | #define CLASSHASH_MASK (CLASSHASH_SIZE - 1) | ||
| 97 | #define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK) | ||
| 98 | #define classhashentry(key) (classhash_table + __classhashfn((key))) | ||
| 99 | |||
| 100 | static struct list_head classhash_table[CLASSHASH_SIZE]; | ||
| 101 | |||
| 102 | unsigned long nr_lock_chains; | ||
| 103 | static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; | ||
| 104 | |||
| 105 | /* | ||
| 106 | * We put the lock dependency chains into a hash-table as well, to cache | ||
| 107 | * their existence: | ||
| 108 | */ | ||
| 109 | #define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) | ||
| 110 | #define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) | ||
| 111 | #define CHAINHASH_MASK (CHAINHASH_SIZE - 1) | ||
| 112 | #define __chainhashfn(chain) \ | ||
| 113 | (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK) | ||
| 114 | #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) | ||
| 115 | |||
| 116 | static struct list_head chainhash_table[CHAINHASH_SIZE]; | ||
| 117 | |||
| 118 | /* | ||
| 119 | * The hash key of the lock dependency chains is a hash itself too: | ||
| 120 | * it's a hash of all locks taken up to that lock, including that lock. | ||
| 121 | * It's a 64-bit hash, because it's important for the keys to be | ||
| 122 | * unique. | ||
| 123 | */ | ||
| 124 | #define iterate_chain_key(key1, key2) \ | ||
| 125 | (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ | ||
| 126 | ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ | ||
| 127 | (key2)) | ||
| 128 | |||
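A minimal stand-alone sketch of this rolling hash, assuming a 64-bit key and using 11 as an illustrative stand-in for MAX_LOCKDEP_KEYS_BITS (the real value is defined in lockdep_internals.h):

        /* Stand-alone illustration of the chain-key folding above. */
        #include <stdint.h>

        #define KEYS_BITS 11    /* assumed stand-in for MAX_LOCKDEP_KEYS_BITS */

        static uint64_t fold_chain_key(uint64_t key, uint64_t class_id)
        {
                return (key << KEYS_BITS) ^ (key >> (64 - KEYS_BITS)) ^ class_id;
        }

        /*
         * Folding class ids 3, 7 in that order yields a different key than
         * folding 7, 3, so differently-ordered lock sequences land in
         * different chains of the chain hash.
         */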
| 129 | void lockdep_off(void) | ||
| 130 | { | ||
| 131 | current->lockdep_recursion++; | ||
| 132 | } | ||
| 133 | |||
| 134 | EXPORT_SYMBOL(lockdep_off); | ||
| 135 | |||
| 136 | void lockdep_on(void) | ||
| 137 | { | ||
| 138 | current->lockdep_recursion--; | ||
| 139 | } | ||
| 140 | |||
| 141 | EXPORT_SYMBOL(lockdep_on); | ||
| 142 | |||
| 143 | int lockdep_internal(void) | ||
| 144 | { | ||
| 145 | return current->lockdep_recursion != 0; | ||
| 146 | } | ||
| 147 | |||
| 148 | EXPORT_SYMBOL(lockdep_internal); | ||
| 149 | |||
| 150 | /* | ||
| 151 | * Debugging switches: | ||
| 152 | */ | ||
| 153 | |||
| 154 | #define VERBOSE 0 | ||
| 155 | #ifdef VERBOSE | ||
| 156 | # define VERY_VERBOSE 0 | ||
| 157 | #endif | ||
| 158 | |||
| 159 | #if VERBOSE | ||
| 160 | # define HARDIRQ_VERBOSE 1 | ||
| 161 | # define SOFTIRQ_VERBOSE 1 | ||
| 162 | #else | ||
| 163 | # define HARDIRQ_VERBOSE 0 | ||
| 164 | # define SOFTIRQ_VERBOSE 0 | ||
| 165 | #endif | ||
| 166 | |||
| 167 | #if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE | ||
| 168 | /* | ||
| 169 | * Quick filtering for interesting events: | ||
| 170 | */ | ||
| 171 | static int class_filter(struct lock_class *class) | ||
| 172 | { | ||
| 173 | #if 0 | ||
| 174 | /* Example */ | ||
| 175 | if (class->name_version == 1 && | ||
| 176 | !strcmp(class->name, "lockname")) | ||
| 177 | return 1; | ||
| 178 | if (class->name_version == 1 && | ||
| 179 | !strcmp(class->name, "&struct->lockfield")) | ||
| 180 | return 1; | ||
| 181 | #endif | ||
| 182 | /* Allow everything else. Returning 0 here would filter everything out. */ | ||
| 183 | return 1; | ||
| 184 | } | ||
| 185 | #endif | ||
| 186 | |||
| 187 | static int verbose(struct lock_class *class) | ||
| 188 | { | ||
| 189 | #if VERBOSE | ||
| 190 | return class_filter(class); | ||
| 191 | #endif | ||
| 192 | return 0; | ||
| 193 | } | ||
| 194 | |||
| 195 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 196 | |||
| 197 | static int hardirq_verbose(struct lock_class *class) | ||
| 198 | { | ||
| 199 | #if HARDIRQ_VERBOSE | ||
| 200 | return class_filter(class); | ||
| 201 | #endif | ||
| 202 | return 0; | ||
| 203 | } | ||
| 204 | |||
| 205 | static int softirq_verbose(struct lock_class *class) | ||
| 206 | { | ||
| 207 | #if SOFTIRQ_VERBOSE | ||
| 208 | return class_filter(class); | ||
| 209 | #endif | ||
| 210 | return 0; | ||
| 211 | } | ||
| 212 | |||
| 213 | #endif | ||
| 214 | |||
| 215 | /* | ||
| 216 | * Stack-trace: tightly packed array of stack backtrace | ||
| 217 | * addresses. Protected by the hash_lock. | ||
| 218 | */ | ||
| 219 | unsigned long nr_stack_trace_entries; | ||
| 220 | static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; | ||
| 221 | |||
| 222 | static int save_trace(struct stack_trace *trace) | ||
| 223 | { | ||
| 224 | trace->nr_entries = 0; | ||
| 225 | trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; | ||
| 226 | trace->entries = stack_trace + nr_stack_trace_entries; | ||
| 227 | |||
| 228 | trace->skip = 3; | ||
| 229 | trace->all_contexts = 0; | ||
| 230 | |||
| 231 | /* Make sure to not recurse in case the unwinder needs | ||
| 232 | to take locks. */ | ||
| 233 | lockdep_off(); | ||
| 234 | save_stack_trace(trace, NULL); | ||
| 235 | lockdep_on(); | ||
| 236 | |||
| 237 | trace->max_entries = trace->nr_entries; | ||
| 238 | |||
| 239 | nr_stack_trace_entries += trace->nr_entries; | ||
| 240 | if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) | ||
| 241 | return 0; | ||
| 242 | |||
| 243 | if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { | ||
| 244 | __raw_spin_unlock(&hash_lock); | ||
| 245 | if (debug_locks_off()) { | ||
| 246 | printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); | ||
| 247 | printk("turning off the locking correctness validator.\n"); | ||
| 248 | dump_stack(); | ||
| 249 | } | ||
| 250 | return 0; | ||
| 251 | } | ||
| 252 | |||
| 253 | return 1; | ||
| 254 | } | ||
| 255 | |||
| 256 | unsigned int nr_hardirq_chains; | ||
| 257 | unsigned int nr_softirq_chains; | ||
| 258 | unsigned int nr_process_chains; | ||
| 259 | unsigned int max_lockdep_depth; | ||
| 260 | unsigned int max_recursion_depth; | ||
| 261 | |||
| 262 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 263 | /* | ||
| 264 | * We cannot printk in early bootup code. Even early_printk() | ||
| 265 | * might not work. So we mark any initialization errors and printk | ||
| 266 | * about it later on, in lockdep_info(). | ||
| 267 | */ | ||
| 268 | static int lockdep_init_error; | ||
| 269 | |||
| 270 | /* | ||
| 271 | * Various lockdep statistics: | ||
| 272 | */ | ||
| 273 | atomic_t chain_lookup_hits; | ||
| 274 | atomic_t chain_lookup_misses; | ||
| 275 | atomic_t hardirqs_on_events; | ||
| 276 | atomic_t hardirqs_off_events; | ||
| 277 | atomic_t redundant_hardirqs_on; | ||
| 278 | atomic_t redundant_hardirqs_off; | ||
| 279 | atomic_t softirqs_on_events; | ||
| 280 | atomic_t softirqs_off_events; | ||
| 281 | atomic_t redundant_softirqs_on; | ||
| 282 | atomic_t redundant_softirqs_off; | ||
| 283 | atomic_t nr_unused_locks; | ||
| 284 | atomic_t nr_cyclic_checks; | ||
| 285 | atomic_t nr_cyclic_check_recursions; | ||
| 286 | atomic_t nr_find_usage_forwards_checks; | ||
| 287 | atomic_t nr_find_usage_forwards_recursions; | ||
| 288 | atomic_t nr_find_usage_backwards_checks; | ||
| 289 | atomic_t nr_find_usage_backwards_recursions; | ||
| 290 | # define debug_atomic_inc(ptr) atomic_inc(ptr) | ||
| 291 | # define debug_atomic_dec(ptr) atomic_dec(ptr) | ||
| 292 | # define debug_atomic_read(ptr) atomic_read(ptr) | ||
| 293 | #else | ||
| 294 | # define debug_atomic_inc(ptr) do { } while (0) | ||
| 295 | # define debug_atomic_dec(ptr) do { } while (0) | ||
| 296 | # define debug_atomic_read(ptr) 0 | ||
| 297 | #endif | ||
| 298 | |||
| 299 | /* | ||
| 300 | * Locking printouts: | ||
| 301 | */ | ||
| 302 | |||
| 303 | static const char *usage_str[] = | ||
| 304 | { | ||
| 305 | [LOCK_USED] = "initial-use ", | ||
| 306 | [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", | ||
| 307 | [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", | ||
| 308 | [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", | ||
| 309 | [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W", | ||
| 310 | [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R", | ||
| 311 | [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R", | ||
| 312 | [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R", | ||
| 313 | [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R", | ||
| 314 | }; | ||
| 315 | |||
| 316 | const char * __get_key_name(struct lockdep_subclass_key *key, char *str) | ||
| 317 | { | ||
| 318 | unsigned long offs, size; | ||
| 319 | char *modname; | ||
| 320 | |||
| 321 | return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str); | ||
| 322 | } | ||
| 323 | |||
| 324 | void | ||
| 325 | get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4) | ||
| 326 | { | ||
| 327 | *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; | ||
| 328 | |||
| 329 | if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) | ||
| 330 | *c1 = '+'; | ||
| 331 | else | ||
| 332 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) | ||
| 333 | *c1 = '-'; | ||
| 334 | |||
| 335 | if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) | ||
| 336 | *c2 = '+'; | ||
| 337 | else | ||
| 338 | if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) | ||
| 339 | *c2 = '-'; | ||
| 340 | |||
| 341 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) | ||
| 342 | *c3 = '-'; | ||
| 343 | if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { | ||
| 344 | *c3 = '+'; | ||
| 345 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) | ||
| 346 | *c3 = '?'; | ||
| 347 | } | ||
| 348 | |||
| 349 | if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) | ||
| 350 | *c4 = '-'; | ||
| 351 | if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { | ||
| 352 | *c4 = '+'; | ||
| 353 | if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) | ||
| 354 | *c4 = '?'; | ||
| 355 | } | ||
| 356 | } | ||
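The four characters produced here form the "{....}" annotation seen in lockdep reports: roughly, '+' means the lock was used in that interrupt context (irq-safe), '-' means it was held while that interrupt type was enabled (irq-unsafe), '?' in the read columns means both were observed, and '.' means neither. A minimal usage sketch (hypothetical helper; the real caller is print_lock_name() just below):

        static void show_usage(struct lock_class *class)
        {
                char c1, c2, c3, c4;

                get_usage_chars(class, &c1, &c2, &c3, &c4);
                printk("{%c%c%c%c}\n", c1, c2, c3, c4);
        }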
| 357 | |||
| 358 | static void print_lock_name(struct lock_class *class) | ||
| 359 | { | ||
| 360 | char str[128], c1, c2, c3, c4; | ||
| 361 | const char *name; | ||
| 362 | |||
| 363 | get_usage_chars(class, &c1, &c2, &c3, &c4); | ||
| 364 | |||
| 365 | name = class->name; | ||
| 366 | if (!name) { | ||
| 367 | name = __get_key_name(class->key, str); | ||
| 368 | printk(" (%s", name); | ||
| 369 | } else { | ||
| 370 | printk(" (%s", name); | ||
| 371 | if (class->name_version > 1) | ||
| 372 | printk("#%d", class->name_version); | ||
| 373 | if (class->subclass) | ||
| 374 | printk("/%d", class->subclass); | ||
| 375 | } | ||
| 376 | printk("){%c%c%c%c}", c1, c2, c3, c4); | ||
| 377 | } | ||
| 378 | |||
| 379 | static void print_lockdep_cache(struct lockdep_map *lock) | ||
| 380 | { | ||
| 381 | const char *name; | ||
| 382 | char str[128]; | ||
| 383 | |||
| 384 | name = lock->name; | ||
| 385 | if (!name) | ||
| 386 | name = __get_key_name(lock->key->subkeys, str); | ||
| 387 | |||
| 388 | printk("%s", name); | ||
| 389 | } | ||
| 390 | |||
| 391 | static void print_lock(struct held_lock *hlock) | ||
| 392 | { | ||
| 393 | print_lock_name(hlock->class); | ||
| 394 | printk(", at: "); | ||
| 395 | print_ip_sym(hlock->acquire_ip); | ||
| 396 | } | ||
| 397 | |||
| 398 | static void lockdep_print_held_locks(struct task_struct *curr) | ||
| 399 | { | ||
| 400 | int i, depth = curr->lockdep_depth; | ||
| 401 | |||
| 402 | if (!depth) { | ||
| 403 | printk("no locks held by %s/%d.\n", curr->comm, curr->pid); | ||
| 404 | return; | ||
| 405 | } | ||
| 406 | printk("%d lock%s held by %s/%d:\n", | ||
| 407 | depth, depth > 1 ? "s" : "", curr->comm, curr->pid); | ||
| 408 | |||
| 409 | for (i = 0; i < depth; i++) { | ||
| 410 | printk(" #%d: ", i); | ||
| 411 | print_lock(curr->held_locks + i); | ||
| 412 | } | ||
| 413 | } | ||
| 414 | |||
| 415 | static void print_lock_class_header(struct lock_class *class, int depth) | ||
| 416 | { | ||
| 417 | int bit; | ||
| 418 | |||
| 419 | printk("%*s->", depth, ""); | ||
| 420 | print_lock_name(class); | ||
| 421 | printk(" ops: %lu", class->ops); | ||
| 422 | printk(" {\n"); | ||
| 423 | |||
| 424 | for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { | ||
| 425 | if (class->usage_mask & (1 << bit)) { | ||
| 426 | int len = depth; | ||
| 427 | |||
| 428 | len += printk("%*s %s", depth, "", usage_str[bit]); | ||
| 429 | len += printk(" at:\n"); | ||
| 430 | print_stack_trace(class->usage_traces + bit, len); | ||
| 431 | } | ||
| 432 | } | ||
| 433 | printk("%*s }\n", depth, ""); | ||
| 434 | |||
| 435 | printk("%*s ... key at: ",depth,""); | ||
| 436 | print_ip_sym((unsigned long)class->key); | ||
| 437 | } | ||
| 438 | |||
| 439 | /* | ||
| 440 | * printk all lock dependencies starting at <entry>: | ||
| 441 | */ | ||
| 442 | static void print_lock_dependencies(struct lock_class *class, int depth) | ||
| 443 | { | ||
| 444 | struct lock_list *entry; | ||
| 445 | |||
| 446 | if (DEBUG_LOCKS_WARN_ON(depth >= 20)) | ||
| 447 | return; | ||
| 448 | |||
| 449 | print_lock_class_header(class, depth); | ||
| 450 | |||
| 451 | list_for_each_entry(entry, &class->locks_after, entry) { | ||
| 452 | DEBUG_LOCKS_WARN_ON(!entry->class); | ||
| 453 | print_lock_dependencies(entry->class, depth + 1); | ||
| 454 | |||
| 455 | printk("%*s ... acquired at:\n",depth,""); | ||
| 456 | print_stack_trace(&entry->trace, 2); | ||
| 457 | printk("\n"); | ||
| 458 | } | ||
| 459 | } | ||
| 460 | |||
| 461 | /* | ||
| 462 | * Add a new dependency to the head of the list: | ||
| 463 | */ | ||
| 464 | static int add_lock_to_list(struct lock_class *class, struct lock_class *this, | ||
| 465 | struct list_head *head, unsigned long ip) | ||
| 466 | { | ||
| 467 | struct lock_list *entry; | ||
| 468 | /* | ||
| 469 | * Lock not present yet - get a new dependency struct and | ||
| 470 | * add it to the list: | ||
| 471 | */ | ||
| 472 | entry = alloc_list_entry(); | ||
| 473 | if (!entry) | ||
| 474 | return 0; | ||
| 475 | |||
| 476 | entry->class = this; | ||
| 477 | save_trace(&entry->trace); | ||
| 478 | |||
| 479 | /* | ||
| 480 | * Since we never remove from the dependency list, the list can | ||
| 481 | * be walked lockless by other CPUs, it's only allocation | ||
| 482 | * that must be protected by the spinlock. But this also means | ||
| 483 | * we must make new entries visible only once writes to the | ||
| 484 | * entry become visible - hence the RCU op: | ||
| 485 | */ | ||
| 486 | list_add_tail_rcu(&entry->entry, head); | ||
| 487 | |||
| 488 | return 1; | ||
| 489 | } | ||
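The append-under-lock / walk-lockless arrangement described in the comment above is relied on throughout the graph checks below; a hedged sketch of a lockless reader for such a list (the helper name is hypothetical):

        /* Illustrative lockless walker for the pattern above (hypothetical). */
        static int class_depends_on(struct lock_class *class,
                                    struct lock_class *target)
        {
                struct lock_list *entry;

                /* No hash_lock needed: entries are only ever appended. */
                list_for_each_entry(entry, &class->locks_after, entry)
                        if (entry->class == target)
                                return 1;
                return 0;
        }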
| 490 | |||
| 491 | /* | ||
| 492 | * Recursive, forwards-direction lock-dependency checking, used for | ||
| 493 | * both noncyclic checking and for hardirq-unsafe/softirq-unsafe | ||
| 494 | * checking. | ||
| 495 | * | ||
| 496 | * (to keep the stackframe of the recursive functions small we | ||
| 497 | * use these global variables, and we also mark various helper | ||
| 498 | * functions as noinline.) | ||
| 499 | */ | ||
| 500 | static struct held_lock *check_source, *check_target; | ||
| 501 | |||
| 502 | /* | ||
| 503 | * Print a dependency chain entry (this is only done when a deadlock | ||
| 504 | * has been detected): | ||
| 505 | */ | ||
| 506 | static noinline int | ||
| 507 | print_circular_bug_entry(struct lock_list *target, unsigned int depth) | ||
| 508 | { | ||
| 509 | if (debug_locks_silent) | ||
| 510 | return 0; | ||
| 511 | printk("\n-> #%u", depth); | ||
| 512 | print_lock_name(target->class); | ||
| 513 | printk(":\n"); | ||
| 514 | print_stack_trace(&target->trace, 6); | ||
| 515 | |||
| 516 | return 0; | ||
| 517 | } | ||
| 518 | |||
| 519 | static void print_kernel_version(void) | ||
| 520 | { | ||
| 521 | printk("%s %.*s\n", system_utsname.release, | ||
| 522 | (int)strcspn(system_utsname.version, " "), | ||
| 523 | system_utsname.version); | ||
| 524 | } | ||
| 525 | |||
| 526 | /* | ||
| 527 | * When a circular dependency is detected, print the | ||
| 528 | * header first: | ||
| 529 | */ | ||
| 530 | static noinline int | ||
| 531 | print_circular_bug_header(struct lock_list *entry, unsigned int depth) | ||
| 532 | { | ||
| 533 | struct task_struct *curr = current; | ||
| 534 | |||
| 535 | __raw_spin_unlock(&hash_lock); | ||
| 536 | debug_locks_off(); | ||
| 537 | if (debug_locks_silent) | ||
| 538 | return 0; | ||
| 539 | |||
| 540 | printk("\n=======================================================\n"); | ||
| 541 | printk( "[ INFO: possible circular locking dependency detected ]\n"); | ||
| 542 | print_kernel_version(); | ||
| 543 | printk( "-------------------------------------------------------\n"); | ||
| 544 | printk("%s/%d is trying to acquire lock:\n", | ||
| 545 | curr->comm, curr->pid); | ||
| 546 | print_lock(check_source); | ||
| 547 | printk("\nbut task is already holding lock:\n"); | ||
| 548 | print_lock(check_target); | ||
| 549 | printk("\nwhich lock already depends on the new lock.\n\n"); | ||
| 550 | printk("\nthe existing dependency chain (in reverse order) is:\n"); | ||
| 551 | |||
| 552 | print_circular_bug_entry(entry, depth); | ||
| 553 | |||
| 554 | return 0; | ||
| 555 | } | ||
| 556 | |||
| 557 | static noinline int print_circular_bug_tail(void) | ||
| 558 | { | ||
| 559 | struct task_struct *curr = current; | ||
| 560 | struct lock_list this; | ||
| 561 | |||
| 562 | if (debug_locks_silent) | ||
| 563 | return 0; | ||
| 564 | |||
| 565 | this.class = check_source->class; | ||
| 566 | save_trace(&this.trace); | ||
| 567 | print_circular_bug_entry(&this, 0); | ||
| 568 | |||
| 569 | printk("\nother info that might help us debug this:\n\n"); | ||
| 570 | lockdep_print_held_locks(curr); | ||
| 571 | |||
| 572 | printk("\nstack backtrace:\n"); | ||
| 573 | dump_stack(); | ||
| 574 | |||
| 575 | return 0; | ||
| 576 | } | ||
| 577 | |||
| 578 | static int noinline print_infinite_recursion_bug(void) | ||
| 579 | { | ||
| 580 | __raw_spin_unlock(&hash_lock); | ||
| 581 | DEBUG_LOCKS_WARN_ON(1); | ||
| 582 | |||
| 583 | return 0; | ||
| 584 | } | ||
| 585 | |||
| 586 | /* | ||
| 587 | * Prove that the dependency graph starting at <entry> can not | ||
| 588 | * lead to <target>. Print an error and return 0 if it does. | ||
| 589 | */ | ||
| 590 | static noinline int | ||
| 591 | check_noncircular(struct lock_class *source, unsigned int depth) | ||
| 592 | { | ||
| 593 | struct lock_list *entry; | ||
| 594 | |||
| 595 | debug_atomic_inc(&nr_cyclic_check_recursions); | ||
| 596 | if (depth > max_recursion_depth) | ||
| 597 | max_recursion_depth = depth; | ||
| 598 | if (depth >= 20) | ||
| 599 | return print_infinite_recursion_bug(); | ||
| 600 | /* | ||
| 601 | * Check this lock's dependency list: | ||
| 602 | */ | ||
| 603 | list_for_each_entry(entry, &source->locks_after, entry) { | ||
| 604 | if (entry->class == check_target->class) | ||
| 605 | return print_circular_bug_header(entry, depth+1); | ||
| 606 | debug_atomic_inc(&nr_cyclic_checks); | ||
| 607 | if (!check_noncircular(entry->class, depth+1)) | ||
| 608 | return print_circular_bug_entry(entry, depth+1); | ||
| 609 | } | ||
| 610 | return 1; | ||
| 611 | } | ||
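The same depth-capped depth-first search can be sketched stand-alone on a tiny adjacency matrix; this is an illustration of the algorithm only, not kernel code:

        /* Stand-alone sketch of the cycle check on a toy dependency graph. */
        #include <stdio.h>

        #define MAX_NODES 3

        static const int edge[MAX_NODES][MAX_NODES] = {
                [0] = { [1] = 1 },      /* lock 0 -> lock 1 */
                [1] = { [2] = 1 },      /* lock 1 -> lock 2 */
        };

        static int reaches(int from, int target, int depth)
        {
                int i;

                if (depth >= 20)        /* same recursion cap as above */
                        return 0;
                if (from == target)
                        return 1;
                for (i = 0; i < MAX_NODES; i++)
                        if (edge[from][i] && reaches(i, target, depth + 1))
                                return 1;
                return 0;
        }

        int main(void)
        {
                /*
                 * Before recording a new edge 2 -> 0 we ask whether 0 already
                 * reaches 2; it does (0 -> 1 -> 2), so the new edge would close
                 * a cycle and is reported instead of being added.
                 */
                printf("would create a cycle: %d\n", reaches(0, 2, 0));
                return 0;
        }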
| 612 | |||
| 613 | static int very_verbose(struct lock_class *class) | ||
| 614 | { | ||
| 615 | #if VERY_VERBOSE | ||
| 616 | return class_filter(class); | ||
| 617 | #endif | ||
| 618 | return 0; | ||
| 619 | } | ||
| 620 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 621 | |||
| 622 | /* | ||
| 623 | * Forwards and backwards subgraph searching, for the purposes of | ||
| 624 | * proving that two subgraphs can be connected by a new dependency | ||
| 625 | * without creating any illegal irq-safe -> irq-unsafe lock dependency. | ||
| 626 | */ | ||
| 627 | static enum lock_usage_bit find_usage_bit; | ||
| 628 | static struct lock_class *forwards_match, *backwards_match; | ||
| 629 | |||
| 630 | /* | ||
| 631 | * Find a node in the forwards-direction dependency sub-graph starting | ||
| 632 | * at <source> that matches <find_usage_bit>. | ||
| 633 | * | ||
| 634 | * Return 2 if such a node exists in the subgraph, and put that node | ||
| 635 | * into <forwards_match>. | ||
| 636 | * | ||
| 637 | * Return 1 otherwise and keep <forwards_match> unchanged. | ||
| 638 | * Return 0 on error. | ||
| 639 | */ | ||
| 640 | static noinline int | ||
| 641 | find_usage_forwards(struct lock_class *source, unsigned int depth) | ||
| 642 | { | ||
| 643 | struct lock_list *entry; | ||
| 644 | int ret; | ||
| 645 | |||
| 646 | if (depth > max_recursion_depth) | ||
| 647 | max_recursion_depth = depth; | ||
| 648 | if (depth >= 20) | ||
| 649 | return print_infinite_recursion_bug(); | ||
| 650 | |||
| 651 | debug_atomic_inc(&nr_find_usage_forwards_checks); | ||
| 652 | if (source->usage_mask & (1 << find_usage_bit)) { | ||
| 653 | forwards_match = source; | ||
| 654 | return 2; | ||
| 655 | } | ||
| 656 | |||
| 657 | /* | ||
| 658 | * Check this lock's dependency list: | ||
| 659 | */ | ||
| 660 | list_for_each_entry(entry, &source->locks_after, entry) { | ||
| 661 | debug_atomic_inc(&nr_find_usage_forwards_recursions); | ||
| 662 | ret = find_usage_forwards(entry->class, depth+1); | ||
| 663 | if (ret == 2 || ret == 0) | ||
| 664 | return ret; | ||
| 665 | } | ||
| 666 | return 1; | ||
| 667 | } | ||
| 668 | |||
| 669 | /* | ||
| 670 | * Find a node in the backwards-direction dependency sub-graph starting | ||
| 671 | * at <source> that matches <find_usage_bit>. | ||
| 672 | * | ||
| 673 | * Return 2 if such a node exists in the subgraph, and put that node | ||
| 674 | * into <backwards_match>. | ||
| 675 | * | ||
| 676 | * Return 1 otherwise and keep <backwards_match> unchanged. | ||
| 677 | * Return 0 on error. | ||
| 678 | */ | ||
| 679 | static noinline int | ||
| 680 | find_usage_backwards(struct lock_class *source, unsigned int depth) | ||
| 681 | { | ||
| 682 | struct lock_list *entry; | ||
| 683 | int ret; | ||
| 684 | |||
| 685 | if (depth > max_recursion_depth) | ||
| 686 | max_recursion_depth = depth; | ||
| 687 | if (depth >= 20) | ||
| 688 | return print_infinite_recursion_bug(); | ||
| 689 | |||
| 690 | debug_atomic_inc(&nr_find_usage_backwards_checks); | ||
| 691 | if (source->usage_mask & (1 << find_usage_bit)) { | ||
| 692 | backwards_match = source; | ||
| 693 | return 2; | ||
| 694 | } | ||
| 695 | |||
| 696 | /* | ||
| 697 | * Check this lock's dependency list: | ||
| 698 | */ | ||
| 699 | list_for_each_entry(entry, &source->locks_before, entry) { | ||
| 700 | debug_atomic_inc(&nr_find_usage_backwards_recursions); | ||
| 701 | ret = find_usage_backwards(entry->class, depth+1); | ||
| 702 | if (ret == 2 || ret == 0) | ||
| 703 | return ret; | ||
| 704 | } | ||
| 705 | return 1; | ||
| 706 | } | ||
| 707 | |||
| 708 | static int | ||
| 709 | print_bad_irq_dependency(struct task_struct *curr, | ||
| 710 | struct held_lock *prev, | ||
| 711 | struct held_lock *next, | ||
| 712 | enum lock_usage_bit bit1, | ||
| 713 | enum lock_usage_bit bit2, | ||
| 714 | const char *irqclass) | ||
| 715 | { | ||
| 716 | __raw_spin_unlock(&hash_lock); | ||
| 717 | debug_locks_off(); | ||
| 718 | if (debug_locks_silent) | ||
| 719 | return 0; | ||
| 720 | |||
| 721 | printk("\n======================================================\n"); | ||
| 722 | printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | ||
| 723 | irqclass, irqclass); | ||
| 724 | print_kernel_version(); | ||
| 725 | printk( "------------------------------------------------------\n"); | ||
| 726 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", | ||
| 727 | curr->comm, curr->pid, | ||
| 728 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, | ||
| 729 | curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, | ||
| 730 | curr->hardirqs_enabled, | ||
| 731 | curr->softirqs_enabled); | ||
| 732 | print_lock(next); | ||
| 733 | |||
| 734 | printk("\nand this task is already holding:\n"); | ||
| 735 | print_lock(prev); | ||
| 736 | printk("which would create a new lock dependency:\n"); | ||
| 737 | print_lock_name(prev->class); | ||
| 738 | printk(" ->"); | ||
| 739 | print_lock_name(next->class); | ||
| 740 | printk("\n"); | ||
| 741 | |||
| 742 | printk("\nbut this new dependency connects a %s-irq-safe lock:\n", | ||
| 743 | irqclass); | ||
| 744 | print_lock_name(backwards_match); | ||
| 745 | printk("\n... which became %s-irq-safe at:\n", irqclass); | ||
| 746 | |||
| 747 | print_stack_trace(backwards_match->usage_traces + bit1, 1); | ||
| 748 | |||
| 749 | printk("\nto a %s-irq-unsafe lock:\n", irqclass); | ||
| 750 | print_lock_name(forwards_match); | ||
| 751 | printk("\n... which became %s-irq-unsafe at:\n", irqclass); | ||
| 752 | printk("..."); | ||
| 753 | |||
| 754 | print_stack_trace(forwards_match->usage_traces + bit2, 1); | ||
| 755 | |||
| 756 | printk("\nother info that might help us debug this:\n\n"); | ||
| 757 | lockdep_print_held_locks(curr); | ||
| 758 | |||
| 759 | printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); | ||
| 760 | print_lock_dependencies(backwards_match, 0); | ||
| 761 | |||
| 762 | printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); | ||
| 763 | print_lock_dependencies(forwards_match, 0); | ||
| 764 | |||
| 765 | printk("\nstack backtrace:\n"); | ||
| 766 | dump_stack(); | ||
| 767 | |||
| 768 | return 0; | ||
| 769 | } | ||
| 770 | |||
| 771 | static int | ||
| 772 | check_usage(struct task_struct *curr, struct held_lock *prev, | ||
| 773 | struct held_lock *next, enum lock_usage_bit bit_backwards, | ||
| 774 | enum lock_usage_bit bit_forwards, const char *irqclass) | ||
| 775 | { | ||
| 776 | int ret; | ||
| 777 | |||
| 778 | find_usage_bit = bit_backwards; | ||
| 779 | /* fills in <backwards_match> */ | ||
| 780 | ret = find_usage_backwards(prev->class, 0); | ||
| 781 | if (!ret || ret == 1) | ||
| 782 | return ret; | ||
| 783 | |||
| 784 | find_usage_bit = bit_forwards; | ||
| 785 | ret = find_usage_forwards(next->class, 0); | ||
| 786 | if (!ret || ret == 1) | ||
| 787 | return ret; | ||
| 788 | /* ret == 2 */ | ||
| 789 | return print_bad_irq_dependency(curr, prev, next, | ||
| 790 | bit_backwards, bit_forwards, irqclass); | ||
| 791 | } | ||
| 792 | |||
| 793 | #endif | ||
| 794 | |||
| 795 | static int | ||
| 796 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | ||
| 797 | struct held_lock *next) | ||
| 798 | { | ||
| 799 | debug_locks_off(); | ||
| 800 | __raw_spin_unlock(&hash_lock); | ||
| 801 | if (debug_locks_silent) | ||
| 802 | return 0; | ||
| 803 | |||
| 804 | printk("\n=============================================\n"); | ||
| 805 | printk( "[ INFO: possible recursive locking detected ]\n"); | ||
| 806 | print_kernel_version(); | ||
| 807 | printk( "---------------------------------------------\n"); | ||
| 808 | printk("%s/%d is trying to acquire lock:\n", | ||
| 809 | curr->comm, curr->pid); | ||
| 810 | print_lock(next); | ||
| 811 | printk("\nbut task is already holding lock:\n"); | ||
| 812 | print_lock(prev); | ||
| 813 | |||
| 814 | printk("\nother info that might help us debug this:\n"); | ||
| 815 | lockdep_print_held_locks(curr); | ||
| 816 | |||
| 817 | printk("\nstack backtrace:\n"); | ||
| 818 | dump_stack(); | ||
| 819 | |||
| 820 | return 0; | ||
| 821 | } | ||
| 822 | |||
| 823 | /* | ||
| 824 | * Check whether we are holding such a class already. | ||
| 825 | * | ||
| 826 | * (Note that this has to be done separately, because the graph cannot | ||
| 827 | * detect such classes of deadlocks.) | ||
| 828 | * | ||
| 829 | * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read | ||
| 830 | */ | ||
| 831 | static int | ||
| 832 | check_deadlock(struct task_struct *curr, struct held_lock *next, | ||
| 833 | struct lockdep_map *next_instance, int read) | ||
| 834 | { | ||
| 835 | struct held_lock *prev; | ||
| 836 | int i; | ||
| 837 | |||
| 838 | for (i = 0; i < curr->lockdep_depth; i++) { | ||
| 839 | prev = curr->held_locks + i; | ||
| 840 | if (prev->class != next->class) | ||
| 841 | continue; | ||
| 842 | /* | ||
| 843 | * Allow read-after-read recursion of the same | ||
| 844 | * lock class (i.e. read_lock(lock)+read_lock(lock)): | ||
| 845 | */ | ||
| 846 | if ((read == 2) && prev->read) | ||
| 847 | return 2; | ||
| 848 | return print_deadlock_bug(curr, prev, next); | ||
| 849 | } | ||
| 850 | return 1; | ||
| 851 | } | ||
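A hypothetical illustration of what this same-class check permits and what it reports (the rwlock read side is the recursive-read case, read == 2):

        /* Hypothetical examples; names are illustrative only. */
        static DEFINE_RWLOCK(my_rwlock);
        static DEFINE_SPINLOCK(my_lock);

        static void allowed_recursive_read(void)
        {
                read_lock(&my_rwlock);
                read_lock(&my_rwlock);          /* read-after-read: allowed */
                read_unlock(&my_rwlock);
                read_unlock(&my_rwlock);
        }

        static void reported_self_deadlock(void)
        {
                spin_lock(&my_lock);
                spin_lock(&my_lock);            /* same class held twice: reported */
                spin_unlock(&my_lock);
                spin_unlock(&my_lock);
        }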
| 852 | |||
| 853 | /* | ||
| 854 | * There was a chain-cache miss, and we are about to add a new dependency | ||
| 855 | * to a previous lock. We recursively validate the following rules: | ||
| 856 | * | ||
| 857 | * - would the adding of the <prev> -> <next> dependency create a | ||
| 858 | * circular dependency in the graph? [== circular deadlock] | ||
| 859 | * | ||
| 860 | * - does the new prev->next dependency connect any hardirq-safe lock | ||
| 861 | * (in the full backwards-subgraph starting at <prev>) with any | ||
| 862 | * hardirq-unsafe lock (in the full forwards-subgraph starting at | ||
| 863 | * <next>)? [== illegal lock inversion with hardirq contexts] | ||
| 864 | * | ||
| 865 | * - does the new prev->next dependency connect any softirq-safe lock | ||
| 866 | * (in the full backwards-subgraph starting at <prev>) with any | ||
| 867 | * softirq-unsafe lock (in the full forwards-subgraph starting at | ||
| 868 | * <next>)? [== illegal lock inversion with softirq contexts] | ||
| 869 | * | ||
| 870 | * any of these scenarios could lead to a deadlock. | ||
| 871 | * | ||
| 872 | * Then if all the validations pass, we add the forwards and backwards | ||
| 873 | * dependency. | ||
| 874 | */ | ||
| 875 | static int | ||
| 876 | check_prev_add(struct task_struct *curr, struct held_lock *prev, | ||
| 877 | struct held_lock *next) | ||
| 878 | { | ||
| 879 | struct lock_list *entry; | ||
| 880 | int ret; | ||
| 881 | |||
| 882 | /* | ||
| 883 | * Prove that the new <prev> -> <next> dependency would not | ||
| 884 | * create a circular dependency in the graph. (We do this by | ||
| 885 | * forward-recursing into the graph starting at <next>, and | ||
| 886 | * checking whether we can reach <prev>.) | ||
| 887 | * | ||
| 888 | * We are using global variables to control the recursion, to | ||
| 889 | * keep the stackframe size of the recursive functions low: | ||
| 890 | */ | ||
| 891 | check_source = next; | ||
| 892 | check_target = prev; | ||
| 893 | if (!(check_noncircular(next->class, 0))) | ||
| 894 | return print_circular_bug_tail(); | ||
| 895 | |||
| 896 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 897 | /* | ||
| 898 | * Prove that the new dependency does not connect a hardirq-safe | ||
| 899 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
| 900 | * the backwards-subgraph starting at <prev>, and the | ||
| 901 | * forwards-subgraph starting at <next>: | ||
| 902 | */ | ||
| 903 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, | ||
| 904 | LOCK_ENABLED_HARDIRQS, "hard")) | ||
| 905 | return 0; | ||
| 906 | |||
| 907 | /* | ||
| 908 | * Prove that the new dependency does not connect a hardirq-safe-read | ||
| 909 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
| 910 | * the backwards-subgraph starting at <prev>, and the | ||
| 911 | * forwards-subgraph starting at <next>: | ||
| 912 | */ | ||
| 913 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, | ||
| 914 | LOCK_ENABLED_HARDIRQS, "hard-read")) | ||
| 915 | return 0; | ||
| 916 | |||
| 917 | /* | ||
| 918 | * Prove that the new dependency does not connect a softirq-safe | ||
| 919 | * lock with a softirq-unsafe lock - to achieve this we search | ||
| 920 | * the backwards-subgraph starting at <prev>, and the | ||
| 921 | * forwards-subgraph starting at <next>: | ||
| 922 | */ | ||
| 923 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, | ||
| 924 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
| 925 | return 0; | ||
| 926 | /* | ||
| 927 | * Prove that the new dependency does not connect a softirq-safe-read | ||
| 928 | * lock with a softirq-unsafe lock - to achieve this we search | ||
| 929 | * the backwards-subgraph starting at <prev>, and the | ||
| 930 | * forwards-subgraph starting at <next>: | ||
| 931 | */ | ||
| 932 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, | ||
| 933 | LOCK_ENABLED_SOFTIRQS, "soft-read")) | ||
| 934 | return 0; | ||
| 935 | #endif | ||
| 936 | /* | ||
| 937 | * For recursive read-locks we do all the dependency checks, | ||
| 938 | * but we don't store read-triggered dependencies (only | ||
| 939 | * write-triggered dependencies). This ensures that only the | ||
| 940 | * write-side dependencies matter, and that if for example a | ||
| 941 | * write-lock never takes any other locks, then the reads are | ||
| 942 | * equivalent to a NOP. | ||
| 943 | */ | ||
| 944 | if (next->read == 2 || prev->read == 2) | ||
| 945 | return 1; | ||
| 946 | /* | ||
| 947 | * Is the <prev> -> <next> dependency already present? | ||
| 948 | * | ||
| 949 | * (this may occur even though this is a new chain: consider | ||
| 950 | * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3 | ||
| 951 | * chains - the second one will be new, but L1 already has | ||
| 952 | * L2 added to its dependency list, due to the first chain.) | ||
| 953 | */ | ||
| 954 | list_for_each_entry(entry, &prev->class->locks_after, entry) { | ||
| 955 | if (entry->class == next->class) | ||
| 956 | return 2; | ||
| 957 | } | ||
| 958 | |||
| 959 | /* | ||
| 960 | * Ok, all validations passed, add the new lock | ||
| 961 | * to the previous lock's dependency list: | ||
| 962 | */ | ||
| 963 | ret = add_lock_to_list(prev->class, next->class, | ||
| 964 | &prev->class->locks_after, next->acquire_ip); | ||
| 965 | if (!ret) | ||
| 966 | return 0; | ||
| 967 | /* | ||
| 968 | * Return value of 2 signals 'dependency already added', | ||
| 969 | * in that case we don't have to add the backlink either. | ||
| 970 | */ | ||
| 971 | if (ret == 2) | ||
| 972 | return 2; | ||
| 973 | ret = add_lock_to_list(next->class, prev->class, | ||
| 974 | &next->class->locks_before, next->acquire_ip); | ||
| 975 | |||
| 976 | /* | ||
| 977 | * Debugging printouts: | ||
| 978 | */ | ||
| 979 | if (verbose(prev->class) || verbose(next->class)) { | ||
| 980 | __raw_spin_unlock(&hash_lock); | ||
| 981 | printk("\n new dependency: "); | ||
| 982 | print_lock_name(prev->class); | ||
| 983 | printk(" => "); | ||
| 984 | print_lock_name(next->class); | ||
| 985 | printk("\n"); | ||
| 986 | dump_stack(); | ||
| 987 | __raw_spin_lock(&hash_lock); | ||
| 988 | } | ||
| 989 | return 1; | ||
| 990 | } | ||
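A hypothetical pattern that the hardirq part of these checks rejects: one lock becomes hardirq-safe by being taken from interrupt context, another is hardirq-unsafe because it is taken with interrupts enabled, and a new dependency then links the two:

        /* Hypothetical illustration of an irq-safe -> irq-unsafe dependency. */
        static DEFINE_SPINLOCK(irq_lock);       /* taken from hardirq context */
        static DEFINE_SPINLOCK(plain_lock);     /* taken with hardirqs enabled */

        static void in_hardirq_context(void)    /* e.g. called by an irq handler */
        {
                spin_lock(&irq_lock);           /* marks irq_lock hardirq-safe */
                spin_unlock(&irq_lock);
        }

        static void in_process_context(void)
        {
                spin_lock(&plain_lock);         /* plain_lock becomes hardirq-unsafe */
                spin_unlock(&plain_lock);

                spin_lock_irq(&irq_lock);
                spin_lock(&plain_lock);         /* hardirq-safe -> hardirq-unsafe:
                                                 * reported by check_usage() */
                spin_unlock(&plain_lock);
                spin_unlock_irq(&irq_lock);
        }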
| 991 | |||
| 992 | /* | ||
| 993 | * Add the dependency to all directly-previous locks that are 'relevant'. | ||
| 994 | * The ones that are relevant are (in increasing distance from curr): | ||
| 995 | * all consecutive trylock entries and the final non-trylock entry - or | ||
| 996 | * the end of this context's lock-chain - whichever comes first. | ||
| 997 | */ | ||
| 998 | static int | ||
| 999 | check_prevs_add(struct task_struct *curr, struct held_lock *next) | ||
| 1000 | { | ||
| 1001 | int depth = curr->lockdep_depth; | ||
| 1002 | struct held_lock *hlock; | ||
| 1003 | |||
| 1004 | /* | ||
| 1005 | * Debugging checks. | ||
| 1006 | * | ||
| 1007 | * Depth must not be zero for a non-head lock: | ||
| 1008 | */ | ||
| 1009 | if (!depth) | ||
| 1010 | goto out_bug; | ||
| 1011 | /* | ||
| 1012 | * At least two relevant locks must exist for this | ||
| 1013 | * to be a head: | ||
| 1014 | */ | ||
| 1015 | if (curr->held_locks[depth].irq_context != | ||
| 1016 | curr->held_locks[depth-1].irq_context) | ||
| 1017 | goto out_bug; | ||
| 1018 | |||
| 1019 | for (;;) { | ||
| 1020 | hlock = curr->held_locks + depth-1; | ||
| 1021 | /* | ||
| 1022 | * Only non-recursive-read entries get new dependencies | ||
| 1023 | * added: | ||
| 1024 | */ | ||
| 1025 | if (hlock->read != 2) { | ||
| 1026 | check_prev_add(curr, hlock, next); | ||
| 1027 | /* | ||
| 1028 | * Stop after the first non-trylock entry, | ||
| 1029 | * as non-trylock entries have added their | ||
| 1030 | * own direct dependencies already, so this | ||
| 1031 | * lock is connected to them indirectly: | ||
| 1032 | */ | ||
| 1033 | if (!hlock->trylock) | ||
| 1034 | break; | ||
| 1035 | } | ||
| 1036 | depth--; | ||
| 1037 | /* | ||
| 1038 | * End of lock-stack? | ||
| 1039 | */ | ||
| 1040 | if (!depth) | ||
| 1041 | break; | ||
| 1042 | /* | ||
| 1043 | * Stop the search if we cross into another context: | ||
| 1044 | */ | ||
| 1045 | if (curr->held_locks[depth].irq_context != | ||
| 1046 | curr->held_locks[depth-1].irq_context) | ||
| 1047 | break; | ||
| 1048 | } | ||
| 1049 | return 1; | ||
| 1050 | out_bug: | ||
| 1051 | __raw_spin_unlock(&hash_lock); | ||
| 1052 | DEBUG_LOCKS_WARN_ON(1); | ||
| 1053 | |||
| 1054 | return 0; | ||
| 1055 | } | ||
| 1056 | |||
| 1057 | |||
| 1058 | /* | ||
| 1059 | * Is this the address of a static object: | ||
| 1060 | */ | ||
| 1061 | static int static_obj(void *obj) | ||
| 1062 | { | ||
| 1063 | unsigned long start = (unsigned long) &_stext, | ||
| 1064 | end = (unsigned long) &_end, | ||
| 1065 | addr = (unsigned long) obj; | ||
| 1066 | #ifdef CONFIG_SMP | ||
| 1067 | int i; | ||
| 1068 | #endif | ||
| 1069 | |||
| 1070 | /* | ||
| 1071 | * static variable? | ||
| 1072 | */ | ||
| 1073 | if ((addr >= start) && (addr < end)) | ||
| 1074 | return 1; | ||
| 1075 | |||
| 1076 | #ifdef CONFIG_SMP | ||
| 1077 | /* | ||
| 1078 | * percpu var? | ||
| 1079 | */ | ||
| 1080 | for_each_possible_cpu(i) { | ||
| 1081 | start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); | ||
| 1082 | end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); | ||
| 1083 | |||
| 1084 | if ((addr >= start) && (addr < end)) | ||
| 1085 | return 1; | ||
| 1086 | } | ||
| 1087 | #endif | ||
| 1088 | |||
| 1089 | /* | ||
| 1090 | * module var? | ||
| 1091 | */ | ||
| 1092 | return is_module_address(addr); | ||
| 1093 | } | ||
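This persistence test backs the "non-static key" warning in register_lock_class() below. A hypothetical way to trigger it is to set up a heap-allocated lock with a static-style initializer instead of spin_lock_init(), which leaves the (heap) lock address itself as the key:

        /* Hypothetical sketch of what static_obj() catches. */
        struct my_ctx {
                spinlock_t lock;
        };

        static struct my_ctx *alloc_ctx(void)
        {
                struct my_ctx *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);

                if (!ctx)
                        return NULL;
                ctx->lock = SPIN_LOCK_UNLOCKED; /* key falls back to the heap
                                                 * address: warning fires */
                /* spin_lock_init(&ctx->lock) would register a proper static key. */
                return ctx;
        }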
| 1094 | |||
| 1095 | /* | ||
| 1096 | * To make lock name printouts unique, we calculate a unique | ||
| 1097 | * class->name_version generation counter: | ||
| 1098 | */ | ||
| 1099 | static int count_matching_names(struct lock_class *new_class) | ||
| 1100 | { | ||
| 1101 | struct lock_class *class; | ||
| 1102 | int count = 0; | ||
| 1103 | |||
| 1104 | if (!new_class->name) | ||
| 1105 | return 0; | ||
| 1106 | |||
| 1107 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | ||
| 1108 | if (new_class->key - new_class->subclass == class->key) | ||
| 1109 | return class->name_version; | ||
| 1110 | if (class->name && !strcmp(class->name, new_class->name)) | ||
| 1111 | count = max(count, class->name_version); | ||
| 1112 | } | ||
| 1113 | |||
| 1114 | return count + 1; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void); | ||
| 1118 | |||
| 1119 | /* | ||
| 1120 | * Register a lock's class in the hash-table, if the class is not present | ||
| 1121 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
| 1122 | * itself, so actual lookup of the hash should be once per lock object. | ||
| 1123 | */ | ||
| 1124 | static inline struct lock_class * | ||
| 1125 | look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | ||
| 1126 | { | ||
| 1127 | struct lockdep_subclass_key *key; | ||
| 1128 | struct list_head *hash_head; | ||
| 1129 | struct lock_class *class; | ||
| 1130 | |||
| 1131 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 1132 | /* | ||
| 1133 | * If the architecture calls into lockdep before initializing | ||
| 1134 | * the hashes then we'll warn about it later. (we cannot printk | ||
| 1135 | * right now) | ||
| 1136 | */ | ||
| 1137 | if (unlikely(!lockdep_initialized)) { | ||
| 1138 | lockdep_init(); | ||
| 1139 | lockdep_init_error = 1; | ||
| 1140 | } | ||
| 1141 | #endif | ||
| 1142 | |||
| 1143 | /* | ||
| 1144 | * Static locks do not have their class-keys yet - for them the key | ||
| 1145 | * is the lock object itself: | ||
| 1146 | */ | ||
| 1147 | if (unlikely(!lock->key)) | ||
| 1148 | lock->key = (void *)lock; | ||
| 1149 | |||
| 1150 | /* | ||
| 1151 | * NOTE: the class-key must be unique. For dynamic locks, a static | ||
| 1152 | * lock_class_key variable is passed in through the mutex_init() | ||
| 1153 | * (or spin_lock_init()) call - which acts as the key. For static | ||
| 1154 | * locks we use the lock object itself as the key. | ||
| 1155 | */ | ||
| 1156 | if (sizeof(struct lock_class_key) > sizeof(struct lock_class)) | ||
| 1157 | __error_too_big_MAX_LOCKDEP_SUBCLASSES(); | ||
| 1158 | |||
| 1159 | key = lock->key->subkeys + subclass; | ||
| 1160 | |||
| 1161 | hash_head = classhashentry(key); | ||
| 1162 | |||
| 1163 | /* | ||
| 1164 | * We can walk the hash lockfree, because the hash only | ||
| 1165 | * grows, and we are careful when adding entries to the end: | ||
| 1166 | */ | ||
| 1167 | list_for_each_entry(class, hash_head, hash_entry) | ||
| 1168 | if (class->key == key) | ||
| 1169 | return class; | ||
| 1170 | |||
| 1171 | return NULL; | ||
| 1172 | } | ||
| 1173 | |||
| 1174 | /* | ||
| 1175 | * Register a lock's class in the hash-table, if the class is not present | ||
| 1176 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
| 1177 | * itself, so actual lookup of the hash should be once per lock object. | ||
| 1178 | */ | ||
| 1179 | static inline struct lock_class * | ||
| 1180 | register_lock_class(struct lockdep_map *lock, unsigned int subclass) | ||
| 1181 | { | ||
| 1182 | struct lockdep_subclass_key *key; | ||
| 1183 | struct list_head *hash_head; | ||
| 1184 | struct lock_class *class; | ||
| 1185 | |||
| 1186 | class = look_up_lock_class(lock, subclass); | ||
| 1187 | if (likely(class)) | ||
| 1188 | return class; | ||
| 1189 | |||
| 1190 | /* | ||
| 1191 | * Debug-check: all keys must be persistent! | ||
| 1192 | */ | ||
| 1193 | if (!static_obj(lock->key)) { | ||
| 1194 | debug_locks_off(); | ||
| 1195 | printk("INFO: trying to register non-static key.\n"); | ||
| 1196 | printk("the code is fine but needs lockdep annotation.\n"); | ||
| 1197 | printk("turning off the locking correctness validator.\n"); | ||
| 1198 | dump_stack(); | ||
| 1199 | |||
| 1200 | return NULL; | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | key = lock->key->subkeys + subclass; | ||
| 1204 | hash_head = classhashentry(key); | ||
| 1205 | |||
| 1206 | __raw_spin_lock(&hash_lock); | ||
| 1207 | /* | ||
| 1208 | * We have to do the hash-walk again, to avoid races | ||
| 1209 | * with another CPU: | ||
| 1210 | */ | ||
| 1211 | list_for_each_entry(class, hash_head, hash_entry) | ||
| 1212 | if (class->key == key) | ||
| 1213 | goto out_unlock_set; | ||
| 1214 | /* | ||
| 1215 | * Allocate a new key from the static array, and add it to | ||
| 1216 | * the hash: | ||
| 1217 | */ | ||
| 1218 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | ||
| 1219 | __raw_spin_unlock(&hash_lock); | ||
| 1220 | debug_locks_off(); | ||
| 1221 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); | ||
| 1222 | printk("turning off the locking correctness validator.\n"); | ||
| 1223 | return NULL; | ||
| 1224 | } | ||
| 1225 | class = lock_classes + nr_lock_classes++; | ||
| 1226 | debug_atomic_inc(&nr_unused_locks); | ||
| 1227 | class->key = key; | ||
| 1228 | class->name = lock->name; | ||
| 1229 | class->subclass = subclass; | ||
| 1230 | INIT_LIST_HEAD(&class->lock_entry); | ||
| 1231 | INIT_LIST_HEAD(&class->locks_before); | ||
| 1232 | INIT_LIST_HEAD(&class->locks_after); | ||
| 1233 | class->name_version = count_matching_names(class); | ||
| 1234 | /* | ||
| 1235 | * We use RCU's safe list-add method to make | ||
| 1236 | * parallel walking of the hash-list safe: | ||
| 1237 | */ | ||
| 1238 | list_add_tail_rcu(&class->hash_entry, hash_head); | ||
| 1239 | |||
| 1240 | if (verbose(class)) { | ||
| 1241 | __raw_spin_unlock(&hash_lock); | ||
| 1242 | printk("\nnew class %p: %s", class->key, class->name); | ||
| 1243 | if (class->name_version > 1) | ||
| 1244 | printk("#%d", class->name_version); | ||
| 1245 | printk("\n"); | ||
| 1246 | dump_stack(); | ||
| 1247 | __raw_spin_lock(&hash_lock); | ||
| 1248 | } | ||
| 1249 | out_unlock_set: | ||
| 1250 | __raw_spin_unlock(&hash_lock); | ||
| 1251 | |||
| 1252 | if (!subclass) | ||
| 1253 | lock->class_cache = class; | ||
| 1254 | |||
| 1255 | DEBUG_LOCKS_WARN_ON(class->subclass != subclass); | ||
| 1256 | |||
| 1257 | return class; | ||
| 1258 | } | ||
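The shape of this function - probe the hash locklessly, take hash_lock, re-check, then publish the new entry with an RCU list add - is repeated by the chain cache below; a distilled stand-alone sketch of that pattern (overflow and error handling elided):

        /* Distilled find-or-create pattern; illustrative only. */
        struct pool_entry {
                struct list_head hash_entry;
                void *key;
        };

        static struct pool_entry pool[64];
        static unsigned int nr_pool_entries;

        static struct pool_entry *find_or_add(struct list_head *head, void *key)
        {
                struct pool_entry *e;

                list_for_each_entry(e, head, hash_entry)        /* lockless probe */
                        if (e->key == key)
                                return e;

                __raw_spin_lock(&hash_lock);
                list_for_each_entry(e, head, hash_entry)        /* re-check under the lock */
                        if (e->key == key)
                                goto out;
                e = &pool[nr_pool_entries++];                   /* bounded static pool */
                e->key = key;
                list_add_tail_rcu(&e->hash_entry, head);        /* publish to lockless walkers */
        out:
                __raw_spin_unlock(&hash_lock);
                return e;
        }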
| 1259 | |||
| 1260 | /* | ||
| 1261 | * Look up a dependency chain. If the key is not present yet then | ||
| 1262 | * add it and return 0 - in this case the new dependency chain is | ||
| 1263 | * validated. If the key is already hashed, return 1. | ||
| 1264 | */ | ||
| 1265 | static inline int lookup_chain_cache(u64 chain_key) | ||
| 1266 | { | ||
| 1267 | struct list_head *hash_head = chainhashentry(chain_key); | ||
| 1268 | struct lock_chain *chain; | ||
| 1269 | |||
| 1270 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); | ||
| 1271 | /* | ||
| 1272 | * We can walk it lock-free, because entries only get added | ||
| 1273 | * to the hash: | ||
| 1274 | */ | ||
| 1275 | list_for_each_entry(chain, hash_head, entry) { | ||
| 1276 | if (chain->chain_key == chain_key) { | ||
| 1277 | cache_hit: | ||
| 1278 | debug_atomic_inc(&chain_lookup_hits); | ||
| 1279 | /* | ||
| 1280 | * In the debugging case, force redundant checking | ||
| 1281 | * by returning 1: | ||
| 1282 | */ | ||
| 1283 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 1284 | __raw_spin_lock(&hash_lock); | ||
| 1285 | return 1; | ||
| 1286 | #endif | ||
| 1287 | return 0; | ||
| 1288 | } | ||
| 1289 | } | ||
| 1290 | /* | ||
| 1291 | * Allocate a new chain entry from the static array, and add | ||
| 1292 | * it to the hash: | ||
| 1293 | */ | ||
| 1294 | __raw_spin_lock(&hash_lock); | ||
| 1295 | /* | ||
| 1296 | * We have to walk the chain again locked - to avoid duplicates: | ||
| 1297 | */ | ||
| 1298 | list_for_each_entry(chain, hash_head, entry) { | ||
| 1299 | if (chain->chain_key == chain_key) { | ||
| 1300 | __raw_spin_unlock(&hash_lock); | ||
| 1301 | goto cache_hit; | ||
| 1302 | } | ||
| 1303 | } | ||
| 1304 | if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { | ||
| 1305 | __raw_spin_unlock(&hash_lock); | ||
| 1306 | debug_locks_off(); | ||
| 1307 | printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); | ||
| 1308 | printk("turning off the locking correctness validator.\n"); | ||
| 1309 | return 0; | ||
| 1310 | } | ||
| 1311 | chain = lock_chains + nr_lock_chains++; | ||
| 1312 | chain->chain_key = chain_key; | ||
| 1313 | list_add_tail_rcu(&chain->entry, hash_head); | ||
| 1314 | debug_atomic_inc(&chain_lookup_misses); | ||
| 1315 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 1316 | if (current->hardirq_context) | ||
| 1317 | nr_hardirq_chains++; | ||
| 1318 | else { | ||
| 1319 | if (current->softirq_context) | ||
| 1320 | nr_softirq_chains++; | ||
| 1321 | else | ||
| 1322 | nr_process_chains++; | ||
| 1323 | } | ||
| 1324 | #else | ||
| 1325 | nr_process_chains++; | ||
| 1326 | #endif | ||
| 1327 | |||
| 1328 | return 1; | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | /* | ||
| 1332 | * We are building curr_chain_key incrementally, so double-check | ||
| 1333 | * it from scratch, to make sure that it's done correctly: | ||
| 1334 | */ | ||
| 1335 | static void check_chain_key(struct task_struct *curr) | ||
| 1336 | { | ||
| 1337 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 1338 | struct held_lock *hlock, *prev_hlock = NULL; | ||
| 1339 | unsigned int i, id; | ||
| 1340 | u64 chain_key = 0; | ||
| 1341 | |||
| 1342 | for (i = 0; i < curr->lockdep_depth; i++) { | ||
| 1343 | hlock = curr->held_locks + i; | ||
| 1344 | if (chain_key != hlock->prev_chain_key) { | ||
| 1345 | debug_locks_off(); | ||
| 1346 | printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", | ||
| 1347 | curr->lockdep_depth, i, | ||
| 1348 | (unsigned long long)chain_key, | ||
| 1349 | (unsigned long long)hlock->prev_chain_key); | ||
| 1350 | WARN_ON(1); | ||
| 1351 | return; | ||
| 1352 | } | ||
| 1353 | id = hlock->class - lock_classes; | ||
| 1354 | DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS); | ||
| 1355 | if (prev_hlock && (prev_hlock->irq_context != | ||
| 1356 | hlock->irq_context)) | ||
| 1357 | chain_key = 0; | ||
| 1358 | chain_key = iterate_chain_key(chain_key, id); | ||
| 1359 | prev_hlock = hlock; | ||
| 1360 | } | ||
| 1361 | if (chain_key != curr->curr_chain_key) { | ||
| 1362 | debug_locks_off(); | ||
| 1363 | printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", | ||
| 1364 | curr->lockdep_depth, i, | ||
| 1365 | (unsigned long long)chain_key, | ||
| 1366 | (unsigned long long)curr->curr_chain_key); | ||
| 1367 | WARN_ON(1); | ||
| 1368 | } | ||
| 1369 | #endif | ||
| 1370 | } | ||
| 1371 | |||
| 1372 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 1373 | |||
| 1374 | /* | ||
| 1375 | * print irq inversion bug: | ||
| 1376 | */ | ||
| 1377 | static int | ||
| 1378 | print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, | ||
| 1379 | struct held_lock *this, int forwards, | ||
| 1380 | const char *irqclass) | ||
| 1381 | { | ||
| 1382 | __raw_spin_unlock(&hash_lock); | ||
| 1383 | debug_locks_off(); | ||
| 1384 | if (debug_locks_silent) | ||
| 1385 | return 0; | ||
| 1386 | |||
| 1387 | printk("\n=========================================================\n"); | ||
| 1388 | printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); | ||
| 1389 | print_kernel_version(); | ||
| 1390 | printk( "---------------------------------------------------------\n"); | ||
| 1391 | printk("%s/%d just changed the state of lock:\n", | ||
| 1392 | curr->comm, curr->pid); | ||
| 1393 | print_lock(this); | ||
| 1394 | if (forwards) | ||
| 1395 | printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); | ||
| 1396 | else | ||
| 1397 | printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); | ||
| 1398 | print_lock_name(other); | ||
| 1399 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); | ||
| 1400 | |||
| 1401 | printk("\nother info that might help us debug this:\n"); | ||
| 1402 | lockdep_print_held_locks(curr); | ||
| 1403 | |||
| 1404 | printk("\nthe first lock's dependencies:\n"); | ||
| 1405 | print_lock_dependencies(this->class, 0); | ||
| 1406 | |||
| 1407 | printk("\nthe second lock's dependencies:\n"); | ||
| 1408 | print_lock_dependencies(other, 0); | ||
| 1409 | |||
| 1410 | printk("\nstack backtrace:\n"); | ||
| 1411 | dump_stack(); | ||
| 1412 | |||
| 1413 | return 0; | ||
| 1414 | } | ||
| 1415 | |||
| 1416 | /* | ||
| 1417 | * Prove that in the forwards-direction subgraph starting at <this> | ||
| 1418 | * there is no lock matching <mask>: | ||
| 1419 | */ | ||
| 1420 | static int | ||
| 1421 | check_usage_forwards(struct task_struct *curr, struct held_lock *this, | ||
| 1422 | enum lock_usage_bit bit, const char *irqclass) | ||
| 1423 | { | ||
| 1424 | int ret; | ||
| 1425 | |||
| 1426 | find_usage_bit = bit; | ||
| 1427 | /* fills in <forwards_match> */ | ||
| 1428 | ret = find_usage_forwards(this->class, 0); | ||
| 1429 | if (!ret || ret == 1) | ||
| 1430 | return ret; | ||
| 1431 | |||
| 1432 | return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); | ||
| 1433 | } | ||
| 1434 | |||
| 1435 | /* | ||
| 1436 | * Prove that in the backwards-direction subgraph starting at <this> | ||
| 1437 | * there is no lock matching <mask>: | ||
| 1438 | */ | ||
| 1439 | static int | ||
| 1440 | check_usage_backwards(struct task_struct *curr, struct held_lock *this, | ||
| 1441 | enum lock_usage_bit bit, const char *irqclass) | ||
| 1442 | { | ||
| 1443 | int ret; | ||
| 1444 | |||
| 1445 | find_usage_bit = bit; | ||
| 1446 | /* fills in <backwards_match> */ | ||
| 1447 | ret = find_usage_backwards(this->class, 0); | ||
| 1448 | if (!ret || ret == 1) | ||
| 1449 | return ret; | ||
| 1450 | |||
| 1451 | return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); | ||
| 1452 | } | ||
| 1453 | |||
| 1454 | static inline void print_irqtrace_events(struct task_struct *curr) | ||
| 1455 | { | ||
| 1456 | printk("irq event stamp: %u\n", curr->irq_events); | ||
| 1457 | printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); | ||
| 1458 | print_ip_sym(curr->hardirq_enable_ip); | ||
| 1459 | printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); | ||
| 1460 | print_ip_sym(curr->hardirq_disable_ip); | ||
| 1461 | printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); | ||
| 1462 | print_ip_sym(curr->softirq_enable_ip); | ||
| 1463 | printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); | ||
| 1464 | print_ip_sym(curr->softirq_disable_ip); | ||
| 1465 | } | ||
| 1466 | |||
| 1467 | #else | ||
| 1468 | static inline void print_irqtrace_events(struct task_struct *curr) | ||
| 1469 | { | ||
| 1470 | } | ||
| 1471 | #endif | ||
| 1472 | |||
| 1473 | static int | ||
| 1474 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | ||
| 1475 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | ||
| 1476 | { | ||
| 1477 | __raw_spin_unlock(&hash_lock); | ||
| 1478 | debug_locks_off(); | ||
| 1479 | if (debug_locks_silent) | ||
| 1480 | return 0; | ||
| 1481 | |||
| 1482 | printk("\n=================================\n"); | ||
| 1483 | printk( "[ INFO: inconsistent lock state ]\n"); | ||
| 1484 | print_kernel_version(); | ||
| 1485 | printk( "---------------------------------\n"); | ||
| 1486 | |||
| 1487 | printk("inconsistent {%s} -> {%s} usage.\n", | ||
| 1488 | usage_str[prev_bit], usage_str[new_bit]); | ||
| 1489 | |||
| 1490 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", | ||
| 1491 | curr->comm, curr->pid, | ||
| 1492 | trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, | ||
| 1493 | trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, | ||
| 1494 | trace_hardirqs_enabled(curr), | ||
| 1495 | trace_softirqs_enabled(curr)); | ||
| 1496 | print_lock(this); | ||
| 1497 | |||
| 1498 | printk("{%s} state was registered at:\n", usage_str[prev_bit]); | ||
| 1499 | print_stack_trace(this->class->usage_traces + prev_bit, 1); | ||
| 1500 | |||
| 1501 | print_irqtrace_events(curr); | ||
| 1502 | printk("\nother info that might help us debug this:\n"); | ||
| 1503 | lockdep_print_held_locks(curr); | ||
| 1504 | |||
| 1505 | printk("\nstack backtrace:\n"); | ||
| 1506 | dump_stack(); | ||
| 1507 | |||
| 1508 | return 0; | ||
| 1509 | } | ||
| 1510 | |||
| 1511 | /* | ||
| 1512 | * Print out an error if an invalid bit is set: | ||
| 1513 | */ | ||
| 1514 | static inline int | ||
| 1515 | valid_state(struct task_struct *curr, struct held_lock *this, | ||
| 1516 | enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) | ||
| 1517 | { | ||
| 1518 | if (unlikely(this->class->usage_mask & (1 << bad_bit))) | ||
| 1519 | return print_usage_bug(curr, this, bad_bit, new_bit); | ||
| 1520 | return 1; | ||
| 1521 | } | ||
| 1522 | |||
| 1523 | #define STRICT_READ_CHECKS 1 | ||
| 1524 | |||
| 1525 | /* | ||
| 1526 | * Mark a lock with a usage bit, and validate the state transition: | ||
| 1527 | */ | ||
| 1528 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | ||
| 1529 | enum lock_usage_bit new_bit, unsigned long ip) | ||
| 1530 | { | ||
| 1531 | unsigned int new_mask = 1 << new_bit, ret = 1; | ||
| 1532 | |||
| 1533 | /* | ||
| 1534 | * If already set then do not dirty the cacheline, | ||
| 1535 | * nor do any checks: | ||
| 1536 | */ | ||
| 1537 | if (likely(this->class->usage_mask & new_mask)) | ||
| 1538 | return 1; | ||
| 1539 | |||
| 1540 | __raw_spin_lock(&hash_lock); | ||
| 1541 | /* | ||
| 1542 | * Make sure we didn't race: | ||
| 1543 | */ | ||
| 1544 | if (unlikely(this->class->usage_mask & new_mask)) { | ||
| 1545 | __raw_spin_unlock(&hash_lock); | ||
| 1546 | return 1; | ||
| 1547 | } | ||
| 1548 | |||
| 1549 | this->class->usage_mask |= new_mask; | ||
| 1550 | |||
| 1551 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 1552 | if (new_bit == LOCK_ENABLED_HARDIRQS || | ||
| 1553 | new_bit == LOCK_ENABLED_HARDIRQS_READ) | ||
| 1554 | ip = curr->hardirq_enable_ip; | ||
| 1555 | else if (new_bit == LOCK_ENABLED_SOFTIRQS || | ||
| 1556 | new_bit == LOCK_ENABLED_SOFTIRQS_READ) | ||
| 1557 | ip = curr->softirq_enable_ip; | ||
| 1558 | #endif | ||
| 1559 | if (!save_trace(this->class->usage_traces + new_bit)) | ||
| 1560 | return 0; | ||
| 1561 | |||
| 1562 | switch (new_bit) { | ||
| 1563 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 1564 | case LOCK_USED_IN_HARDIRQ: | ||
| 1565 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) | ||
| 1566 | return 0; | ||
| 1567 | if (!valid_state(curr, this, new_bit, | ||
| 1568 | LOCK_ENABLED_HARDIRQS_READ)) | ||
| 1569 | return 0; | ||
| 1570 | /* | ||
| 1571 | * just marked it hardirq-safe, check that this lock | ||
| 1572 | * took no hardirq-unsafe lock in the past: | ||
| 1573 | */ | ||
| 1574 | if (!check_usage_forwards(curr, this, | ||
| 1575 | LOCK_ENABLED_HARDIRQS, "hard")) | ||
| 1576 | return 0; | ||
| 1577 | #if STRICT_READ_CHECKS | ||
| 1578 | /* | ||
| 1579 | * just marked it hardirq-safe, check that this lock | ||
| 1580 | * took no hardirq-unsafe-read lock in the past: | ||
| 1581 | */ | ||
| 1582 | if (!check_usage_forwards(curr, this, | ||
| 1583 | LOCK_ENABLED_HARDIRQS_READ, "hard-read")) | ||
| 1584 | return 0; | ||
| 1585 | #endif | ||
| 1586 | if (hardirq_verbose(this->class)) | ||
| 1587 | ret = 2; | ||
| 1588 | break; | ||
| 1589 | case LOCK_USED_IN_SOFTIRQ: | ||
| 1590 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) | ||
| 1591 | return 0; | ||
| 1592 | if (!valid_state(curr, this, new_bit, | ||
| 1593 | LOCK_ENABLED_SOFTIRQS_READ)) | ||
| 1594 | return 0; | ||
| 1595 | /* | ||
| 1596 | * just marked it softirq-safe, check that this lock | ||
| 1597 | * took no softirq-unsafe lock in the past: | ||
| 1598 | */ | ||
| 1599 | if (!check_usage_forwards(curr, this, | ||
| 1600 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
| 1601 | return 0; | ||
| 1602 | #if STRICT_READ_CHECKS | ||
| 1603 | /* | ||
| 1604 | * just marked it softirq-safe, check that this lock | ||
| 1605 | * took no softirq-unsafe-read lock in the past: | ||
| 1606 | */ | ||
| 1607 | if (!check_usage_forwards(curr, this, | ||
| 1608 | LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) | ||
| 1609 | return 0; | ||
| 1610 | #endif | ||
| 1611 | if (softirq_verbose(this->class)) | ||
| 1612 | ret = 2; | ||
| 1613 | break; | ||
| 1614 | case LOCK_USED_IN_HARDIRQ_READ: | ||
| 1615 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) | ||
| 1616 | return 0; | ||
| 1617 | /* | ||
| 1618 | * just marked it hardirq-read-safe, check that this lock | ||
| 1619 | * took no hardirq-unsafe lock in the past: | ||
| 1620 | */ | ||
| 1621 | if (!check_usage_forwards(curr, this, | ||
| 1622 | LOCK_ENABLED_HARDIRQS, "hard")) | ||
| 1623 | return 0; | ||
| 1624 | if (hardirq_verbose(this->class)) | ||
| 1625 | ret = 2; | ||
| 1626 | break; | ||
| 1627 | case LOCK_USED_IN_SOFTIRQ_READ: | ||
| 1628 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) | ||
| 1629 | return 0; | ||
| 1630 | /* | ||
| 1631 | * just marked it softirq-read-safe, check that this lock | ||
| 1632 | * took no softirq-unsafe lock in the past: | ||
| 1633 | */ | ||
| 1634 | if (!check_usage_forwards(curr, this, | ||
| 1635 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
| 1636 | return 0; | ||
| 1637 | if (softirq_verbose(this->class)) | ||
| 1638 | ret = 2; | ||
| 1639 | break; | ||
| 1640 | case LOCK_ENABLED_HARDIRQS: | ||
| 1641 | if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) | ||
| 1642 | return 0; | ||
| 1643 | if (!valid_state(curr, this, new_bit, | ||
| 1644 | LOCK_USED_IN_HARDIRQ_READ)) | ||
| 1645 | return 0; | ||
| 1646 | /* | ||
| 1647 | * just marked it hardirq-unsafe, check that no hardirq-safe | ||
| 1648 | * lock in the system ever took it in the past: | ||
| 1649 | */ | ||
| 1650 | if (!check_usage_backwards(curr, this, | ||
| 1651 | LOCK_USED_IN_HARDIRQ, "hard")) | ||
| 1652 | return 0; | ||
| 1653 | #if STRICT_READ_CHECKS | ||
| 1654 | /* | ||
| 1655 | * just marked it hardirq-unsafe, check that no | ||
| 1656 | * hardirq-safe-read lock in the system ever took | ||
| 1657 | * it in the past: | ||
| 1658 | */ | ||
| 1659 | if (!check_usage_backwards(curr, this, | ||
| 1660 | LOCK_USED_IN_HARDIRQ_READ, "hard-read")) | ||
| 1661 | return 0; | ||
| 1662 | #endif | ||
| 1663 | if (hardirq_verbose(this->class)) | ||
| 1664 | ret = 2; | ||
| 1665 | break; | ||
| 1666 | case LOCK_ENABLED_SOFTIRQS: | ||
| 1667 | if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) | ||
| 1668 | return 0; | ||
| 1669 | if (!valid_state(curr, this, new_bit, | ||
| 1670 | LOCK_USED_IN_SOFTIRQ_READ)) | ||
| 1671 | return 0; | ||
| 1672 | /* | ||
| 1673 | * just marked it softirq-unsafe, check that no softirq-safe | ||
| 1674 | * lock in the system ever took it in the past: | ||
| 1675 | */ | ||
| 1676 | if (!check_usage_backwards(curr, this, | ||
| 1677 | LOCK_USED_IN_SOFTIRQ, "soft")) | ||
| 1678 | return 0; | ||
| 1679 | #if STRICT_READ_CHECKS | ||
| 1680 | /* | ||
| 1681 | * just marked it softirq-unsafe, check that no | ||
| 1682 | * softirq-safe-read lock in the system ever took | ||
| 1683 | * it in the past: | ||
| 1684 | */ | ||
| 1685 | if (!check_usage_backwards(curr, this, | ||
| 1686 | LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) | ||
| 1687 | return 0; | ||
| 1688 | #endif | ||
| 1689 | if (softirq_verbose(this->class)) | ||
| 1690 | ret = 2; | ||
| 1691 | break; | ||
| 1692 | case LOCK_ENABLED_HARDIRQS_READ: | ||
| 1693 | if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) | ||
| 1694 | return 0; | ||
| 1695 | #if STRICT_READ_CHECKS | ||
| 1696 | /* | ||
| 1697 | * just marked it hardirq-read-unsafe, check that no | ||
| 1698 | * hardirq-safe lock in the system ever took it in the past: | ||
| 1699 | */ | ||
| 1700 | if (!check_usage_backwards(curr, this, | ||
| 1701 | LOCK_USED_IN_HARDIRQ, "hard")) | ||
| 1702 | return 0; | ||
| 1703 | #endif | ||
| 1704 | if (hardirq_verbose(this->class)) | ||
| 1705 | ret = 2; | ||
| 1706 | break; | ||
| 1707 | case LOCK_ENABLED_SOFTIRQS_READ: | ||
| 1708 | if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) | ||
| 1709 | return 0; | ||
| 1710 | #if STRICT_READ_CHECKS | ||
| 1711 | /* | ||
| 1712 | * just marked it softirq-read-unsafe, check that no | ||
| 1713 | * softirq-safe lock in the system ever took it in the past: | ||
| 1714 | */ | ||
| 1715 | if (!check_usage_backwards(curr, this, | ||
| 1716 | LOCK_USED_IN_SOFTIRQ, "soft")) | ||
| 1717 | return 0; | ||
| 1718 | #endif | ||
| 1719 | if (softirq_verbose(this->class)) | ||
| 1720 | ret = 2; | ||
| 1721 | break; | ||
| 1722 | #endif | ||
| 1723 | case LOCK_USED: | ||
| 1724 | /* | ||
| 1725 | * Add it to the global list of classes: | ||
| 1726 | */ | ||
| 1727 | list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); | ||
| 1728 | debug_atomic_dec(&nr_unused_locks); | ||
| 1729 | break; | ||
| 1730 | default: | ||
| 1731 | debug_locks_off(); | ||
| 1732 | WARN_ON(1); | ||
| 1733 | return 0; | ||
| 1734 | } | ||
| 1735 | |||
| 1736 | __raw_spin_unlock(&hash_lock); | ||
| 1737 | |||
| 1738 | /* | ||
| 1739 | * We must printk outside of the hash_lock: | ||
| 1740 | */ | ||
| 1741 | if (ret == 2) { | ||
| 1742 | printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); | ||
| 1743 | print_lock(this); | ||
| 1744 | print_irqtrace_events(curr); | ||
| 1745 | dump_stack(); | ||
| 1746 | } | ||
| 1747 | |||
| 1748 | return ret; | ||
| 1749 | } | ||
| 1750 | |||
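The switch statement above is lockdep's usage-bit state machine: each lock class accumulates bits such as "used in hardirq context" or "held with hardirqs enabled", and valid_state() rejects mutually exclusive pairs. A minimal standalone sketch of that invariant follows; the bit values and the consistent() helper are illustrative only, not lockdep's real definitions.

#include <stdio.h>

/* Illustrative usage bits -- not the kernel's actual values: */
enum {
        USED_IN_HARDIRQ  = 1 << 0,      /* taken from hardirq context */
        ENABLED_HARDIRQS = 1 << 1,      /* held with hardirqs enabled */
};

/*
 * Once both bits are set the class is inconsistent: a hardirq could
 * interrupt a holder and deadlock trying to take the same lock again.
 */
static int consistent(unsigned int usage_mask)
{
        return !((usage_mask & USED_IN_HARDIRQ) &&
                 (usage_mask & ENABLED_HARDIRQS));
}

int main(void)
{
        unsigned int mask = USED_IN_HARDIRQ;

        printf("after first usage:  %d\n", consistent(mask));   /* 1 */
        mask |= ENABLED_HARDIRQS;
        printf("after second usage: %d\n", consistent(mask));   /* 0 */
        return 0;
}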
| 1751 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 1752 | /* | ||
| 1753 | * Mark all held locks with a usage bit: | ||
| 1754 | */ | ||
| 1755 | static int | ||
| 1756 | mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) | ||
| 1757 | { | ||
| 1758 | enum lock_usage_bit usage_bit; | ||
| 1759 | struct held_lock *hlock; | ||
| 1760 | int i; | ||
| 1761 | |||
| 1762 | for (i = 0; i < curr->lockdep_depth; i++) { | ||
| 1763 | hlock = curr->held_locks + i; | ||
| 1764 | |||
| 1765 | if (hardirq) { | ||
| 1766 | if (hlock->read) | ||
| 1767 | usage_bit = LOCK_ENABLED_HARDIRQS_READ; | ||
| 1768 | else | ||
| 1769 | usage_bit = LOCK_ENABLED_HARDIRQS; | ||
| 1770 | } else { | ||
| 1771 | if (hlock->read) | ||
| 1772 | usage_bit = LOCK_ENABLED_SOFTIRQS_READ; | ||
| 1773 | else | ||
| 1774 | usage_bit = LOCK_ENABLED_SOFTIRQS; | ||
| 1775 | } | ||
| 1776 | if (!mark_lock(curr, hlock, usage_bit, ip)) | ||
| 1777 | return 0; | ||
| 1778 | } | ||
| 1779 | |||
| 1780 | return 1; | ||
| 1781 | } | ||
| 1782 | |||
| 1783 | /* | ||
| 1784 | * Debugging helper: via this flag we know that we are in | ||
| 1785 | * 'early bootup code', and will warn about any invalid irqs-on event: | ||
| 1786 | */ | ||
| 1787 | static int early_boot_irqs_enabled; | ||
| 1788 | |||
| 1789 | void early_boot_irqs_off(void) | ||
| 1790 | { | ||
| 1791 | early_boot_irqs_enabled = 0; | ||
| 1792 | } | ||
| 1793 | |||
| 1794 | void early_boot_irqs_on(void) | ||
| 1795 | { | ||
| 1796 | early_boot_irqs_enabled = 1; | ||
| 1797 | } | ||
| 1798 | |||
| 1799 | /* | ||
| 1800 | * Hardirqs will be enabled: | ||
| 1801 | */ | ||
| 1802 | void trace_hardirqs_on(void) | ||
| 1803 | { | ||
| 1804 | struct task_struct *curr = current; | ||
| 1805 | unsigned long ip; | ||
| 1806 | |||
| 1807 | if (unlikely(!debug_locks || current->lockdep_recursion)) | ||
| 1808 | return; | ||
| 1809 | |||
| 1810 | if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) | ||
| 1811 | return; | ||
| 1812 | |||
| 1813 | if (unlikely(curr->hardirqs_enabled)) { | ||
| 1814 | debug_atomic_inc(&redundant_hardirqs_on); | ||
| 1815 | return; | ||
| 1816 | } | ||
| 1817 | /* we'll do an OFF -> ON transition: */ | ||
| 1818 | curr->hardirqs_enabled = 1; | ||
| 1819 | ip = (unsigned long) __builtin_return_address(0); | ||
| 1820 | |||
| 1821 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
| 1822 | return; | ||
| 1823 | if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) | ||
| 1824 | return; | ||
| 1825 | /* | ||
| 1826 | * We are going to turn hardirqs on, so set the | ||
| 1827 | * usage bit for all held locks: | ||
| 1828 | */ | ||
| 1829 | if (!mark_held_locks(curr, 1, ip)) | ||
| 1830 | return; | ||
| 1831 | /* | ||
| 1832 | * If we have softirqs enabled, then set the usage | ||
| 1833 | * bit for all held locks. (disabled hardirqs prevented | ||
| 1834 | * this bit from being set before) | ||
| 1835 | */ | ||
| 1836 | if (curr->softirqs_enabled) | ||
| 1837 | if (!mark_held_locks(curr, 0, ip)) | ||
| 1838 | return; | ||
| 1839 | |||
| 1840 | curr->hardirq_enable_ip = ip; | ||
| 1841 | curr->hardirq_enable_event = ++curr->irq_events; | ||
| 1842 | debug_atomic_inc(&hardirqs_on_events); | ||
| 1843 | } | ||
| 1844 | |||
| 1845 | EXPORT_SYMBOL(trace_hardirqs_on); | ||
| 1846 | |||
| 1847 | /* | ||
| 1848 | * Hardirqs were disabled: | ||
| 1849 | */ | ||
| 1850 | void trace_hardirqs_off(void) | ||
| 1851 | { | ||
| 1852 | struct task_struct *curr = current; | ||
| 1853 | |||
| 1854 | if (unlikely(!debug_locks || current->lockdep_recursion)) | ||
| 1855 | return; | ||
| 1856 | |||
| 1857 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
| 1858 | return; | ||
| 1859 | |||
| 1860 | if (curr->hardirqs_enabled) { | ||
| 1861 | /* | ||
| 1862 | * We have done an ON -> OFF transition: | ||
| 1863 | */ | ||
| 1864 | curr->hardirqs_enabled = 0; | ||
| 1865 | curr->hardirq_disable_ip = _RET_IP_; | ||
| 1866 | curr->hardirq_disable_event = ++curr->irq_events; | ||
| 1867 | debug_atomic_inc(&hardirqs_off_events); | ||
| 1868 | } else | ||
| 1869 | debug_atomic_inc(&redundant_hardirqs_off); | ||
| 1870 | } | ||
| 1871 | |||
| 1872 | EXPORT_SYMBOL(trace_hardirqs_off); | ||
| 1873 | |||
| 1874 | /* | ||
| 1875 | * Softirqs will be enabled: | ||
| 1876 | */ | ||
| 1877 | void trace_softirqs_on(unsigned long ip) | ||
| 1878 | { | ||
| 1879 | struct task_struct *curr = current; | ||
| 1880 | |||
| 1881 | if (unlikely(!debug_locks)) | ||
| 1882 | return; | ||
| 1883 | |||
| 1884 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
| 1885 | return; | ||
| 1886 | |||
| 1887 | if (curr->softirqs_enabled) { | ||
| 1888 | debug_atomic_inc(&redundant_softirqs_on); | ||
| 1889 | return; | ||
| 1890 | } | ||
| 1891 | |||
| 1892 | /* | ||
| 1893 | * We'll do an OFF -> ON transition: | ||
| 1894 | */ | ||
| 1895 | curr->softirqs_enabled = 1; | ||
| 1896 | curr->softirq_enable_ip = ip; | ||
| 1897 | curr->softirq_enable_event = ++curr->irq_events; | ||
| 1898 | debug_atomic_inc(&softirqs_on_events); | ||
| 1899 | /* | ||
| 1900 | * We are going to turn softirqs on, so set the | ||
| 1901 | * usage bit for all held locks, if hardirqs are | ||
| 1902 | * enabled too: | ||
| 1903 | */ | ||
| 1904 | if (curr->hardirqs_enabled) | ||
| 1905 | mark_held_locks(curr, 0, ip); | ||
| 1906 | } | ||
| 1907 | |||
| 1908 | /* | ||
| 1909 | * Softirqs were disabled: | ||
| 1910 | */ | ||
| 1911 | void trace_softirqs_off(unsigned long ip) | ||
| 1912 | { | ||
| 1913 | struct task_struct *curr = current; | ||
| 1914 | |||
| 1915 | if (unlikely(!debug_locks)) | ||
| 1916 | return; | ||
| 1917 | |||
| 1918 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
| 1919 | return; | ||
| 1920 | |||
| 1921 | if (curr->softirqs_enabled) { | ||
| 1922 | /* | ||
| 1923 | * We have done an ON -> OFF transition: | ||
| 1924 | */ | ||
| 1925 | curr->softirqs_enabled = 0; | ||
| 1926 | curr->softirq_disable_ip = ip; | ||
| 1927 | curr->softirq_disable_event = ++curr->irq_events; | ||
| 1928 | debug_atomic_inc(&softirqs_off_events); | ||
| 1929 | DEBUG_LOCKS_WARN_ON(!softirq_count()); | ||
| 1930 | } else | ||
| 1931 | debug_atomic_inc(&redundant_softirqs_off); | ||
| 1932 | } | ||
| 1933 | |||
| 1934 | #endif | ||
| 1935 | |||
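The trace_hardirqs_*() and trace_softirqs_*() hooks above record, per task, the caller's instruction pointer and a monotonically increasing event number at every OFF <-> ON transition; those are the values the "last enabled/disabled at (%u)" lines earlier in this file print. A userspace sketch of that record-on-transition idea, using a struct made up purely for illustration:

#include <stdio.h>

struct irq_trace {
        int enabled;
        unsigned long enable_ip;
        unsigned int enable_event, events;
};

/* Record an OFF -> ON transition together with who performed it: */
static void note_irqs_on(struct irq_trace *t, unsigned long ip)
{
        if (t->enabled)
                return;         /* redundant enable: nothing new to record */
        t->enabled = 1;
        t->enable_ip = ip;
        t->enable_event = ++t->events;
}

int main(void)
{
        struct irq_trace t = { 0 };

        note_irqs_on(&t, (unsigned long)__builtin_return_address(0));
        printf("irqs enabled at event %u, ip %#lx\n",
               t.enable_event, t.enable_ip);
        return 0;
}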
| 1936 | /* | ||
| 1937 | * Initialize a lock instance's lock-class mapping info: | ||
| 1938 | */ | ||
| 1939 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | ||
| 1940 | struct lock_class_key *key) | ||
| 1941 | { | ||
| 1942 | if (unlikely(!debug_locks)) | ||
| 1943 | return; | ||
| 1944 | |||
| 1945 | if (DEBUG_LOCKS_WARN_ON(!key)) | ||
| 1946 | return; | ||
| 1947 | if (DEBUG_LOCKS_WARN_ON(!name)) | ||
| 1948 | return; | ||
| 1949 | /* | ||
| 1950 | * Sanity check, the lock-class key must be persistent: | ||
| 1951 | */ | ||
| 1952 | if (!static_obj(key)) { | ||
| 1953 | printk("BUG: key %p not in .data!\n", key); | ||
| 1954 | DEBUG_LOCKS_WARN_ON(1); | ||
| 1955 | return; | ||
| 1956 | } | ||
| 1957 | lock->name = name; | ||
| 1958 | lock->key = key; | ||
| 1959 | lock->class_cache = NULL; | ||
| 1960 | } | ||
| 1961 | |||
| 1962 | EXPORT_SYMBOL_GPL(lockdep_init_map); | ||
| 1963 | |||
| 1964 | /* | ||
| 1965 | * This gets called for every mutex_lock*()/spin_lock*() operation. | ||
| 1966 | * We maintain the dependency maps and validate the locking attempt: | ||
| 1967 | */ | ||
| 1968 | static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | ||
| 1969 | int trylock, int read, int check, int hardirqs_off, | ||
| 1970 | unsigned long ip) | ||
| 1971 | { | ||
| 1972 | struct task_struct *curr = current; | ||
| 1973 | struct lock_class *class = NULL; | ||
| 1974 | struct held_lock *hlock; | ||
| 1975 | unsigned int depth, id; | ||
| 1976 | int chain_head = 0; | ||
| 1977 | u64 chain_key; | ||
| 1978 | |||
| 1979 | if (unlikely(!debug_locks)) | ||
| 1980 | return 0; | ||
| 1981 | |||
| 1982 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
| 1983 | return 0; | ||
| 1984 | |||
| 1985 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | ||
| 1986 | debug_locks_off(); | ||
| 1987 | printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); | ||
| 1988 | printk("turning off the locking correctness validator.\n"); | ||
| 1989 | return 0; | ||
| 1990 | } | ||
| 1991 | |||
| 1992 | if (!subclass) | ||
| 1993 | class = lock->class_cache; | ||
| 1994 | /* | ||
| 1995 | * Not cached yet or subclass? | ||
| 1996 | */ | ||
| 1997 | if (unlikely(!class)) { | ||
| 1998 | class = register_lock_class(lock, subclass); | ||
| 1999 | if (!class) | ||
| 2000 | return 0; | ||
| 2001 | } | ||
| 2002 | debug_atomic_inc((atomic_t *)&class->ops); | ||
| 2003 | if (very_verbose(class)) { | ||
| 2004 | printk("\nacquire class [%p] %s", class->key, class->name); | ||
| 2005 | if (class->name_version > 1) | ||
| 2006 | printk("#%d", class->name_version); | ||
| 2007 | printk("\n"); | ||
| 2008 | dump_stack(); | ||
| 2009 | } | ||
| 2010 | |||
| 2011 | /* | ||
| 2012 | * Add the lock to the list of currently held locks. | ||
| 2013 | * (we don't increase the depth just yet, up until the | ||
| 2014 | * dependency checks are done) | ||
| 2015 | */ | ||
| 2016 | depth = curr->lockdep_depth; | ||
| 2017 | if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) | ||
| 2018 | return 0; | ||
| 2019 | |||
| 2020 | hlock = curr->held_locks + depth; | ||
| 2021 | |||
| 2022 | hlock->class = class; | ||
| 2023 | hlock->acquire_ip = ip; | ||
| 2024 | hlock->instance = lock; | ||
| 2025 | hlock->trylock = trylock; | ||
| 2026 | hlock->read = read; | ||
| 2027 | hlock->check = check; | ||
| 2028 | hlock->hardirqs_off = hardirqs_off; | ||
| 2029 | |||
| 2030 | if (check != 2) | ||
| 2031 | goto out_calc_hash; | ||
| 2032 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 2033 | /* | ||
| 2034 | * If non-trylock use in a hardirq or softirq context, then | ||
| 2035 | * mark the lock as used in these contexts: | ||
| 2036 | */ | ||
| 2037 | if (!trylock) { | ||
| 2038 | if (read) { | ||
| 2039 | if (curr->hardirq_context) | ||
| 2040 | if (!mark_lock(curr, hlock, | ||
| 2041 | LOCK_USED_IN_HARDIRQ_READ, ip)) | ||
| 2042 | return 0; | ||
| 2043 | if (curr->softirq_context) | ||
| 2044 | if (!mark_lock(curr, hlock, | ||
| 2045 | LOCK_USED_IN_SOFTIRQ_READ, ip)) | ||
| 2046 | return 0; | ||
| 2047 | } else { | ||
| 2048 | if (curr->hardirq_context) | ||
| 2049 | if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip)) | ||
| 2050 | return 0; | ||
| 2051 | if (curr->softirq_context) | ||
| 2052 | if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip)) | ||
| 2053 | return 0; | ||
| 2054 | } | ||
| 2055 | } | ||
| 2056 | if (!hardirqs_off) { | ||
| 2057 | if (read) { | ||
| 2058 | if (!mark_lock(curr, hlock, | ||
| 2059 | LOCK_ENABLED_HARDIRQS_READ, ip)) | ||
| 2060 | return 0; | ||
| 2061 | if (curr->softirqs_enabled) | ||
| 2062 | if (!mark_lock(curr, hlock, | ||
| 2063 | LOCK_ENABLED_SOFTIRQS_READ, ip)) | ||
| 2064 | return 0; | ||
| 2065 | } else { | ||
| 2066 | if (!mark_lock(curr, hlock, | ||
| 2067 | LOCK_ENABLED_HARDIRQS, ip)) | ||
| 2068 | return 0; | ||
| 2069 | if (curr->softirqs_enabled) | ||
| 2070 | if (!mark_lock(curr, hlock, | ||
| 2071 | LOCK_ENABLED_SOFTIRQS, ip)) | ||
| 2072 | return 0; | ||
| 2073 | } | ||
| 2074 | } | ||
| 2075 | #endif | ||
| 2076 | /* mark it as used: */ | ||
| 2077 | if (!mark_lock(curr, hlock, LOCK_USED, ip)) | ||
| 2078 | return 0; | ||
| 2079 | out_calc_hash: | ||
| 2080 | /* | ||
| 2081 | * Calculate the chain hash: it's the combined hash of all the | ||
| 2082 | * lock keys along the dependency chain. We save the hash value | ||
| 2083 | * at every step so that we can get the current hash easily | ||
| 2084 | * after unlock. The chain hash is then used to cache dependency | ||
| 2085 | * results. | ||
| 2086 | * | ||
| 2087 | * The 'key ID' is the most compact key value we can use to drive | ||
| 2088 | * the hash, not class->key. | ||
| 2089 | */ | ||
| 2090 | id = class - lock_classes; | ||
| 2091 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) | ||
| 2092 | return 0; | ||
| 2093 | |||
| 2094 | chain_key = curr->curr_chain_key; | ||
| 2095 | if (!depth) { | ||
| 2096 | if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) | ||
| 2097 | return 0; | ||
| 2098 | chain_head = 1; | ||
| 2099 | } | ||
| 2100 | |||
| 2101 | hlock->prev_chain_key = chain_key; | ||
| 2102 | |||
| 2103 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 2104 | /* | ||
| 2105 | * Keep track of points where we cross into an interrupt context: | ||
| 2106 | */ | ||
| 2107 | hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + | ||
| 2108 | curr->softirq_context; | ||
| 2109 | if (depth) { | ||
| 2110 | struct held_lock *prev_hlock; | ||
| 2111 | |||
| 2112 | prev_hlock = curr->held_locks + depth-1; | ||
| 2113 | /* | ||
| 2114 | * If we cross into another context, reset the | ||
| 2115 | * hash key (this also prevents the checking and the | ||
| 2116 | * adding of the dependency to 'prev'): | ||
| 2117 | */ | ||
| 2118 | if (prev_hlock->irq_context != hlock->irq_context) { | ||
| 2119 | chain_key = 0; | ||
| 2120 | chain_head = 1; | ||
| 2121 | } | ||
| 2122 | } | ||
| 2123 | #endif | ||
| 2124 | chain_key = iterate_chain_key(chain_key, id); | ||
| 2125 | curr->curr_chain_key = chain_key; | ||
| 2126 | |||
| 2127 | /* | ||
| 2128 | * Trylock needs to maintain the stack of held locks, but it | ||
| 2129 | * does not add new dependencies, because trylock can be done | ||
| 2130 | * in any order. | ||
| 2131 | * | ||
| 2132 | * We look up the chain_key and do the O(N^2) check and update of | ||
| 2133 | * the dependencies only if this is a new dependency chain. | ||
| 2134 | * (If lookup_chain_cache() returns with 1 it acquires | ||
| 2135 | * hash_lock for us) | ||
| 2136 | */ | ||
| 2137 | if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) { | ||
| 2138 | /* | ||
| 2139 | * Check whether last held lock: | ||
| 2140 | * | ||
| 2141 | * - is irq-safe, if this lock is irq-unsafe | ||
| 2142 | * - is softirq-safe, if this lock is hardirq-unsafe | ||
| 2143 | * | ||
| 2144 | * And check whether the new lock's dependency graph | ||
| 2145 | * could lead back to the previous lock. | ||
| 2146 | * | ||
| 2147 | * Any of these scenarios could lead to a deadlock, so | ||
| 2148 | * all of these validations must pass. | ||
| 2149 | */ | ||
| 2150 | int ret = check_deadlock(curr, hlock, lock, read); | ||
| 2151 | |||
| 2152 | if (!ret) | ||
| 2153 | return 0; | ||
| 2154 | /* | ||
| 2155 | * Mark recursive read, as we jump over it when | ||
| 2156 | * building dependencies (just like we jump over | ||
| 2157 | * trylock entries): | ||
| 2158 | */ | ||
| 2159 | if (ret == 2) | ||
| 2160 | hlock->read = 2; | ||
| 2161 | /* | ||
| 2162 | * Add dependency only if this lock is not the head | ||
| 2163 | * of the chain, and if it's not a secondary read-lock: | ||
| 2164 | */ | ||
| 2165 | if (!chain_head && ret != 2) | ||
| 2166 | if (!check_prevs_add(curr, hlock)) | ||
| 2167 | return 0; | ||
| 2168 | __raw_spin_unlock(&hash_lock); | ||
| 2169 | } | ||
| 2170 | curr->lockdep_depth++; | ||
| 2171 | check_chain_key(curr); | ||
| 2172 | if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { | ||
| 2173 | debug_locks_off(); | ||
| 2174 | printk("BUG: MAX_LOCK_DEPTH too low!\n"); | ||
| 2175 | printk("turning off the locking correctness validator.\n"); | ||
| 2176 | return 0; | ||
| 2177 | } | ||
| 2178 | if (unlikely(curr->lockdep_depth > max_lockdep_depth)) | ||
| 2179 | max_lockdep_depth = curr->lockdep_depth; | ||
| 2180 | |||
| 2181 | return 1; | ||
| 2182 | } | ||
| 2183 | |||
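out_calc_hash above folds the id of the newly taken class into curr->curr_chain_key via iterate_chain_key(), so every prefix of the held-lock stack gets its own 64-bit key that lookup_chain_cache() can use to skip already-validated chains. The sketch below shows only the general rolling-key idea; fold_key() is an arbitrary stand-in, not the kernel's mixing function:

#include <stdint.h>
#include <stdio.h>

/* Fold one class id into a running 64-bit key (illustrative mix only): */
static uint64_t fold_key(uint64_t key, unsigned int class_id)
{
        key ^= class_id;
        key = (key << 13) | (key >> 51);        /* rotate to spread the bits */
        return key * 0x9E3779B97F4A7C15ULL;     /* multiply by a large odd constant */
}

int main(void)
{
        unsigned int held[] = { 7, 42, 3 };     /* hypothetical class ids */
        uint64_t key = 0;

        /* Each depth yields a distinct, cacheable key for that lock stack: */
        for (unsigned int i = 0; i < 3; i++) {
                key = fold_key(key, held[i]);
                printf("depth %u: chain key %016llx\n", i + 1,
                       (unsigned long long)key);
        }
        return 0;
}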
| 2184 | static int | ||
| 2185 | print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | ||
| 2186 | unsigned long ip) | ||
| 2187 | { | ||
| 2188 | if (!debug_locks_off()) | ||
| 2189 | return 0; | ||
| 2190 | if (debug_locks_silent) | ||
| 2191 | return 0; | ||
| 2192 | |||
| 2193 | printk("\n=====================================\n"); | ||
| 2194 | printk( "[ BUG: bad unlock balance detected! ]\n"); | ||
| 2195 | printk( "-------------------------------------\n"); | ||
| 2196 | printk("%s/%d is trying to release lock (", | ||
| 2197 | curr->comm, curr->pid); | ||
| 2198 | print_lockdep_cache(lock); | ||
| 2199 | printk(") at:\n"); | ||
| 2200 | print_ip_sym(ip); | ||
| 2201 | printk("but there are no more locks to release!\n"); | ||
| 2202 | printk("\nother info that might help us debug this:\n"); | ||
| 2203 | lockdep_print_held_locks(curr); | ||
| 2204 | |||
| 2205 | printk("\nstack backtrace:\n"); | ||
| 2206 | dump_stack(); | ||
| 2207 | |||
| 2208 | return 0; | ||
| 2209 | } | ||
| 2210 | |||
| 2211 | /* | ||
| 2212 | * Common debugging checks for both nested and non-nested unlock: | ||
| 2213 | */ | ||
| 2214 | static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, | ||
| 2215 | unsigned long ip) | ||
| 2216 | { | ||
| 2217 | if (unlikely(!debug_locks)) | ||
| 2218 | return 0; | ||
| 2219 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
| 2220 | return 0; | ||
| 2221 | |||
| 2222 | if (curr->lockdep_depth <= 0) | ||
| 2223 | return print_unlock_inbalance_bug(curr, lock, ip); | ||
| 2224 | |||
| 2225 | return 1; | ||
| 2226 | } | ||
| 2227 | |||
| 2228 | /* | ||
| 2229 | * Remove the lock from the list of currently held locks in a | ||
| 2230 | * potentially non-nested (out of order) manner. This is a | ||
| 2231 | * relatively rare operation, as all the unlock APIs default | ||
| 2232 | * to nested mode (which uses lock_release()): | ||
| 2233 | */ | ||
| 2234 | static int | ||
| 2235 | lock_release_non_nested(struct task_struct *curr, | ||
| 2236 | struct lockdep_map *lock, unsigned long ip) | ||
| 2237 | { | ||
| 2238 | struct held_lock *hlock, *prev_hlock; | ||
| 2239 | unsigned int depth; | ||
| 2240 | int i; | ||
| 2241 | |||
| 2242 | /* | ||
| 2243 | * Check whether the lock exists in the current stack | ||
| 2244 | * of held locks: | ||
| 2245 | */ | ||
| 2246 | depth = curr->lockdep_depth; | ||
| 2247 | if (DEBUG_LOCKS_WARN_ON(!depth)) | ||
| 2248 | return 0; | ||
| 2249 | |||
| 2250 | prev_hlock = NULL; | ||
| 2251 | for (i = depth-1; i >= 0; i--) { | ||
| 2252 | hlock = curr->held_locks + i; | ||
| 2253 | /* | ||
| 2254 | * We must not cross into another context: | ||
| 2255 | */ | ||
| 2256 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
| 2257 | break; | ||
| 2258 | if (hlock->instance == lock) | ||
| 2259 | goto found_it; | ||
| 2260 | prev_hlock = hlock; | ||
| 2261 | } | ||
| 2262 | return print_unlock_inbalance_bug(curr, lock, ip); | ||
| 2263 | |||
| 2264 | found_it: | ||
| 2265 | /* | ||
| 2266 | * We have the right lock to unlock, 'hlock' points to it. | ||
| 2267 | * Now we remove it from the stack, and add back the other | ||
| 2268 | * entries (if any), recalculating the hash along the way: | ||
| 2269 | */ | ||
| 2270 | curr->lockdep_depth = i; | ||
| 2271 | curr->curr_chain_key = hlock->prev_chain_key; | ||
| 2272 | |||
| 2273 | for (i++; i < depth; i++) { | ||
| 2274 | hlock = curr->held_locks + i; | ||
| 2275 | if (!__lock_acquire(hlock->instance, | ||
| 2276 | hlock->class->subclass, hlock->trylock, | ||
| 2277 | hlock->read, hlock->check, hlock->hardirqs_off, | ||
| 2278 | hlock->acquire_ip)) | ||
| 2279 | return 0; | ||
| 2280 | } | ||
| 2281 | |||
| 2282 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) | ||
| 2283 | return 0; | ||
| 2284 | return 1; | ||
| 2285 | } | ||
| 2286 | |||
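lock_release_non_nested() above handles an out-of-order unlock by truncating the held-lock stack at the released entry and re-running every later entry through __lock_acquire(), which restores the depth and rebuilds the chain key prefix by prefix. A compact illustration of that remove-and-replay shape on a plain array (fold() is the same kind of illustrative stand-in as before):

#include <stdint.h>
#include <stdio.h>

static uint64_t fold(uint64_t key, unsigned int id)
{
        return (key ^ id) * 0x9E3779B97F4A7C15ULL;      /* illustrative mix */
}

int main(void)
{
        unsigned int held[8] = { 7, 42, 3, 19 };        /* hypothetical class ids */
        unsigned int depth = 4, victim = 1;             /* release held[1] out of order */
        uint64_t key = 0;

        /* Truncate at the victim, then shift the survivors down: */
        for (unsigned int i = victim; i + 1 < depth; i++)
                held[i] = held[i + 1];
        depth--;

        /* Replay the remaining stack to rebuild the chain key: */
        for (unsigned int i = 0; i < depth; i++)
                key = fold(key, held[i]);

        printf("new depth %u, chain key %016llx\n", depth,
               (unsigned long long)key);
        return 0;
}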
| 2287 | /* | ||
| 2288 | * Remove the lock from the list of currently held locks - this gets | ||
| 2289 | * called on mutex_unlock()/spin_unlock*() (or on a failed | ||
| 2290 | * mutex_lock_interruptible()). This is done for unlocks that nest | ||
| 2291 | * perfectly. (i.e. the current top of the lock-stack is unlocked) | ||
| 2292 | */ | ||
| 2293 | static int lock_release_nested(struct task_struct *curr, | ||
| 2294 | struct lockdep_map *lock, unsigned long ip) | ||
| 2295 | { | ||
| 2296 | struct held_lock *hlock; | ||
| 2297 | unsigned int depth; | ||
| 2298 | |||
| 2299 | /* | ||
| 2300 | * Pop off the top of the lock stack: | ||
| 2301 | */ | ||
| 2302 | depth = curr->lockdep_depth - 1; | ||
| 2303 | hlock = curr->held_locks + depth; | ||
| 2304 | |||
| 2305 | /* | ||
| 2306 | * Is the unlock non-nested: | ||
| 2307 | */ | ||
| 2308 | if (hlock->instance != lock) | ||
| 2309 | return lock_release_non_nested(curr, lock, ip); | ||
| 2310 | curr->lockdep_depth--; | ||
| 2311 | |||
| 2312 | if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) | ||
| 2313 | return 0; | ||
| 2314 | |||
| 2315 | curr->curr_chain_key = hlock->prev_chain_key; | ||
| 2316 | |||
| 2317 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 2318 | hlock->prev_chain_key = 0; | ||
| 2319 | hlock->class = NULL; | ||
| 2320 | hlock->acquire_ip = 0; | ||
| 2321 | hlock->irq_context = 0; | ||
| 2322 | #endif | ||
| 2323 | return 1; | ||
| 2324 | } | ||
| 2325 | |||
| 2326 | /* | ||
| 2327 | * Remove the lock from the list of currently held locks - this gets | ||
| 2328 | * called on mutex_unlock()/spin_unlock*() (or on a failed | ||
| 2329 | * mutex_lock_interruptible()). The 'nested' argument selects between | ||
| 2330 | * the perfectly nested fast path and the out-of-order slow path. | ||
| 2331 | */ | ||
| 2332 | static void | ||
| 2333 | __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | ||
| 2334 | { | ||
| 2335 | struct task_struct *curr = current; | ||
| 2336 | |||
| 2337 | if (!check_unlock(curr, lock, ip)) | ||
| 2338 | return; | ||
| 2339 | |||
| 2340 | if (nested) { | ||
| 2341 | if (!lock_release_nested(curr, lock, ip)) | ||
| 2342 | return; | ||
| 2343 | } else { | ||
| 2344 | if (!lock_release_non_nested(curr, lock, ip)) | ||
| 2345 | return; | ||
| 2346 | } | ||
| 2347 | |||
| 2348 | check_chain_key(curr); | ||
| 2349 | } | ||
| 2350 | |||
| 2351 | /* | ||
| 2352 | * Check whether we follow the irq-flags state precisely: | ||
| 2353 | */ | ||
| 2354 | static void check_flags(unsigned long flags) | ||
| 2355 | { | ||
| 2356 | #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) | ||
| 2357 | if (!debug_locks) | ||
| 2358 | return; | ||
| 2359 | |||
| 2360 | if (irqs_disabled_flags(flags)) | ||
| 2361 | DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled); | ||
| 2362 | else | ||
| 2363 | DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled); | ||
| 2364 | |||
| 2365 | /* | ||
| 2366 | * We don't accurately track softirq state in e.g. | ||
| 2367 | * hardirq contexts (such as on 4KSTACKS), so only | ||
| 2368 | * check if not in hardirq contexts: | ||
| 2369 | */ | ||
| 2370 | if (!hardirq_count()) { | ||
| 2371 | if (softirq_count()) | ||
| 2372 | DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); | ||
| 2373 | else | ||
| 2374 | DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); | ||
| 2375 | } | ||
| 2376 | |||
| 2377 | if (!debug_locks) | ||
| 2378 | print_irqtrace_events(current); | ||
| 2379 | #endif | ||
| 2380 | } | ||
| 2381 | |||
| 2382 | /* | ||
| 2383 | * We are not always called with irqs disabled - do that here, | ||
| 2384 | * and also avoid lockdep recursion: | ||
| 2385 | */ | ||
| 2386 | void lock_acquire(struct lockdep_map *lock, unsigned int subclass, | ||
| 2387 | int trylock, int read, int check, unsigned long ip) | ||
| 2388 | { | ||
| 2389 | unsigned long flags; | ||
| 2390 | |||
| 2391 | if (unlikely(current->lockdep_recursion)) | ||
| 2392 | return; | ||
| 2393 | |||
| 2394 | raw_local_irq_save(flags); | ||
| 2395 | check_flags(flags); | ||
| 2396 | |||
| 2397 | current->lockdep_recursion = 1; | ||
| 2398 | __lock_acquire(lock, subclass, trylock, read, check, | ||
| 2399 | irqs_disabled_flags(flags), ip); | ||
| 2400 | current->lockdep_recursion = 0; | ||
| 2401 | raw_local_irq_restore(flags); | ||
| 2402 | } | ||
| 2403 | |||
| 2404 | EXPORT_SYMBOL_GPL(lock_acquire); | ||
| 2405 | |||
| 2406 | void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | ||
| 2407 | { | ||
| 2408 | unsigned long flags; | ||
| 2409 | |||
| 2410 | if (unlikely(current->lockdep_recursion)) | ||
| 2411 | return; | ||
| 2412 | |||
| 2413 | raw_local_irq_save(flags); | ||
| 2414 | check_flags(flags); | ||
| 2415 | current->lockdep_recursion = 1; | ||
| 2416 | __lock_release(lock, nested, ip); | ||
| 2417 | current->lockdep_recursion = 0; | ||
| 2418 | raw_local_irq_restore(flags); | ||
| 2419 | } | ||
| 2420 | |||
| 2421 | EXPORT_SYMBOL_GPL(lock_release); | ||
| 2422 | |||
| 2423 | /* | ||
| 2424 | * Used by the testsuite, sanitize the validator state | ||
| 2425 | * after a simulated failure: | ||
| 2426 | */ | ||
| 2427 | |||
| 2428 | void lockdep_reset(void) | ||
| 2429 | { | ||
| 2430 | unsigned long flags; | ||
| 2431 | |||
| 2432 | raw_local_irq_save(flags); | ||
| 2433 | current->curr_chain_key = 0; | ||
| 2434 | current->lockdep_depth = 0; | ||
| 2435 | current->lockdep_recursion = 0; | ||
| 2436 | memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); | ||
| 2437 | nr_hardirq_chains = 0; | ||
| 2438 | nr_softirq_chains = 0; | ||
| 2439 | nr_process_chains = 0; | ||
| 2440 | debug_locks = 1; | ||
| 2441 | raw_local_irq_restore(flags); | ||
| 2442 | } | ||
| 2443 | |||
| 2444 | static void zap_class(struct lock_class *class) | ||
| 2445 | { | ||
| 2446 | int i; | ||
| 2447 | |||
| 2448 | /* | ||
| 2449 | * Remove all dependencies this lock is | ||
| 2450 | * involved in: | ||
| 2451 | */ | ||
| 2452 | for (i = 0; i < nr_list_entries; i++) { | ||
| 2453 | if (list_entries[i].class == class) | ||
| 2454 | list_del_rcu(&list_entries[i].entry); | ||
| 2455 | } | ||
| 2456 | /* | ||
| 2457 | * Unhash the class and remove it from the all_lock_classes list: | ||
| 2458 | */ | ||
| 2459 | list_del_rcu(&class->hash_entry); | ||
| 2460 | list_del_rcu(&class->lock_entry); | ||
| 2461 | |||
| 2462 | } | ||
| 2463 | |||
| 2464 | static inline int within(void *addr, void *start, unsigned long size) | ||
| 2465 | { | ||
| 2466 | return addr >= start && addr < start + size; | ||
| 2467 | } | ||
| 2468 | |||
| 2469 | void lockdep_free_key_range(void *start, unsigned long size) | ||
| 2470 | { | ||
| 2471 | struct lock_class *class, *next; | ||
| 2472 | struct list_head *head; | ||
| 2473 | unsigned long flags; | ||
| 2474 | int i; | ||
| 2475 | |||
| 2476 | raw_local_irq_save(flags); | ||
| 2477 | __raw_spin_lock(&hash_lock); | ||
| 2478 | |||
| 2479 | /* | ||
| 2480 | * Unhash all classes that were created by this module: | ||
| 2481 | */ | ||
| 2482 | for (i = 0; i < CLASSHASH_SIZE; i++) { | ||
| 2483 | head = classhash_table + i; | ||
| 2484 | if (list_empty(head)) | ||
| 2485 | continue; | ||
| 2486 | list_for_each_entry_safe(class, next, head, hash_entry) | ||
| 2487 | if (within(class->key, start, size)) | ||
| 2488 | zap_class(class); | ||
| 2489 | } | ||
| 2490 | |||
| 2491 | __raw_spin_unlock(&hash_lock); | ||
| 2492 | raw_local_irq_restore(flags); | ||
| 2493 | } | ||
| 2494 | |||
| 2495 | void lockdep_reset_lock(struct lockdep_map *lock) | ||
| 2496 | { | ||
| 2497 | struct lock_class *class, *next; | ||
| 2498 | struct list_head *head; | ||
| 2499 | unsigned long flags; | ||
| 2500 | int i, j; | ||
| 2501 | |||
| 2502 | raw_local_irq_save(flags); | ||
| 2503 | |||
| 2504 | /* | ||
| 2505 | * Remove all classes this lock might have: | ||
| 2506 | */ | ||
| 2507 | for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { | ||
| 2508 | /* | ||
| 2509 | * If the class exists we look it up and zap it: | ||
| 2510 | */ | ||
| 2511 | class = look_up_lock_class(lock, j); | ||
| 2512 | if (class) | ||
| 2513 | zap_class(class); | ||
| 2514 | } | ||
| 2515 | /* | ||
| 2516 | * Debug check: in the end all mapped classes should | ||
| 2517 | * be gone. | ||
| 2518 | */ | ||
| 2519 | __raw_spin_lock(&hash_lock); | ||
| 2520 | for (i = 0; i < CLASSHASH_SIZE; i++) { | ||
| 2521 | head = classhash_table + i; | ||
| 2522 | if (list_empty(head)) | ||
| 2523 | continue; | ||
| 2524 | list_for_each_entry_safe(class, next, head, hash_entry) { | ||
| 2525 | if (unlikely(class == lock->class_cache)) { | ||
| 2526 | __raw_spin_unlock(&hash_lock); | ||
| 2527 | DEBUG_LOCKS_WARN_ON(1); | ||
| 2528 | goto out_restore; | ||
| 2529 | } | ||
| 2530 | } | ||
| 2531 | } | ||
| 2532 | __raw_spin_unlock(&hash_lock); | ||
| 2533 | |||
| 2534 | out_restore: | ||
| 2535 | raw_local_irq_restore(flags); | ||
| 2536 | } | ||
| 2537 | |||
| 2538 | void __init lockdep_init(void) | ||
| 2539 | { | ||
| 2540 | int i; | ||
| 2541 | |||
| 2542 | /* | ||
| 2543 | * Some architectures have their own start_kernel() | ||
| 2544 | * code which calls lockdep_init(), while we also | ||
| 2545 | * call lockdep_init() from the start_kernel() itself, | ||
| 2546 | * and we want to initialize the hashes only once: | ||
| 2547 | */ | ||
| 2548 | if (lockdep_initialized) | ||
| 2549 | return; | ||
| 2550 | |||
| 2551 | for (i = 0; i < CLASSHASH_SIZE; i++) | ||
| 2552 | INIT_LIST_HEAD(classhash_table + i); | ||
| 2553 | |||
| 2554 | for (i = 0; i < CHAINHASH_SIZE; i++) | ||
| 2555 | INIT_LIST_HEAD(chainhash_table + i); | ||
| 2556 | |||
| 2557 | lockdep_initialized = 1; | ||
| 2558 | } | ||
| 2559 | |||
| 2560 | void __init lockdep_info(void) | ||
| 2561 | { | ||
| 2562 | printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); | ||
| 2563 | |||
| 2564 | printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); | ||
| 2565 | printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); | ||
| 2566 | printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); | ||
| 2567 | printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); | ||
| 2568 | printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); | ||
| 2569 | printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); | ||
| 2570 | printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); | ||
| 2571 | |||
| 2572 | printk(" memory used by lock dependency info: %lu kB\n", | ||
| 2573 | (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + | ||
| 2574 | sizeof(struct list_head) * CLASSHASH_SIZE + | ||
| 2575 | sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + | ||
| 2576 | sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + | ||
| 2577 | sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); | ||
| 2578 | |||
| 2579 | printk(" per task-struct memory footprint: %lu bytes\n", | ||
| 2580 | sizeof(struct held_lock) * MAX_LOCK_DEPTH); | ||
| 2581 | |||
| 2582 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 2583 | if (lockdep_init_error) | ||
| 2584 | printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); | ||
| 2585 | #endif | ||
| 2586 | } | ||
| 2587 | |||
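The "memory used" figure printed above is just the statically sized tables summed and converted to kB. With stand-in structure and hash-bucket sizes (the real values depend on the architecture and config), the same arithmetic looks like this:

#include <stdio.h>

int main(void)
{
        /* Table lengths mirror the constants in lockdep_internals.h; the
         * structure and hash-bucket sizes below are made-up stand-ins: */
        unsigned long lock_class_sz = 128, list_head_sz = 16;
        unsigned long lock_list_sz = 32, lock_chain_sz = 24;
        unsigned long keys = 2048, entries = 8192, chains = 8192;
        unsigned long classhash = 1024, chainhash = 4096;

        unsigned long bytes = lock_class_sz * keys +
                              list_head_sz * classhash +
                              lock_list_sz * entries +
                              lock_chain_sz * chains +
                              list_head_sz * chainhash;

        printf("memory used by lock dependency info: %lu kB\n", bytes / 1024);
        return 0;       /* prints 784 kB with these made-up sizes */
}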
| 2588 | static inline int in_range(const void *start, const void *addr, const void *end) | ||
| 2589 | { | ||
| 2590 | return addr >= start && addr <= end; | ||
| 2591 | } | ||
| 2592 | |||
| 2593 | static void | ||
| 2594 | print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | ||
| 2595 | const void *mem_to, struct held_lock *hlock) | ||
| 2596 | { | ||
| 2597 | if (!debug_locks_off()) | ||
| 2598 | return; | ||
| 2599 | if (debug_locks_silent) | ||
| 2600 | return; | ||
| 2601 | |||
| 2602 | printk("\n=========================\n"); | ||
| 2603 | printk( "[ BUG: held lock freed! ]\n"); | ||
| 2604 | printk( "-------------------------\n"); | ||
| 2605 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | ||
| 2606 | curr->comm, curr->pid, mem_from, mem_to-1); | ||
| 2607 | print_lock(hlock); | ||
| 2608 | lockdep_print_held_locks(curr); | ||
| 2609 | |||
| 2610 | printk("\nstack backtrace:\n"); | ||
| 2611 | dump_stack(); | ||
| 2612 | } | ||
| 2613 | |||
| 2614 | /* | ||
| 2615 | * Called when kernel memory is freed (or unmapped), or if a lock | ||
| 2616 | * is destroyed or reinitialized - this code checks whether there is | ||
| 2617 | * any held lock in the memory range of <from> to <to>: | ||
| 2618 | */ | ||
| 2619 | void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) | ||
| 2620 | { | ||
| 2621 | const void *mem_to = mem_from + mem_len, *lock_from, *lock_to; | ||
| 2622 | struct task_struct *curr = current; | ||
| 2623 | struct held_lock *hlock; | ||
| 2624 | unsigned long flags; | ||
| 2625 | int i; | ||
| 2626 | |||
| 2627 | if (unlikely(!debug_locks)) | ||
| 2628 | return; | ||
| 2629 | |||
| 2630 | local_irq_save(flags); | ||
| 2631 | for (i = 0; i < curr->lockdep_depth; i++) { | ||
| 2632 | hlock = curr->held_locks + i; | ||
| 2633 | |||
| 2634 | lock_from = (void *)hlock->instance; | ||
| 2635 | lock_to = (void *)(hlock->instance + 1); | ||
| 2636 | |||
| 2637 | if (!in_range(mem_from, lock_from, mem_to) && | ||
| 2638 | !in_range(mem_from, lock_to, mem_to)) | ||
| 2639 | continue; | ||
| 2640 | |||
| 2641 | print_freed_lock_bug(curr, mem_from, mem_to, hlock); | ||
| 2642 | break; | ||
| 2643 | } | ||
| 2644 | local_irq_restore(flags); | ||
| 2645 | } | ||
| 2646 | |||
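debug_check_no_locks_freed() above reports a bug when either boundary of a held lockdep_map lands inside the range being freed. A small standalone version of that containment test (the buffer and the offsets are arbitrary examples):

#include <stdio.h>

static int in_range(const void *start, const void *addr, const void *end)
{
        return addr >= start && addr <= end;
}

int main(void)
{
        char heap[64];
        const void *mem_from = heap, *mem_to = heap + 32;
        const void *lock_from = heap + 16, *lock_to = heap + 24;

        /* Either end of the lock object inside the freed range means a
         * (possibly held) lock is being freed out from under its users: */
        int overlaps = in_range(mem_from, lock_from, mem_to) ||
                       in_range(mem_from, lock_to, mem_to);

        printf("freed range covers a lock object: %d\n", overlaps);     /* 1 */
        return 0;
}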
| 2647 | static void print_held_locks_bug(struct task_struct *curr) | ||
| 2648 | { | ||
| 2649 | if (!debug_locks_off()) | ||
| 2650 | return; | ||
| 2651 | if (debug_locks_silent) | ||
| 2652 | return; | ||
| 2653 | |||
| 2654 | printk("\n=====================================\n"); | ||
| 2655 | printk( "[ BUG: lock held at task exit time! ]\n"); | ||
| 2656 | printk( "-------------------------------------\n"); | ||
| 2657 | printk("%s/%d is exiting with locks still held!\n", | ||
| 2658 | curr->comm, curr->pid); | ||
| 2659 | lockdep_print_held_locks(curr); | ||
| 2660 | |||
| 2661 | printk("\nstack backtrace:\n"); | ||
| 2662 | dump_stack(); | ||
| 2663 | } | ||
| 2664 | |||
| 2665 | void debug_check_no_locks_held(struct task_struct *task) | ||
| 2666 | { | ||
| 2667 | if (unlikely(task->lockdep_depth > 0)) | ||
| 2668 | print_held_locks_bug(task); | ||
| 2669 | } | ||
| 2670 | |||
| 2671 | void debug_show_all_locks(void) | ||
| 2672 | { | ||
| 2673 | struct task_struct *g, *p; | ||
| 2674 | int count = 10; | ||
| 2675 | int unlock = 1; | ||
| 2676 | |||
| 2677 | printk("\nShowing all locks held in the system:\n"); | ||
| 2678 | |||
| 2679 | /* | ||
| 2680 | * Here we try to get the tasklist_lock as hard as possible, | ||
| 2681 | * if not successful after 2 seconds we ignore it (but keep | ||
| 2682 | * trying). This is to enable a debug printout even if a | ||
| 2683 | * tasklist_lock-holding task deadlocks or crashes. | ||
| 2684 | */ | ||
| 2685 | retry: | ||
| 2686 | if (!read_trylock(&tasklist_lock)) { | ||
| 2687 | if (count == 10) | ||
| 2688 | printk("hm, tasklist_lock locked, retrying... "); | ||
| 2689 | if (count) { | ||
| 2690 | count--; | ||
| 2691 | printk(" #%d", 10-count); | ||
| 2692 | mdelay(200); | ||
| 2693 | goto retry; | ||
| 2694 | } | ||
| 2695 | printk(" ignoring it.\n"); | ||
| 2696 | unlock = 0; | ||
| 2697 | } | ||
| 2698 | if (count != 10) | ||
| 2699 | printk(" locked it.\n"); | ||
| 2700 | |||
| 2701 | do_each_thread(g, p) { | ||
| 2702 | if (p->lockdep_depth) | ||
| 2703 | lockdep_print_held_locks(p); | ||
| 2704 | if (!unlock) | ||
| 2705 | if (read_trylock(&tasklist_lock)) | ||
| 2706 | unlock = 1; | ||
| 2707 | } while_each_thread(g, p); | ||
| 2708 | |||
| 2709 | printk("\n"); | ||
| 2710 | printk("=============================================\n\n"); | ||
| 2711 | |||
| 2712 | if (unlock) | ||
| 2713 | read_unlock(&tasklist_lock); | ||
| 2714 | } | ||
| 2715 | |||
| 2716 | EXPORT_SYMBOL_GPL(debug_show_all_locks); | ||
| 2717 | |||
| 2718 | void debug_show_held_locks(struct task_struct *task) | ||
| 2719 | { | ||
| 2720 | lockdep_print_held_locks(task); | ||
| 2721 | } | ||
| 2722 | |||
| 2723 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | ||
| 2724 | |||
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h new file mode 100644 index 000000000000..eab043c83bb2 --- /dev/null +++ b/kernel/lockdep_internals.h | |||
| @@ -0,0 +1,78 @@ | |||
| 1 | /* | ||
| 2 | * kernel/lockdep_internals.h | ||
| 3 | * | ||
| 4 | * Runtime locking correctness validator | ||
| 5 | * | ||
| 6 | * lockdep subsystem internal functions and variables. | ||
| 7 | */ | ||
| 8 | |||
| 9 | /* | ||
| 10 | * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies | ||
| 11 | * we track. | ||
| 12 | * | ||
| 13 | * We use the per-lock dependency maps in two ways: we grow them by adding | ||
| 14 | * every to-be-taken lock to all currently held locks' own dependency | ||
| 15 | * tables (if it's not there yet), and we check them for lock order | ||
| 16 | * conflicts and deadlocks. | ||
| 17 | */ | ||
| 18 | #define MAX_LOCKDEP_ENTRIES 8192UL | ||
| 19 | |||
| 20 | #define MAX_LOCKDEP_KEYS_BITS 11 | ||
| 21 | #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) | ||
| 22 | |||
| 23 | #define MAX_LOCKDEP_CHAINS_BITS 13 | ||
| 24 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) | ||
| 25 | |||
| 26 | /* | ||
| 27 | * Stack-trace: tightly packed array of stack backtrace | ||
| 28 | * addresses. Protected by the hash_lock. | ||
| 29 | */ | ||
| 30 | #define MAX_STACK_TRACE_ENTRIES 262144UL | ||
| 31 | |||
| 32 | extern struct list_head all_lock_classes; | ||
| 33 | |||
| 34 | extern void | ||
| 35 | get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); | ||
| 36 | |||
| 37 | extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); | ||
| 38 | |||
| 39 | extern unsigned long nr_lock_classes; | ||
| 40 | extern unsigned long nr_list_entries; | ||
| 41 | extern unsigned long nr_lock_chains; | ||
| 42 | extern unsigned long nr_stack_trace_entries; | ||
| 43 | |||
| 44 | extern unsigned int nr_hardirq_chains; | ||
| 45 | extern unsigned int nr_softirq_chains; | ||
| 46 | extern unsigned int nr_process_chains; | ||
| 47 | extern unsigned int max_lockdep_depth; | ||
| 48 | extern unsigned int max_recursion_depth; | ||
| 49 | |||
| 50 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 51 | /* | ||
| 52 | * Various lockdep statistics: | ||
| 53 | */ | ||
| 54 | extern atomic_t chain_lookup_hits; | ||
| 55 | extern atomic_t chain_lookup_misses; | ||
| 56 | extern atomic_t hardirqs_on_events; | ||
| 57 | extern atomic_t hardirqs_off_events; | ||
| 58 | extern atomic_t redundant_hardirqs_on; | ||
| 59 | extern atomic_t redundant_hardirqs_off; | ||
| 60 | extern atomic_t softirqs_on_events; | ||
| 61 | extern atomic_t softirqs_off_events; | ||
| 62 | extern atomic_t redundant_softirqs_on; | ||
| 63 | extern atomic_t redundant_softirqs_off; | ||
| 64 | extern atomic_t nr_unused_locks; | ||
| 65 | extern atomic_t nr_cyclic_checks; | ||
| 66 | extern atomic_t nr_cyclic_check_recursions; | ||
| 67 | extern atomic_t nr_find_usage_forwards_checks; | ||
| 68 | extern atomic_t nr_find_usage_forwards_recursions; | ||
| 69 | extern atomic_t nr_find_usage_backwards_checks; | ||
| 70 | extern atomic_t nr_find_usage_backwards_recursions; | ||
| 71 | # define debug_atomic_inc(ptr) atomic_inc(ptr) | ||
| 72 | # define debug_atomic_dec(ptr) atomic_dec(ptr) | ||
| 73 | # define debug_atomic_read(ptr) atomic_read(ptr) | ||
| 74 | #else | ||
| 75 | # define debug_atomic_inc(ptr) do { } while (0) | ||
| 76 | # define debug_atomic_dec(ptr) do { } while (0) | ||
| 77 | # define debug_atomic_read(ptr) 0 | ||
| 78 | #endif | ||
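The debug_atomic_*() helpers declared above expand to real atomic operations only under CONFIG_DEBUG_LOCKDEP; otherwise they compile away, so the statistics cost nothing in production builds. The same compile-out pattern in a self-contained form (DEBUG_STATS is a switch invented just for this illustration):

#include <stdio.h>

/* #define DEBUG_STATS 1 */     /* uncomment to compile the counters in */

#ifdef DEBUG_STATS
static int nr_events;
# define stat_inc()     (nr_events++)
# define stat_read()    (nr_events)
#else
# define stat_inc()     do { } while (0)
# define stat_read()    0
#endif

int main(void)
{
        stat_inc();             /* a no-op unless DEBUG_STATS is defined */
        printf("events: %d\n", stat_read());
        return 0;
}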
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c new file mode 100644 index 000000000000..f6e72eaab3fa --- /dev/null +++ b/kernel/lockdep_proc.c | |||
| @@ -0,0 +1,345 @@ | |||
| 1 | /* | ||
| 2 | * kernel/lockdep_proc.c | ||
| 3 | * | ||
| 4 | * Runtime locking correctness validator | ||
| 5 | * | ||
| 6 | * Started by Ingo Molnar: | ||
| 7 | * | ||
| 8 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 9 | * | ||
| 10 | * Code for /proc/lockdep and /proc/lockdep_stats: | ||
| 11 | * | ||
| 12 | */ | ||
| 13 | #include <linux/sched.h> | ||
| 14 | #include <linux/module.h> | ||
| 15 | #include <linux/proc_fs.h> | ||
| 16 | #include <linux/seq_file.h> | ||
| 17 | #include <linux/kallsyms.h> | ||
| 18 | #include <linux/debug_locks.h> | ||
| 19 | |||
| 20 | #include "lockdep_internals.h" | ||
| 21 | |||
| 22 | static void *l_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 23 | { | ||
| 24 | struct lock_class *class = v; | ||
| 25 | |||
| 26 | (*pos)++; | ||
| 27 | |||
| 28 | if (class->lock_entry.next != &all_lock_classes) | ||
| 29 | class = list_entry(class->lock_entry.next, struct lock_class, | ||
| 30 | lock_entry); | ||
| 31 | else | ||
| 32 | class = NULL; | ||
| 33 | m->private = class; | ||
| 34 | |||
| 35 | return class; | ||
| 36 | } | ||
| 37 | |||
| 38 | static void *l_start(struct seq_file *m, loff_t *pos) | ||
| 39 | { | ||
| 40 | struct lock_class *class = m->private; | ||
| 41 | |||
| 42 | if (&class->lock_entry == all_lock_classes.next) | ||
| 43 | seq_printf(m, "all lock classes:\n"); | ||
| 44 | |||
| 45 | return class; | ||
| 46 | } | ||
| 47 | |||
| 48 | static void l_stop(struct seq_file *m, void *v) | ||
| 49 | { | ||
| 50 | } | ||
| 51 | |||
| 52 | static unsigned long count_forward_deps(struct lock_class *class) | ||
| 53 | { | ||
| 54 | struct lock_list *entry; | ||
| 55 | unsigned long ret = 1; | ||
| 56 | |||
| 57 | /* | ||
| 58 | * Recurse this class's dependency list: | ||
| 59 | */ | ||
| 60 | list_for_each_entry(entry, &class->locks_after, entry) | ||
| 61 | ret += count_forward_deps(entry->class); | ||
| 62 | |||
| 63 | return ret; | ||
| 64 | } | ||
| 65 | |||
| 66 | static unsigned long count_backward_deps(struct lock_class *class) | ||
| 67 | { | ||
| 68 | struct lock_list *entry; | ||
| 69 | unsigned long ret = 1; | ||
| 70 | |||
| 71 | /* | ||
| 72 | * Recurse this class's dependency list: | ||
| 73 | */ | ||
| 74 | list_for_each_entry(entry, &class->locks_before, entry) | ||
| 75 | ret += count_backward_deps(entry->class); | ||
| 76 | |||
| 77 | return ret; | ||
| 78 | } | ||
| 79 | |||
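count_forward_deps() above returns one for the class itself plus, recursively, the count of every class on its locks_after list, so a class reachable along several paths contributes once per path to the FD/BD columns printed by l_show() below. A toy version of the same recursion over a hard-coded adjacency list (the graph is invented for illustration):

#include <stdio.h>

/* Toy dependency graph: deps[i] lists the classes taken after class i,
 * terminated by -1. */
static const int deps[4][3] = {
        { 1, 2, -1 },   /* class 0 -> 1, 2 */
        { 3, -1, -1 },  /* class 1 -> 3 */
        { 3, -1, -1 },  /* class 2 -> 3 */
        { -1, -1, -1 }, /* class 3 has no forward deps */
};

static unsigned long count_forward(int class)
{
        unsigned long ret = 1;  /* the class itself */

        for (int i = 0; deps[class][i] != -1; i++)
                ret += count_forward(deps[class][i]);
        return ret;
}

int main(void)
{
        /* Class 3 is reached via both 1 and 2, so it is counted twice: */
        printf("forward deps of class 0: %lu\n", count_forward(0));     /* 5 */
        return 0;
}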
| 80 | static int l_show(struct seq_file *m, void *v) | ||
| 81 | { | ||
| 82 | unsigned long nr_forward_deps, nr_backward_deps; | ||
| 83 | struct lock_class *class = m->private; | ||
| 84 | char str[128], c1, c2, c3, c4; | ||
| 85 | const char *name; | ||
| 86 | |||
| 87 | seq_printf(m, "%p", class->key); | ||
| 88 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 89 | seq_printf(m, " OPS:%8ld", class->ops); | ||
| 90 | #endif | ||
| 91 | nr_forward_deps = count_forward_deps(class); | ||
| 92 | seq_printf(m, " FD:%5ld", nr_forward_deps); | ||
| 93 | |||
| 94 | nr_backward_deps = count_backward_deps(class); | ||
| 95 | seq_printf(m, " BD:%5ld", nr_backward_deps); | ||
| 96 | |||
| 97 | get_usage_chars(class, &c1, &c2, &c3, &c4); | ||
| 98 | seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); | ||
| 99 | |||
| 100 | name = class->name; | ||
| 101 | if (!name) { | ||
| 102 | name = __get_key_name(class->key, str); | ||
| 103 | seq_printf(m, ": %s", name); | ||
| 104 | } else { | ||
| 105 | seq_printf(m, ": %s", name); | ||
| 106 | if (class->name_version > 1) | ||
| 107 | seq_printf(m, "#%d", class->name_version); | ||
| 108 | if (class->subclass) | ||
| 109 | seq_printf(m, "/%d", class->subclass); | ||
| 110 | } | ||
| 111 | seq_puts(m, "\n"); | ||
| 112 | |||
| 113 | return 0; | ||
| 114 | } | ||
| 115 | |||
| 116 | static struct seq_operations lockdep_ops = { | ||
| 117 | .start = l_start, | ||
| 118 | .next = l_next, | ||
| 119 | .stop = l_stop, | ||
| 120 | .show = l_show, | ||
| 121 | }; | ||
| 122 | |||
| 123 | static int lockdep_open(struct inode *inode, struct file *file) | ||
| 124 | { | ||
| 125 | int res = seq_open(file, &lockdep_ops); | ||
| 126 | if (!res) { | ||
| 127 | struct seq_file *m = file->private_data; | ||
| 128 | |||
| 129 | if (!list_empty(&all_lock_classes)) | ||
| 130 | m->private = list_entry(all_lock_classes.next, | ||
| 131 | struct lock_class, lock_entry); | ||
| 132 | else | ||
| 133 | m->private = NULL; | ||
| 134 | } | ||
| 135 | return res; | ||
| 136 | } | ||
| 137 | |||
| 138 | static struct file_operations proc_lockdep_operations = { | ||
| 139 | .open = lockdep_open, | ||
| 140 | .read = seq_read, | ||
| 141 | .llseek = seq_lseek, | ||
| 142 | .release = seq_release, | ||
| 143 | }; | ||
| 144 | |||
| 145 | static void lockdep_stats_debug_show(struct seq_file *m) | ||
| 146 | { | ||
| 147 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 148 | unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), | ||
| 149 | hi2 = debug_atomic_read(&hardirqs_off_events), | ||
| 150 | hr1 = debug_atomic_read(&redundant_hardirqs_on), | ||
| 151 | hr2 = debug_atomic_read(&redundant_hardirqs_off), | ||
| 152 | si1 = debug_atomic_read(&softirqs_on_events), | ||
| 153 | si2 = debug_atomic_read(&softirqs_off_events), | ||
| 154 | sr1 = debug_atomic_read(&redundant_softirqs_on), | ||
| 155 | sr2 = debug_atomic_read(&redundant_softirqs_off); | ||
| 156 | |||
| 157 | seq_printf(m, " chain lookup misses: %11u\n", | ||
| 158 | debug_atomic_read(&chain_lookup_misses)); | ||
| 159 | seq_printf(m, " chain lookup hits: %11u\n", | ||
| 160 | debug_atomic_read(&chain_lookup_hits)); | ||
| 161 | seq_printf(m, " cyclic checks: %11u\n", | ||
| 162 | debug_atomic_read(&nr_cyclic_checks)); | ||
| 163 | seq_printf(m, " cyclic-check recursions: %11u\n", | ||
| 164 | debug_atomic_read(&nr_cyclic_check_recursions)); | ||
| 165 | seq_printf(m, " find-mask forwards checks: %11u\n", | ||
| 166 | debug_atomic_read(&nr_find_usage_forwards_checks)); | ||
| 167 | seq_printf(m, " find-mask forwards recursions: %11u\n", | ||
| 168 | debug_atomic_read(&nr_find_usage_forwards_recursions)); | ||
| 169 | seq_printf(m, " find-mask backwards checks: %11u\n", | ||
| 170 | debug_atomic_read(&nr_find_usage_backwards_checks)); | ||
| 171 | seq_printf(m, " find-mask backwards recursions:%11u\n", | ||
| 172 | debug_atomic_read(&nr_find_usage_backwards_recursions)); | ||
| 173 | |||
| 174 | seq_printf(m, " hardirq on events: %11u\n", hi1); | ||
| 175 | seq_printf(m, " hardirq off events: %11u\n", hi2); | ||
| 176 | seq_printf(m, " redundant hardirq ons: %11u\n", hr1); | ||
| 177 | seq_printf(m, " redundant hardirq offs: %11u\n", hr2); | ||
| 178 | seq_printf(m, " softirq on events: %11u\n", si1); | ||
| 179 | seq_printf(m, " softirq off events: %11u\n", si2); | ||
| 180 | seq_printf(m, " redundant softirq ons: %11u\n", sr1); | ||
| 181 | seq_printf(m, " redundant softirq offs: %11u\n", sr2); | ||
| 182 | #endif | ||
| 183 | } | ||
| 184 | |||
| 185 | static int lockdep_stats_show(struct seq_file *m, void *v) | ||
| 186 | { | ||
| 187 | struct lock_class *class; | ||
| 188 | unsigned long nr_unused = 0, nr_uncategorized = 0, | ||
| 189 | nr_irq_safe = 0, nr_irq_unsafe = 0, | ||
| 190 | nr_softirq_safe = 0, nr_softirq_unsafe = 0, | ||
| 191 | nr_hardirq_safe = 0, nr_hardirq_unsafe = 0, | ||
| 192 | nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, | ||
| 193 | nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, | ||
| 194 | nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, | ||
| 195 | sum_forward_deps = 0, factor = 0; | ||
| 196 | |||
| 197 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | ||
| 198 | |||
| 199 | if (class->usage_mask == 0) | ||
| 200 | nr_unused++; | ||
| 201 | if (class->usage_mask == LOCKF_USED) | ||
| 202 | nr_uncategorized++; | ||
| 203 | if (class->usage_mask & LOCKF_USED_IN_IRQ) | ||
| 204 | nr_irq_safe++; | ||
| 205 | if (class->usage_mask & LOCKF_ENABLED_IRQS) | ||
| 206 | nr_irq_unsafe++; | ||
| 207 | if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) | ||
| 208 | nr_softirq_safe++; | ||
| 209 | if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) | ||
| 210 | nr_softirq_unsafe++; | ||
| 211 | if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) | ||
| 212 | nr_hardirq_safe++; | ||
| 213 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) | ||
| 214 | nr_hardirq_unsafe++; | ||
| 215 | if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) | ||
| 216 | nr_irq_read_safe++; | ||
| 217 | if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) | ||
| 218 | nr_irq_read_unsafe++; | ||
| 219 | if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) | ||
| 220 | nr_softirq_read_safe++; | ||
| 221 | if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) | ||
| 222 | nr_softirq_read_unsafe++; | ||
| 223 | if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) | ||
| 224 | nr_hardirq_read_safe++; | ||
| 225 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) | ||
| 226 | nr_hardirq_read_unsafe++; | ||
| 227 | |||
| 228 | sum_forward_deps += count_forward_deps(class); | ||
| 229 | } | ||
| 230 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
| 231 | DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); | ||
| 232 | #endif | ||
| 233 | seq_printf(m, " lock-classes: %11lu [max: %lu]\n", | ||
| 234 | nr_lock_classes, MAX_LOCKDEP_KEYS); | ||
| 235 | seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", | ||
| 236 | nr_list_entries, MAX_LOCKDEP_ENTRIES); | ||
| 237 | seq_printf(m, " indirect dependencies: %11lu\n", | ||
| 238 | sum_forward_deps); | ||
| 239 | |||
| 240 | /* | ||
| 241 | * Total number of dependencies: | ||
| 242 | * | ||
| 243 | * All irq-safe locks may nest inside irq-unsafe locks, | ||
| 244 | * plus all the other known dependencies: | ||
| 245 | */ | ||
| 246 | seq_printf(m, " all direct dependencies: %11lu\n", | ||
| 247 | nr_irq_unsafe * nr_irq_safe + | ||
| 248 | nr_hardirq_unsafe * nr_hardirq_safe + | ||
| 249 | nr_list_entries); | ||
| 250 | |||
| 251 | /* | ||
| 252 | * Estimated factor between direct and indirect | ||
| 253 | * dependencies: | ||
| 254 | */ | ||
| 255 | if (nr_list_entries) | ||
| 256 | factor = sum_forward_deps / nr_list_entries; | ||
| 257 | |||
| 258 | seq_printf(m, " dependency chains: %11lu [max: %lu]\n", | ||
| 259 | nr_lock_chains, MAX_LOCKDEP_CHAINS); | ||
| 260 | |||
| 261 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 262 | seq_printf(m, " in-hardirq chains: %11u\n", | ||
| 263 | nr_hardirq_chains); | ||
| 264 | seq_printf(m, " in-softirq chains: %11u\n", | ||
| 265 | nr_softirq_chains); | ||
| 266 | #endif | ||
| 267 | seq_printf(m, " in-process chains: %11u\n", | ||
| 268 | nr_process_chains); | ||
| 269 | seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", | ||
| 270 | nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); | ||
| 271 | seq_printf(m, " combined max dependencies: %11u\n", | ||
| 272 | (nr_hardirq_chains + 1) * | ||
| 273 | (nr_softirq_chains + 1) * | ||
| 274 | (nr_process_chains + 1) | ||
| 275 | ); | ||
| 276 | seq_printf(m, " hardirq-safe locks: %11lu\n", | ||
| 277 | nr_hardirq_safe); | ||
| 278 | seq_printf(m, " hardirq-unsafe locks: %11lu\n", | ||
| 279 | nr_hardirq_unsafe); | ||
| 280 | seq_printf(m, " softirq-safe locks: %11lu\n", | ||
| 281 | nr_softirq_safe); | ||
| 282 | seq_printf(m, " softirq-unsafe locks: %11lu\n", | ||
| 283 | nr_softirq_unsafe); | ||
| 284 | seq_printf(m, " irq-safe locks: %11lu\n", | ||
| 285 | nr_irq_safe); | ||
| 286 | seq_printf(m, " irq-unsafe locks: %11lu\n", | ||
| 287 | nr_irq_unsafe); | ||
| 288 | |||
| 289 | seq_printf(m, " hardirq-read-safe locks: %11lu\n", | ||
| 290 | nr_hardirq_read_safe); | ||
| 291 | seq_printf(m, " hardirq-read-unsafe locks: %11lu\n", | ||
| 292 | nr_hardirq_read_unsafe); | ||
| 293 | seq_printf(m, " softirq-read-safe locks: %11lu\n", | ||
| 294 | nr_softirq_read_safe); | ||
| 295 | seq_printf(m, " softirq-read-unsafe locks: %11lu\n", | ||
| 296 | nr_softirq_read_unsafe); | ||
| 297 | seq_printf(m, " irq-read-safe locks: %11lu\n", | ||
| 298 | nr_irq_read_safe); | ||
| 299 | seq_printf(m, " irq-read-unsafe locks: %11lu\n", | ||
| 300 | nr_irq_read_unsafe); | ||
| 301 | |||
| 302 | seq_printf(m, " uncategorized locks: %11lu\n", | ||
| 303 | nr_uncategorized); | ||
| 304 | seq_printf(m, " unused locks: %11lu\n", | ||
| 305 | nr_unused); | ||
| 306 | seq_printf(m, " max locking depth: %11u\n", | ||
| 307 | max_lockdep_depth); | ||
| 308 | seq_printf(m, " max recursion depth: %11u\n", | ||
| 309 | max_recursion_depth); | ||
| 310 | lockdep_stats_debug_show(m); | ||
| 311 | seq_printf(m, " debug_locks: %11u\n", | ||
| 312 | debug_locks); | ||
| 313 | |||
| 314 | return 0; | ||
| 315 | } | ||
| 316 | |||
| 317 | static int lockdep_stats_open(struct inode *inode, struct file *file) | ||
| 318 | { | ||
| 319 | return single_open(file, lockdep_stats_show, NULL); | ||
| 320 | } | ||
| 321 | |||
| 322 | static struct file_operations proc_lockdep_stats_operations = { | ||
| 323 | .open = lockdep_stats_open, | ||
| 324 | .read = seq_read, | ||
| 325 | .llseek = seq_lseek, | ||
| 326 | .release = seq_release, | ||
| 327 | }; | ||
| 328 | |||
| 329 | static int __init lockdep_proc_init(void) | ||
| 330 | { | ||
| 331 | struct proc_dir_entry *entry; | ||
| 332 | |||
| 333 | entry = create_proc_entry("lockdep", S_IRUSR, NULL); | ||
| 334 | if (entry) | ||
| 335 | entry->proc_fops = &proc_lockdep_operations; | ||
| 336 | |||
| 337 | entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL); | ||
| 338 | if (entry) | ||
| 339 | entry->proc_fops = &proc_lockdep_stats_operations; | ||
| 340 | |||
| 341 | return 0; | ||
| 342 | } | ||
| 343 | |||
| 344 | __initcall(lockdep_proc_init); | ||
| 345 | |||
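The two proc entries registered above ("lockdep" and "lockdep_stats") are plain read-only text files, so the statistics can be inspected from userspace without special tooling. A minimal, hypothetical userspace dumper (assuming a kernel built with CONFIG_LOCKDEP and CONFIG_PROC_FS; both files are created with mode S_IRUSR, so it must run as root) might look like this:

	/* Hypothetical helper: dump the lockdep proc files registered above. */
	#include <stdio.h>

	static int dump(const char *path)
	{
		char line[512];
		FILE *f = fopen(path, "r");

		if (!f) {
			perror(path);
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* lines are already formatted by seq_printf() */
		fclose(f);
		return 0;
	}

	int main(void)
	{
		return dump("/proc/lockdep_stats") + dump("/proc/lockdep");
	}
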
diff --git a/kernel/module.c b/kernel/module.c index bbe04862e1b0..05625d5dc758 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | /* Rewritten by Rusty Russell, on the backs of many others... | 1 | /* |
| 2 | Copyright (C) 2002 Richard Henderson | 2 | Copyright (C) 2002 Richard Henderson |
| 3 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. | 3 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. |
| 4 | 4 | ||
| @@ -16,7 +16,6 @@ | |||
| 16 | along with this program; if not, write to the Free Software | 16 | along with this program; if not, write to the Free Software |
| 17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 18 | */ | 18 | */ |
| 19 | #include <linux/config.h> | ||
| 20 | #include <linux/module.h> | 19 | #include <linux/module.h> |
| 21 | #include <linux/moduleloader.h> | 20 | #include <linux/moduleloader.h> |
| 22 | #include <linux/init.h> | 21 | #include <linux/init.h> |
| @@ -40,9 +39,11 @@ | |||
| 40 | #include <linux/string.h> | 39 | #include <linux/string.h> |
| 41 | #include <linux/sched.h> | 40 | #include <linux/sched.h> |
| 42 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
| 42 | #include <linux/unwind.h> | ||
| 43 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
| 44 | #include <asm/semaphore.h> | 44 | #include <asm/semaphore.h> |
| 45 | #include <asm/cacheflush.h> | 45 | #include <asm/cacheflush.h> |
| 46 | #include <linux/license.h> | ||
| 46 | 47 | ||
| 47 | #if 0 | 48 | #if 0 |
| 48 | #define DEBUGP printk | 49 | #define DEBUGP printk |
| @@ -120,9 +121,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; | |||
| 120 | extern const struct kernel_symbol __stop___ksymtab_gpl[]; | 121 | extern const struct kernel_symbol __stop___ksymtab_gpl[]; |
| 121 | extern const struct kernel_symbol __start___ksymtab_gpl_future[]; | 122 | extern const struct kernel_symbol __start___ksymtab_gpl_future[]; |
| 122 | extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; | 123 | extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; |
| 124 | extern const struct kernel_symbol __start___ksymtab_unused[]; | ||
| 125 | extern const struct kernel_symbol __stop___ksymtab_unused[]; | ||
| 126 | extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; | ||
| 127 | extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; | ||
| 128 | extern const struct kernel_symbol __start___ksymtab_gpl_future[]; | ||
| 129 | extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; | ||
| 123 | extern const unsigned long __start___kcrctab[]; | 130 | extern const unsigned long __start___kcrctab[]; |
| 124 | extern const unsigned long __start___kcrctab_gpl[]; | 131 | extern const unsigned long __start___kcrctab_gpl[]; |
| 125 | extern const unsigned long __start___kcrctab_gpl_future[]; | 132 | extern const unsigned long __start___kcrctab_gpl_future[]; |
| 133 | extern const unsigned long __start___kcrctab_unused[]; | ||
| 134 | extern const unsigned long __start___kcrctab_unused_gpl[]; | ||
| 126 | 135 | ||
| 127 | #ifndef CONFIG_MODVERSIONS | 136 | #ifndef CONFIG_MODVERSIONS |
| 128 | #define symversion(base, idx) NULL | 137 | #define symversion(base, idx) NULL |
| @@ -142,6 +151,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name, | |||
| 142 | return NULL; | 151 | return NULL; |
| 143 | } | 152 | } |
| 144 | 153 | ||
| 154 | static void printk_unused_warning(const char *name) | ||
| 155 | { | ||
| 156 | printk(KERN_WARNING "Symbol %s is marked as UNUSED, " | ||
| 157 | "however this module is using it.\n", name); | ||
| 158 | printk(KERN_WARNING "This symbol will go away in the future.\n"); | ||
| 159 | printk(KERN_WARNING "Please evalute if this is the right api to use, " | ||
| 160 | "and if it really is, submit a report the linux kernel " | ||
| 161 | "mailinglist together with submitting your code for " | ||
| 162 | "inclusion.\n"); | ||
| 163 | } | ||
| 164 | |||
| 145 | /* Find a symbol, return value, crc and module which owns it */ | 165 | /* Find a symbol, return value, crc and module which owns it */ |
| 146 | static unsigned long __find_symbol(const char *name, | 166 | static unsigned long __find_symbol(const char *name, |
| 147 | struct module **owner, | 167 | struct module **owner, |
| @@ -184,6 +204,25 @@ static unsigned long __find_symbol(const char *name, | |||
| 184 | return ks->value; | 204 | return ks->value; |
| 185 | } | 205 | } |
| 186 | 206 | ||
| 207 | ks = lookup_symbol(name, __start___ksymtab_unused, | ||
| 208 | __stop___ksymtab_unused); | ||
| 209 | if (ks) { | ||
| 210 | printk_unused_warning(name); | ||
| 211 | *crc = symversion(__start___kcrctab_unused, | ||
| 212 | (ks - __start___ksymtab_unused)); | ||
| 213 | return ks->value; | ||
| 214 | } | ||
| 215 | |||
| 216 | if (gplok) | ||
| 217 | ks = lookup_symbol(name, __start___ksymtab_unused_gpl, | ||
| 218 | __stop___ksymtab_unused_gpl); | ||
| 219 | if (ks) { | ||
| 220 | printk_unused_warning(name); | ||
| 221 | *crc = symversion(__start___kcrctab_unused_gpl, | ||
| 222 | (ks - __start___ksymtab_unused_gpl)); | ||
| 223 | return ks->value; | ||
| 224 | } | ||
| 225 | |||
| 187 | /* Now try modules. */ | 226 | /* Now try modules. */ |
| 188 | list_for_each_entry(mod, &modules, list) { | 227 | list_for_each_entry(mod, &modules, list) { |
| 189 | *owner = mod; | 228 | *owner = mod; |
| @@ -202,6 +241,23 @@ static unsigned long __find_symbol(const char *name, | |||
| 202 | return ks->value; | 241 | return ks->value; |
| 203 | } | 242 | } |
| 204 | } | 243 | } |
| 244 | ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms); | ||
| 245 | if (ks) { | ||
| 246 | printk_unused_warning(name); | ||
| 247 | *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms)); | ||
| 248 | return ks->value; | ||
| 249 | } | ||
| 250 | |||
| 251 | if (gplok) { | ||
| 252 | ks = lookup_symbol(name, mod->unused_gpl_syms, | ||
| 253 | mod->unused_gpl_syms + mod->num_unused_gpl_syms); | ||
| 254 | if (ks) { | ||
| 255 | printk_unused_warning(name); | ||
| 256 | *crc = symversion(mod->unused_gpl_crcs, | ||
| 257 | (ks - mod->unused_gpl_syms)); | ||
| 258 | return ks->value; | ||
| 259 | } | ||
| 260 | } | ||
| 205 | ks = lookup_symbol(name, mod->gpl_future_syms, | 261 | ks = lookup_symbol(name, mod->gpl_future_syms, |
| 206 | (mod->gpl_future_syms + | 262 | (mod->gpl_future_syms + |
| 207 | mod->num_gpl_future_syms)); | 263 | mod->num_gpl_future_syms)); |
| @@ -877,6 +933,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr, | |||
| 877 | return sprintf(buf, "0x%lx\n", sattr->address); | 933 | return sprintf(buf, "0x%lx\n", sattr->address); |
| 878 | } | 934 | } |
| 879 | 935 | ||
| 936 | static void free_sect_attrs(struct module_sect_attrs *sect_attrs) | ||
| 937 | { | ||
| 938 | int section; | ||
| 939 | |||
| 940 | for (section = 0; section < sect_attrs->nsections; section++) | ||
| 941 | kfree(sect_attrs->attrs[section].name); | ||
| 942 | kfree(sect_attrs); | ||
| 943 | } | ||
| 944 | |||
| 880 | static void add_sect_attrs(struct module *mod, unsigned int nsect, | 945 | static void add_sect_attrs(struct module *mod, unsigned int nsect, |
| 881 | char *secstrings, Elf_Shdr *sechdrs) | 946 | char *secstrings, Elf_Shdr *sechdrs) |
| 882 | { | 947 | { |
| @@ -893,21 +958,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, | |||
| 893 | + nloaded * sizeof(sect_attrs->attrs[0]), | 958 | + nloaded * sizeof(sect_attrs->attrs[0]), |
| 894 | sizeof(sect_attrs->grp.attrs[0])); | 959 | sizeof(sect_attrs->grp.attrs[0])); |
| 895 | size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); | 960 | size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); |
| 896 | if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL))) | 961 | sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); |
| 962 | if (sect_attrs == NULL) | ||
| 897 | return; | 963 | return; |
| 898 | 964 | ||
| 899 | /* Setup section attributes. */ | 965 | /* Setup section attributes. */ |
| 900 | sect_attrs->grp.name = "sections"; | 966 | sect_attrs->grp.name = "sections"; |
| 901 | sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; | 967 | sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; |
| 902 | 968 | ||
| 969 | sect_attrs->nsections = 0; | ||
| 903 | sattr = §_attrs->attrs[0]; | 970 | sattr = §_attrs->attrs[0]; |
| 904 | gattr = §_attrs->grp.attrs[0]; | 971 | gattr = §_attrs->grp.attrs[0]; |
| 905 | for (i = 0; i < nsect; i++) { | 972 | for (i = 0; i < nsect; i++) { |
| 906 | if (! (sechdrs[i].sh_flags & SHF_ALLOC)) | 973 | if (! (sechdrs[i].sh_flags & SHF_ALLOC)) |
| 907 | continue; | 974 | continue; |
| 908 | sattr->address = sechdrs[i].sh_addr; | 975 | sattr->address = sechdrs[i].sh_addr; |
| 909 | strlcpy(sattr->name, secstrings + sechdrs[i].sh_name, | 976 | sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, |
| 910 | MODULE_SECT_NAME_LEN); | 977 | GFP_KERNEL); |
| 978 | if (sattr->name == NULL) | ||
| 979 | goto out; | ||
| 980 | sect_attrs->nsections++; | ||
| 911 | sattr->mattr.show = module_sect_show; | 981 | sattr->mattr.show = module_sect_show; |
| 912 | sattr->mattr.store = NULL; | 982 | sattr->mattr.store = NULL; |
| 913 | sattr->mattr.attr.name = sattr->name; | 983 | sattr->mattr.attr.name = sattr->name; |
| @@ -923,7 +993,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, | |||
| 923 | mod->sect_attrs = sect_attrs; | 993 | mod->sect_attrs = sect_attrs; |
| 924 | return; | 994 | return; |
| 925 | out: | 995 | out: |
| 926 | kfree(sect_attrs); | 996 | free_sect_attrs(sect_attrs); |
| 927 | } | 997 | } |
| 928 | 998 | ||
| 929 | static void remove_sect_attrs(struct module *mod) | 999 | static void remove_sect_attrs(struct module *mod) |
| @@ -933,13 +1003,13 @@ static void remove_sect_attrs(struct module *mod) | |||
| 933 | &mod->sect_attrs->grp); | 1003 | &mod->sect_attrs->grp); |
| 934 | /* We are positive that no one is using any sect attrs | 1004 | /* We are positive that no one is using any sect attrs |
| 935 | * at this point. Deallocate immediately. */ | 1005 | * at this point. Deallocate immediately. */ |
| 936 | kfree(mod->sect_attrs); | 1006 | free_sect_attrs(mod->sect_attrs); |
| 937 | mod->sect_attrs = NULL; | 1007 | mod->sect_attrs = NULL; |
| 938 | } | 1008 | } |
| 939 | } | 1009 | } |
| 940 | 1010 | ||
| 941 | |||
| 942 | #else | 1011 | #else |
| 1012 | |||
| 943 | static inline void add_sect_attrs(struct module *mod, unsigned int nsect, | 1013 | static inline void add_sect_attrs(struct module *mod, unsigned int nsect, |
| 944 | char *sectstrings, Elf_Shdr *sechdrs) | 1014 | char *sectstrings, Elf_Shdr *sechdrs) |
| 945 | { | 1015 | { |
| @@ -998,6 +1068,12 @@ static int mod_sysfs_setup(struct module *mod, | |||
| 998 | { | 1068 | { |
| 999 | int err; | 1069 | int err; |
| 1000 | 1070 | ||
| 1071 | if (!module_subsys.kset.subsys) { | ||
| 1072 | printk(KERN_ERR "%s: module_subsys not initialized\n", | ||
| 1073 | mod->name); | ||
| 1074 | err = -EINVAL; | ||
| 1075 | goto out; | ||
| 1076 | } | ||
| 1001 | memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); | 1077 | memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); |
| 1002 | err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); | 1078 | err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); |
| 1003 | if (err) | 1079 | if (err) |
| @@ -1051,6 +1127,8 @@ static void free_module(struct module *mod) | |||
| 1051 | remove_sect_attrs(mod); | 1127 | remove_sect_attrs(mod); |
| 1052 | mod_kobject_remove(mod); | 1128 | mod_kobject_remove(mod); |
| 1053 | 1129 | ||
| 1130 | unwind_remove_table(mod->unwind_info, 0); | ||
| 1131 | |||
| 1054 | /* Arch-specific cleanup. */ | 1132 | /* Arch-specific cleanup. */ |
| 1055 | module_arch_cleanup(mod); | 1133 | module_arch_cleanup(mod); |
| 1056 | 1134 | ||
| @@ -1063,6 +1141,9 @@ static void free_module(struct module *mod) | |||
| 1063 | if (mod->percpu) | 1141 | if (mod->percpu) |
| 1064 | percpu_modfree(mod->percpu); | 1142 | percpu_modfree(mod->percpu); |
| 1065 | 1143 | ||
| 1144 | /* Free lock-classes: */ | ||
| 1145 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
| 1146 | |||
| 1066 | /* Finally, free the core (containing the module structure) */ | 1147 | /* Finally, free the core (containing the module structure) */ |
| 1067 | module_free(mod, mod->module_core); | 1148 | module_free(mod, mod->module_core); |
| 1068 | } | 1149 | } |
| @@ -1248,16 +1329,6 @@ static void layout_sections(struct module *mod, | |||
| 1248 | } | 1329 | } |
| 1249 | } | 1330 | } |
| 1250 | 1331 | ||
| 1251 | static inline int license_is_gpl_compatible(const char *license) | ||
| 1252 | { | ||
| 1253 | return (strcmp(license, "GPL") == 0 | ||
| 1254 | || strcmp(license, "GPL v2") == 0 | ||
| 1255 | || strcmp(license, "GPL and additional rights") == 0 | ||
| 1256 | || strcmp(license, "Dual BSD/GPL") == 0 | ||
| 1257 | || strcmp(license, "Dual MIT/GPL") == 0 | ||
| 1258 | || strcmp(license, "Dual MPL/GPL") == 0); | ||
| 1259 | } | ||
| 1260 | |||
| 1261 | static void set_license(struct module *mod, const char *license) | 1332 | static void set_license(struct module *mod, const char *license) |
| 1262 | { | 1333 | { |
| 1263 | if (!license) | 1334 | if (!license) |
| @@ -1326,7 +1397,7 @@ int is_exported(const char *name, const struct module *mod) | |||
| 1326 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) | 1397 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) |
| 1327 | return 1; | 1398 | return 1; |
| 1328 | else | 1399 | else |
| 1329 | if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) | 1400 | if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) |
| 1330 | return 1; | 1401 | return 1; |
| 1331 | else | 1402 | else |
| 1332 | return 0; | 1403 | return 0; |
| @@ -1409,10 +1480,27 @@ static struct module *load_module(void __user *umod, | |||
| 1409 | Elf_Ehdr *hdr; | 1480 | Elf_Ehdr *hdr; |
| 1410 | Elf_Shdr *sechdrs; | 1481 | Elf_Shdr *sechdrs; |
| 1411 | char *secstrings, *args, *modmagic, *strtab = NULL; | 1482 | char *secstrings, *args, *modmagic, *strtab = NULL; |
| 1412 | unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, | 1483 | unsigned int i; |
| 1413 | exportindex, modindex, obsparmindex, infoindex, gplindex, | 1484 | unsigned int symindex = 0; |
| 1414 | crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, | 1485 | unsigned int strindex = 0; |
| 1415 | gplfuturecrcindex; | 1486 | unsigned int setupindex; |
| 1487 | unsigned int exindex; | ||
| 1488 | unsigned int exportindex; | ||
| 1489 | unsigned int modindex; | ||
| 1490 | unsigned int obsparmindex; | ||
| 1491 | unsigned int infoindex; | ||
| 1492 | unsigned int gplindex; | ||
| 1493 | unsigned int crcindex; | ||
| 1494 | unsigned int gplcrcindex; | ||
| 1495 | unsigned int versindex; | ||
| 1496 | unsigned int pcpuindex; | ||
| 1497 | unsigned int gplfutureindex; | ||
| 1498 | unsigned int gplfuturecrcindex; | ||
| 1499 | unsigned int unwindex = 0; | ||
| 1500 | unsigned int unusedindex; | ||
| 1501 | unsigned int unusedcrcindex; | ||
| 1502 | unsigned int unusedgplindex; | ||
| 1503 | unsigned int unusedgplcrcindex; | ||
| 1416 | struct module *mod; | 1504 | struct module *mod; |
| 1417 | long err = 0; | 1505 | long err = 0; |
| 1418 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ | 1506 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ |
| @@ -1493,15 +1581,22 @@ static struct module *load_module(void __user *umod, | |||
| 1493 | exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); | 1581 | exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); |
| 1494 | gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); | 1582 | gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); |
| 1495 | gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); | 1583 | gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); |
| 1584 | unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); | ||
| 1585 | unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); | ||
| 1496 | crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); | 1586 | crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); |
| 1497 | gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); | 1587 | gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); |
| 1498 | gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); | 1588 | gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); |
| 1589 | unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); | ||
| 1590 | unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); | ||
| 1499 | setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); | 1591 | setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); |
| 1500 | exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); | 1592 | exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); |
| 1501 | obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); | 1593 | obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); |
| 1502 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); | 1594 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); |
| 1503 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); | 1595 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); |
| 1504 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); | 1596 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); |
| 1597 | #ifdef ARCH_UNWIND_SECTION_NAME | ||
| 1598 | unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); | ||
| 1599 | #endif | ||
| 1505 | 1600 | ||
| 1506 | /* Don't keep modinfo section */ | 1601 | /* Don't keep modinfo section */ |
| 1507 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | 1602 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; |
| @@ -1510,6 +1605,8 @@ static struct module *load_module(void __user *umod, | |||
| 1510 | sechdrs[symindex].sh_flags |= SHF_ALLOC; | 1605 | sechdrs[symindex].sh_flags |= SHF_ALLOC; |
| 1511 | sechdrs[strindex].sh_flags |= SHF_ALLOC; | 1606 | sechdrs[strindex].sh_flags |= SHF_ALLOC; |
| 1512 | #endif | 1607 | #endif |
| 1608 | if (unwindex) | ||
| 1609 | sechdrs[unwindex].sh_flags |= SHF_ALLOC; | ||
| 1513 | 1610 | ||
| 1514 | /* Check module struct version now, before we try to use module. */ | 1611 | /* Check module struct version now, before we try to use module. */ |
| 1515 | if (!check_modstruct_version(sechdrs, versindex, mod)) { | 1612 | if (!check_modstruct_version(sechdrs, versindex, mod)) { |
| @@ -1639,14 +1736,27 @@ static struct module *load_module(void __user *umod, | |||
| 1639 | mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; | 1736 | mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; |
| 1640 | mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / | 1737 | mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / |
| 1641 | sizeof(*mod->gpl_future_syms); | 1738 | sizeof(*mod->gpl_future_syms); |
| 1739 | mod->num_unused_syms = sechdrs[unusedindex].sh_size / | ||
| 1740 | sizeof(*mod->unused_syms); | ||
| 1741 | mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / | ||
| 1742 | sizeof(*mod->unused_gpl_syms); | ||
| 1642 | mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; | 1743 | mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; |
| 1643 | if (gplfuturecrcindex) | 1744 | if (gplfuturecrcindex) |
| 1644 | mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; | 1745 | mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; |
| 1645 | 1746 | ||
| 1747 | mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; | ||
| 1748 | if (unusedcrcindex) | ||
| 1749 | mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; | ||
| 1750 | mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; | ||
| 1751 | if (unusedgplcrcindex) | ||
| 1752 | 	mod->unused_gpl_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; | ||
| 1753 | |||
| 1646 | #ifdef CONFIG_MODVERSIONS | 1754 | #ifdef CONFIG_MODVERSIONS |
| 1647 | if ((mod->num_syms && !crcindex) || | 1755 | if ((mod->num_syms && !crcindex) || |
| 1648 | (mod->num_gpl_syms && !gplcrcindex) || | 1756 | (mod->num_gpl_syms && !gplcrcindex) || |
| 1649 | (mod->num_gpl_future_syms && !gplfuturecrcindex)) { | 1757 | (mod->num_gpl_future_syms && !gplfuturecrcindex) || |
| 1758 | (mod->num_unused_syms && !unusedcrcindex) || | ||
| 1759 | (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { | ||
| 1650 | printk(KERN_WARNING "%s: No versions for exported symbols." | 1760 | printk(KERN_WARNING "%s: No versions for exported symbols." |
| 1651 | " Tainting kernel.\n", mod->name); | 1761 | " Tainting kernel.\n", mod->name); |
| 1652 | add_taint(TAINT_FORCED_MODULE); | 1762 | add_taint(TAINT_FORCED_MODULE); |
| @@ -1738,6 +1848,11 @@ static struct module *load_module(void __user *umod, | |||
| 1738 | goto arch_cleanup; | 1848 | goto arch_cleanup; |
| 1739 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 1849 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
| 1740 | 1850 | ||
| 1851 | /* Size of section 0 is 0, so this works well if no unwind info. */ | ||
| 1852 | mod->unwind_info = unwind_add_table(mod, | ||
| 1853 | (void *)sechdrs[unwindex].sh_addr, | ||
| 1854 | sechdrs[unwindex].sh_size); | ||
| 1855 | |||
| 1741 | /* Get rid of temporary copy */ | 1856 | /* Get rid of temporary copy */ |
| 1742 | vfree(hdr); | 1857 | vfree(hdr); |
| 1743 | 1858 | ||
| @@ -1836,6 +1951,7 @@ sys_init_module(void __user *umod, | |||
| 1836 | mod->state = MODULE_STATE_LIVE; | 1951 | mod->state = MODULE_STATE_LIVE; |
| 1837 | /* Drop initial reference. */ | 1952 | /* Drop initial reference. */ |
| 1838 | module_put(mod); | 1953 | module_put(mod); |
| 1954 | unwind_remove_table(mod->unwind_info, 1); | ||
| 1839 | module_free(mod, mod->module_init); | 1955 | module_free(mod, mod->module_init); |
| 1840 | mod->module_init = NULL; | 1956 | mod->module_init = NULL; |
| 1841 | mod->init_size = 0; | 1957 | mod->init_size = 0; |
| @@ -1923,10 +2039,8 @@ const char *module_address_lookup(unsigned long addr, | |||
| 1923 | return NULL; | 2039 | return NULL; |
| 1924 | } | 2040 | } |
| 1925 | 2041 | ||
| 1926 | struct module *module_get_kallsym(unsigned int symnum, | 2042 | struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, |
| 1927 | unsigned long *value, | 2043 | char *type, char *name, size_t namelen) |
| 1928 | char *type, | ||
| 1929 | char namebuf[128]) | ||
| 1930 | { | 2044 | { |
| 1931 | struct module *mod; | 2045 | struct module *mod; |
| 1932 | 2046 | ||
| @@ -1935,9 +2049,8 @@ struct module *module_get_kallsym(unsigned int symnum, | |||
| 1935 | if (symnum < mod->num_symtab) { | 2049 | if (symnum < mod->num_symtab) { |
| 1936 | *value = mod->symtab[symnum].st_value; | 2050 | *value = mod->symtab[symnum].st_value; |
| 1937 | *type = mod->symtab[symnum].st_info; | 2051 | *type = mod->symtab[symnum].st_info; |
| 1938 | strncpy(namebuf, | 2052 | strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, |
| 1939 | mod->strtab + mod->symtab[symnum].st_name, | 2053 | namelen); |
| 1940 | 127); | ||
| 1941 | mutex_unlock(&module_mutex); | 2054 | mutex_unlock(&module_mutex); |
| 1942 | return mod; | 2055 | return mod; |
| 1943 | } | 2056 | } |
| @@ -2066,6 +2179,29 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) | |||
| 2066 | return e; | 2179 | return e; |
| 2067 | } | 2180 | } |
| 2068 | 2181 | ||
| 2182 | /* | ||
| 2183 | * Is this a valid module address? | ||
| 2184 | */ | ||
| 2185 | int is_module_address(unsigned long addr) | ||
| 2186 | { | ||
| 2187 | unsigned long flags; | ||
| 2188 | struct module *mod; | ||
| 2189 | |||
| 2190 | spin_lock_irqsave(&modlist_lock, flags); | ||
| 2191 | |||
| 2192 | list_for_each_entry(mod, &modules, list) { | ||
| 2193 | if (within(addr, mod->module_core, mod->core_size)) { | ||
| 2194 | spin_unlock_irqrestore(&modlist_lock, flags); | ||
| 2195 | return 1; | ||
| 2196 | } | ||
| 2197 | } | ||
| 2198 | |||
| 2199 | spin_unlock_irqrestore(&modlist_lock, flags); | ||
| 2200 | |||
| 2201 | return 0; | ||
| 2202 | } | ||
| 2203 | |||
| 2204 | |||
| 2069 | /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ | 2205 | /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ |
| 2070 | struct module *__module_text_address(unsigned long addr) | 2206 | struct module *__module_text_address(unsigned long addr) |
| 2071 | { | 2207 | { |
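The is_module_address() helper added above gives subsystems such as lockdep a way to tell whether an address belongs to a loaded module's core area rather than to static kernel data. The sketch below shows the kind of check it enables; object_is_static_or_module() is a made-up name, _stext and _end come from the kernel linker script via asm/sections.h, and the per-cpu and init-section cases a complete implementation would also cover are left out:

	#include <linux/module.h>
	#include <asm/sections.h>

	/*
	 * Sketch only: report whether @obj is statically allocated kernel data
	 * or lives inside some module's core section.
	 */
	static int object_is_static_or_module(void *obj)
	{
		unsigned long addr = (unsigned long)obj;

		/* Anything inside the core kernel image is a static object. */
		if (addr >= (unsigned long)&_stext && addr < (unsigned long)&_end)
			return 1;

		/* Otherwise it must fall inside some module's [module_core, core_size) range. */
		return is_module_address(addr);
	}
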
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index f4913c376950..e3203c654dda 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
| @@ -16,395 +16,48 @@ | |||
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
| 17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
| 18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
| 19 | #include <linux/poison.h> | ||
| 19 | #include <linux/spinlock.h> | 20 | #include <linux/spinlock.h> |
| 20 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
| 21 | #include <linux/interrupt.h> | 22 | #include <linux/interrupt.h> |
| 23 | #include <linux/debug_locks.h> | ||
| 22 | 24 | ||
| 23 | #include "mutex-debug.h" | 25 | #include "mutex-debug.h" |
| 24 | 26 | ||
| 25 | /* | 27 | /* |
| 26 | * We need a global lock when we walk through the multi-process | ||
| 27 | * lock tree. Only used in the deadlock-debugging case. | ||
| 28 | */ | ||
| 29 | DEFINE_SPINLOCK(debug_mutex_lock); | ||
| 30 | |||
| 31 | /* | ||
| 32 | * All locks held by all tasks, in a single global list: | ||
| 33 | */ | ||
| 34 | LIST_HEAD(debug_mutex_held_locks); | ||
| 35 | |||
| 36 | /* | ||
| 37 | * In the debug case we carry the caller's instruction pointer into | ||
| 38 | * other functions, but we dont want the function argument overhead | ||
| 39 | * in the nondebug case - hence these macros: | ||
| 40 | */ | ||
| 41 | #define __IP_DECL__ , unsigned long ip | ||
| 42 | #define __IP__ , ip | ||
| 43 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
| 44 | |||
| 45 | /* | ||
| 46 | * "mutex debugging enabled" flag. We turn it off when we detect | ||
| 47 | * the first problem because we dont want to recurse back | ||
| 48 | * into the tracing code when doing error printk or | ||
| 49 | * executing a BUG(): | ||
| 50 | */ | ||
| 51 | int debug_mutex_on = 1; | ||
| 52 | |||
| 53 | static void printk_task(struct task_struct *p) | ||
| 54 | { | ||
| 55 | if (p) | ||
| 56 | printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
| 57 | else | ||
| 58 | printk("<none>"); | ||
| 59 | } | ||
| 60 | |||
| 61 | static void printk_ti(struct thread_info *ti) | ||
| 62 | { | ||
| 63 | if (ti) | ||
| 64 | printk_task(ti->task); | ||
| 65 | else | ||
| 66 | printk("<none>"); | ||
| 67 | } | ||
| 68 | |||
| 69 | static void printk_task_short(struct task_struct *p) | ||
| 70 | { | ||
| 71 | if (p) | ||
| 72 | printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
| 73 | else | ||
| 74 | printk("<none>"); | ||
| 75 | } | ||
| 76 | |||
| 77 | static void printk_lock(struct mutex *lock, int print_owner) | ||
| 78 | { | ||
| 79 | printk(" [%p] {%s}\n", lock, lock->name); | ||
| 80 | |||
| 81 | if (print_owner && lock->owner) { | ||
| 82 | printk(".. held by: "); | ||
| 83 | printk_ti(lock->owner); | ||
| 84 | printk("\n"); | ||
| 85 | } | ||
| 86 | if (lock->owner) { | ||
| 87 | printk("... acquired at: "); | ||
| 88 | print_symbol("%s\n", lock->acquire_ip); | ||
| 89 | } | ||
| 90 | } | ||
| 91 | |||
| 92 | /* | ||
| 93 | * printk locks held by a task: | ||
| 94 | */ | ||
| 95 | static void show_task_locks(struct task_struct *p) | ||
| 96 | { | ||
| 97 | switch (p->state) { | ||
| 98 | case TASK_RUNNING: printk("R"); break; | ||
| 99 | case TASK_INTERRUPTIBLE: printk("S"); break; | ||
| 100 | case TASK_UNINTERRUPTIBLE: printk("D"); break; | ||
| 101 | case TASK_STOPPED: printk("T"); break; | ||
| 102 | case EXIT_ZOMBIE: printk("Z"); break; | ||
| 103 | case EXIT_DEAD: printk("X"); break; | ||
| 104 | default: printk("?"); break; | ||
| 105 | } | ||
| 106 | printk_task(p); | ||
| 107 | if (p->blocked_on) { | ||
| 108 | struct mutex *lock = p->blocked_on->lock; | ||
| 109 | |||
| 110 | printk(" blocked on mutex:"); | ||
| 111 | printk_lock(lock, 1); | ||
| 112 | } else | ||
| 113 | printk(" (not blocked on mutex)\n"); | ||
| 114 | } | ||
| 115 | |||
| 116 | /* | ||
| 117 | * printk all locks held in the system (if filter == NULL), | ||
| 118 | * or all locks belonging to a single task (if filter != NULL): | ||
| 119 | */ | ||
| 120 | void show_held_locks(struct task_struct *filter) | ||
| 121 | { | ||
| 122 | struct list_head *curr, *cursor = NULL; | ||
| 123 | struct mutex *lock; | ||
| 124 | struct thread_info *t; | ||
| 125 | unsigned long flags; | ||
| 126 | int count = 0; | ||
| 127 | |||
| 128 | if (filter) { | ||
| 129 | printk("------------------------------\n"); | ||
| 130 | printk("| showing all locks held by: | ("); | ||
| 131 | printk_task_short(filter); | ||
| 132 | printk("):\n"); | ||
| 133 | printk("------------------------------\n"); | ||
| 134 | } else { | ||
| 135 | printk("---------------------------\n"); | ||
| 136 | printk("| showing all locks held: |\n"); | ||
| 137 | printk("---------------------------\n"); | ||
| 138 | } | ||
| 139 | |||
| 140 | /* | ||
| 141 | * Play safe and acquire the global trace lock. We | ||
| 142 | * cannot printk with that lock held so we iterate | ||
| 143 | * very carefully: | ||
| 144 | */ | ||
| 145 | next: | ||
| 146 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
| 147 | list_for_each(curr, &debug_mutex_held_locks) { | ||
| 148 | if (cursor && curr != cursor) | ||
| 149 | continue; | ||
| 150 | lock = list_entry(curr, struct mutex, held_list); | ||
| 151 | t = lock->owner; | ||
| 152 | if (filter && (t != filter->thread_info)) | ||
| 153 | continue; | ||
| 154 | count++; | ||
| 155 | cursor = curr->next; | ||
| 156 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 157 | |||
| 158 | printk("\n#%03d: ", count); | ||
| 159 | printk_lock(lock, filter ? 0 : 1); | ||
| 160 | goto next; | ||
| 161 | } | ||
| 162 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 163 | printk("\n"); | ||
| 164 | } | ||
| 165 | |||
| 166 | void mutex_debug_show_all_locks(void) | ||
| 167 | { | ||
| 168 | struct task_struct *g, *p; | ||
| 169 | int count = 10; | ||
| 170 | int unlock = 1; | ||
| 171 | |||
| 172 | printk("\nShowing all blocking locks in the system:\n"); | ||
| 173 | |||
| 174 | /* | ||
| 175 | * Here we try to get the tasklist_lock as hard as possible, | ||
| 176 | * if not successful after 2 seconds we ignore it (but keep | ||
| 177 | * trying). This is to enable a debug printout even if a | ||
| 178 | * tasklist_lock-holding task deadlocks or crashes. | ||
| 179 | */ | ||
| 180 | retry: | ||
| 181 | if (!read_trylock(&tasklist_lock)) { | ||
| 182 | if (count == 10) | ||
| 183 | printk("hm, tasklist_lock locked, retrying... "); | ||
| 184 | if (count) { | ||
| 185 | count--; | ||
| 186 | printk(" #%d", 10-count); | ||
| 187 | mdelay(200); | ||
| 188 | goto retry; | ||
| 189 | } | ||
| 190 | printk(" ignoring it.\n"); | ||
| 191 | unlock = 0; | ||
| 192 | } | ||
| 193 | if (count != 10) | ||
| 194 | printk(" locked it.\n"); | ||
| 195 | |||
| 196 | do_each_thread(g, p) { | ||
| 197 | show_task_locks(p); | ||
| 198 | if (!unlock) | ||
| 199 | if (read_trylock(&tasklist_lock)) | ||
| 200 | unlock = 1; | ||
| 201 | } while_each_thread(g, p); | ||
| 202 | |||
| 203 | printk("\n"); | ||
| 204 | show_held_locks(NULL); | ||
| 205 | printk("=============================================\n\n"); | ||
| 206 | |||
| 207 | if (unlock) | ||
| 208 | read_unlock(&tasklist_lock); | ||
| 209 | } | ||
| 210 | |||
| 211 | static void report_deadlock(struct task_struct *task, struct mutex *lock, | ||
| 212 | struct mutex *lockblk, unsigned long ip) | ||
| 213 | { | ||
| 214 | printk("\n%s/%d is trying to acquire this lock:\n", | ||
| 215 | current->comm, current->pid); | ||
| 216 | printk_lock(lock, 1); | ||
| 217 | printk("... trying at: "); | ||
| 218 | print_symbol("%s\n", ip); | ||
| 219 | show_held_locks(current); | ||
| 220 | |||
| 221 | if (lockblk) { | ||
| 222 | printk("but %s/%d is deadlocking current task %s/%d!\n\n", | ||
| 223 | task->comm, task->pid, current->comm, current->pid); | ||
| 224 | printk("\n%s/%d is blocked on this lock:\n", | ||
| 225 | task->comm, task->pid); | ||
| 226 | printk_lock(lockblk, 1); | ||
| 227 | |||
| 228 | show_held_locks(task); | ||
| 229 | |||
| 230 | printk("\n%s/%d's [blocked] stackdump:\n\n", | ||
| 231 | task->comm, task->pid); | ||
| 232 | show_stack(task, NULL); | ||
| 233 | } | ||
| 234 | |||
| 235 | printk("\n%s/%d's [current] stackdump:\n\n", | ||
| 236 | current->comm, current->pid); | ||
| 237 | dump_stack(); | ||
| 238 | mutex_debug_show_all_locks(); | ||
| 239 | printk("[ turning off deadlock detection. Please report this. ]\n\n"); | ||
| 240 | local_irq_disable(); | ||
| 241 | } | ||
| 242 | |||
| 243 | /* | ||
| 244 | * Recursively check for mutex deadlocks: | ||
| 245 | */ | ||
| 246 | static int check_deadlock(struct mutex *lock, int depth, | ||
| 247 | struct thread_info *ti, unsigned long ip) | ||
| 248 | { | ||
| 249 | struct mutex *lockblk; | ||
| 250 | struct task_struct *task; | ||
| 251 | |||
| 252 | if (!debug_mutex_on) | ||
| 253 | return 0; | ||
| 254 | |||
| 255 | ti = lock->owner; | ||
| 256 | if (!ti) | ||
| 257 | return 0; | ||
| 258 | |||
| 259 | task = ti->task; | ||
| 260 | lockblk = NULL; | ||
| 261 | if (task->blocked_on) | ||
| 262 | lockblk = task->blocked_on->lock; | ||
| 263 | |||
| 264 | /* Self-deadlock: */ | ||
| 265 | if (current == task) { | ||
| 266 | DEBUG_OFF(); | ||
| 267 | if (depth) | ||
| 268 | return 1; | ||
| 269 | printk("\n==========================================\n"); | ||
| 270 | printk( "[ BUG: lock recursion deadlock detected! |\n"); | ||
| 271 | printk( "------------------------------------------\n"); | ||
| 272 | report_deadlock(task, lock, NULL, ip); | ||
| 273 | return 0; | ||
| 274 | } | ||
| 275 | |||
| 276 | /* Ugh, something corrupted the lock data structure? */ | ||
| 277 | if (depth > 20) { | ||
| 278 | DEBUG_OFF(); | ||
| 279 | printk("\n===========================================\n"); | ||
| 280 | printk( "[ BUG: infinite lock dependency detected!? |\n"); | ||
| 281 | printk( "-------------------------------------------\n"); | ||
| 282 | report_deadlock(task, lock, lockblk, ip); | ||
| 283 | return 0; | ||
| 284 | } | ||
| 285 | |||
| 286 | /* Recursively check for dependencies: */ | ||
| 287 | if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) { | ||
| 288 | printk("\n============================================\n"); | ||
| 289 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | ||
| 290 | printk( "--------------------------------------------\n"); | ||
| 291 | report_deadlock(task, lock, lockblk, ip); | ||
| 292 | return 0; | ||
| 293 | } | ||
| 294 | return 0; | ||
| 295 | } | ||
| 296 | |||
| 297 | /* | ||
| 298 | * Called when a task exits, this function checks whether the | ||
| 299 | * task is holding any locks, and reports the first one if so: | ||
| 300 | */ | ||
| 301 | void mutex_debug_check_no_locks_held(struct task_struct *task) | ||
| 302 | { | ||
| 303 | struct list_head *curr, *next; | ||
| 304 | struct thread_info *t; | ||
| 305 | unsigned long flags; | ||
| 306 | struct mutex *lock; | ||
| 307 | |||
| 308 | if (!debug_mutex_on) | ||
| 309 | return; | ||
| 310 | |||
| 311 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
| 312 | list_for_each_safe(curr, next, &debug_mutex_held_locks) { | ||
| 313 | lock = list_entry(curr, struct mutex, held_list); | ||
| 314 | t = lock->owner; | ||
| 315 | if (t != task->thread_info) | ||
| 316 | continue; | ||
| 317 | list_del_init(curr); | ||
| 318 | DEBUG_OFF(); | ||
| 319 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 320 | |||
| 321 | printk("BUG: %s/%d, lock held at task exit time!\n", | ||
| 322 | task->comm, task->pid); | ||
| 323 | printk_lock(lock, 1); | ||
| 324 | if (lock->owner != task->thread_info) | ||
| 325 | printk("exiting task is not even the owner??\n"); | ||
| 326 | return; | ||
| 327 | } | ||
| 328 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 329 | } | ||
| 330 | |||
| 331 | /* | ||
| 332 | * Called when kernel memory is freed (or unmapped), or if a mutex | ||
| 333 | * is destroyed or reinitialized - this code checks whether there is | ||
| 334 | * any held lock in the memory range of <from> to <to>: | ||
| 335 | */ | ||
| 336 | void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | ||
| 337 | { | ||
| 338 | struct list_head *curr, *next; | ||
| 339 | const void *to = from + len; | ||
| 340 | unsigned long flags; | ||
| 341 | struct mutex *lock; | ||
| 342 | void *lock_addr; | ||
| 343 | |||
| 344 | if (!debug_mutex_on) | ||
| 345 | return; | ||
| 346 | |||
| 347 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
| 348 | list_for_each_safe(curr, next, &debug_mutex_held_locks) { | ||
| 349 | lock = list_entry(curr, struct mutex, held_list); | ||
| 350 | lock_addr = lock; | ||
| 351 | if (lock_addr < from || lock_addr >= to) | ||
| 352 | continue; | ||
| 353 | list_del_init(curr); | ||
| 354 | DEBUG_OFF(); | ||
| 355 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 356 | |||
| 357 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", | ||
| 358 | current->comm, current->pid, lock, from, to); | ||
| 359 | dump_stack(); | ||
| 360 | printk_lock(lock, 1); | ||
| 361 | if (lock->owner != current_thread_info()) | ||
| 362 | printk("freeing task is not even the owner??\n"); | ||
| 363 | return; | ||
| 364 | } | ||
| 365 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 366 | } | ||
| 367 | |||
| 368 | /* | ||
| 369 | * Must be called with lock->wait_lock held. | 28 | * Must be called with lock->wait_lock held. |
| 370 | */ | 29 | */ |
| 371 | void debug_mutex_set_owner(struct mutex *lock, | 30 | void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) |
| 372 | struct thread_info *new_owner __IP_DECL__) | ||
| 373 | { | 31 | { |
| 374 | lock->owner = new_owner; | 32 | lock->owner = new_owner; |
| 375 | DEBUG_WARN_ON(!list_empty(&lock->held_list)); | ||
| 376 | if (debug_mutex_on) { | ||
| 377 | list_add_tail(&lock->held_list, &debug_mutex_held_locks); | ||
| 378 | lock->acquire_ip = ip; | ||
| 379 | } | ||
| 380 | } | 33 | } |
| 381 | 34 | ||
| 382 | void debug_mutex_init_waiter(struct mutex_waiter *waiter) | 35 | void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) |
| 383 | { | 36 | { |
| 384 | memset(waiter, 0x11, sizeof(*waiter)); | 37 | memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); |
| 385 | waiter->magic = waiter; | 38 | waiter->magic = waiter; |
| 386 | INIT_LIST_HEAD(&waiter->list); | 39 | INIT_LIST_HEAD(&waiter->list); |
| 387 | } | 40 | } |
| 388 | 41 | ||
| 389 | void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) | 42 | void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) |
| 390 | { | 43 | { |
| 391 | SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); | 44 | SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); |
| 392 | DEBUG_WARN_ON(list_empty(&lock->wait_list)); | 45 | DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); |
| 393 | DEBUG_WARN_ON(waiter->magic != waiter); | 46 | DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); |
| 394 | DEBUG_WARN_ON(list_empty(&waiter->list)); | 47 | DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); |
| 395 | } | 48 | } |
| 396 | 49 | ||
| 397 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) | 50 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) |
| 398 | { | 51 | { |
| 399 | DEBUG_WARN_ON(!list_empty(&waiter->list)); | 52 | DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list)); |
| 400 | memset(waiter, 0x22, sizeof(*waiter)); | 53 | memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); |
| 401 | } | 54 | } |
| 402 | 55 | ||
| 403 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 56 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
| 404 | struct thread_info *ti __IP_DECL__) | 57 | struct thread_info *ti) |
| 405 | { | 58 | { |
| 406 | SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); | 59 | SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); |
| 407 | check_deadlock(lock, 0, ti, ip); | 60 | |
| 408 | /* Mark the current thread as blocked on the lock: */ | 61 | /* Mark the current thread as blocked on the lock: */ |
| 409 | ti->task->blocked_on = waiter; | 62 | ti->task->blocked_on = waiter; |
| 410 | waiter->lock = lock; | 63 | waiter->lock = lock; |
| @@ -413,9 +66,9 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, | |||
| 413 | void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 66 | void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
| 414 | struct thread_info *ti) | 67 | struct thread_info *ti) |
| 415 | { | 68 | { |
| 416 | DEBUG_WARN_ON(list_empty(&waiter->list)); | 69 | DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); |
| 417 | DEBUG_WARN_ON(waiter->task != ti->task); | 70 | DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); |
| 418 | DEBUG_WARN_ON(ti->task->blocked_on != waiter); | 71 | DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); |
| 419 | ti->task->blocked_on = NULL; | 72 | ti->task->blocked_on = NULL; |
| 420 | 73 | ||
| 421 | list_del_init(&waiter->list); | 74 | list_del_init(&waiter->list); |
| @@ -424,24 +77,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | |||
| 424 | 77 | ||
| 425 | void debug_mutex_unlock(struct mutex *lock) | 78 | void debug_mutex_unlock(struct mutex *lock) |
| 426 | { | 79 | { |
| 427 | DEBUG_WARN_ON(lock->magic != lock); | 80 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); |
| 428 | DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 81 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
| 429 | DEBUG_WARN_ON(lock->owner != current_thread_info()); | 82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
| 430 | if (debug_mutex_on) { | 83 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); |
| 431 | DEBUG_WARN_ON(list_empty(&lock->held_list)); | ||
| 432 | list_del_init(&lock->held_list); | ||
| 433 | } | ||
| 434 | } | 84 | } |
| 435 | 85 | ||
| 436 | void debug_mutex_init(struct mutex *lock, const char *name) | 86 | void debug_mutex_init(struct mutex *lock, const char *name, |
| 87 | struct lock_class_key *key) | ||
| 437 | { | 88 | { |
| 89 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 438 | /* | 90 | /* |
| 439 | * Make sure we are not reinitializing a held lock: | 91 | * Make sure we are not reinitializing a held lock: |
| 440 | */ | 92 | */ |
| 441 | mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | 93 | debug_check_no_locks_freed((void *)lock, sizeof(*lock)); |
| 94 | lockdep_init_map(&lock->dep_map, name, key); | ||
| 95 | #endif | ||
| 442 | lock->owner = NULL; | 96 | lock->owner = NULL; |
| 443 | INIT_LIST_HEAD(&lock->held_list); | ||
| 444 | lock->name = name; | ||
| 445 | lock->magic = lock; | 97 | lock->magic = lock; |
| 446 | } | 98 | } |
| 447 | 99 | ||
| @@ -455,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name) | |||
| 455 | */ | 107 | */ |
| 456 | void fastcall mutex_destroy(struct mutex *lock) | 108 | void fastcall mutex_destroy(struct mutex *lock) |
| 457 | { | 109 | { |
| 458 | DEBUG_WARN_ON(mutex_is_locked(lock)); | 110 | DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); |
| 459 | lock->magic = NULL; | 111 | lock->magic = NULL; |
| 460 | } | 112 | } |
| 461 | 113 | ||
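With the hand-rolled held-lock tracking gone, debug_mutex_init() now only sets up the owner and magic fields and hands the name and class key to lockdep via lockdep_init_map(). Callers are not expected to pass the key by hand; a per-call-site static key is supplied by a wrapper macro along the lines of the hypothetically named one below (the real wrapper lives in include/linux/mutex.h and is not part of this diff), so every initialization site becomes its own lockdep class:

	#include <linux/mutex.h>

	/*
	 * Sketch of the caller-side convention. The name demo_mutex_init is
	 * made up to avoid clashing with the real mutex_init() macro.
	 */
	#define demo_mutex_init(mutex)				\
	do {							\
		static struct lock_class_key __key;		\
								\
		__mutex_init((mutex), #mutex, &__key);		\
	} while (0)

	static struct mutex demo_lock;

	static void demo_init_and_use(void)
	{
		demo_mutex_init(&demo_lock);	/* one static key, i.e. one class, per init site */

		mutex_lock(&demo_lock);
		/* ... critical section ... */
		mutex_unlock(&demo_lock);
	}
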
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index fd384050acb1..babfbdfc534b 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h | |||
| @@ -10,125 +10,44 @@ | |||
| 10 | * More details are in kernel/mutex-debug.c. | 10 | * More details are in kernel/mutex-debug.c. |
| 11 | */ | 11 | */ |
| 12 | 12 | ||
| 13 | extern spinlock_t debug_mutex_lock; | ||
| 14 | extern struct list_head debug_mutex_held_locks; | ||
| 15 | extern int debug_mutex_on; | ||
| 16 | |||
| 17 | /* | ||
| 18 | * In the debug case we carry the caller's instruction pointer into | ||
| 19 | * other functions, but we dont want the function argument overhead | ||
| 20 | * in the nondebug case - hence these macros: | ||
| 21 | */ | ||
| 22 | #define __IP_DECL__ , unsigned long ip | ||
| 23 | #define __IP__ , ip | ||
| 24 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
| 25 | |||
| 26 | /* | 13 | /* |
| 27 | * This must be called with lock->wait_lock held. | 14 | * This must be called with lock->wait_lock held. |
| 28 | */ | 15 | */ |
| 29 | extern void debug_mutex_set_owner(struct mutex *lock, | 16 | extern void |
| 30 | struct thread_info *new_owner __IP_DECL__); | 17 | debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); |
| 31 | 18 | ||
| 32 | static inline void debug_mutex_clear_owner(struct mutex *lock) | 19 | static inline void debug_mutex_clear_owner(struct mutex *lock) |
| 33 | { | 20 | { |
| 34 | lock->owner = NULL; | 21 | lock->owner = NULL; |
| 35 | } | 22 | } |
| 36 | 23 | ||
| 37 | extern void debug_mutex_init_waiter(struct mutex_waiter *waiter); | 24 | extern void debug_mutex_lock_common(struct mutex *lock, |
| 25 | struct mutex_waiter *waiter); | ||
| 38 | extern void debug_mutex_wake_waiter(struct mutex *lock, | 26 | extern void debug_mutex_wake_waiter(struct mutex *lock, |
| 39 | struct mutex_waiter *waiter); | 27 | struct mutex_waiter *waiter); |
| 40 | extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); | 28 | extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); |
| 41 | extern void debug_mutex_add_waiter(struct mutex *lock, | 29 | extern void debug_mutex_add_waiter(struct mutex *lock, |
| 42 | struct mutex_waiter *waiter, | 30 | struct mutex_waiter *waiter, |
| 43 | struct thread_info *ti __IP_DECL__); | 31 | struct thread_info *ti); |
| 44 | extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 32 | extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
| 45 | struct thread_info *ti); | 33 | struct thread_info *ti); |
| 46 | extern void debug_mutex_unlock(struct mutex *lock); | 34 | extern void debug_mutex_unlock(struct mutex *lock); |
| 47 | extern void debug_mutex_init(struct mutex *lock, const char *name); | 35 | extern void debug_mutex_init(struct mutex *lock, const char *name, |
| 48 | 36 | struct lock_class_key *key); | |
| 49 | #define debug_spin_lock(lock) \ | ||
| 50 | do { \ | ||
| 51 | local_irq_disable(); \ | ||
| 52 | if (debug_mutex_on) \ | ||
| 53 | spin_lock(lock); \ | ||
| 54 | } while (0) | ||
| 55 | 37 | ||
| 56 | #define debug_spin_unlock(lock) \ | 38 | #define spin_lock_mutex(lock, flags) \ |
| 57 | do { \ | ||
| 58 | if (debug_mutex_on) \ | ||
| 59 | spin_unlock(lock); \ | ||
| 60 | local_irq_enable(); \ | ||
| 61 | preempt_check_resched(); \ | ||
| 62 | } while (0) | ||
| 63 | |||
| 64 | #define debug_spin_lock_save(lock, flags) \ | ||
| 65 | do { \ | 39 | do { \ |
| 40 | struct mutex *l = container_of(lock, struct mutex, wait_lock); \ | ||
| 41 | \ | ||
| 42 | DEBUG_LOCKS_WARN_ON(in_interrupt()); \ | ||
| 66 | local_irq_save(flags); \ | 43 | local_irq_save(flags); \ |
| 67 | if (debug_mutex_on) \ | 44 | __raw_spin_lock(&(lock)->raw_lock); \ |
| 68 | spin_lock(lock); \ | 45 | DEBUG_LOCKS_WARN_ON(l->magic != l); \ |
| 69 | } while (0) | 46 | } while (0) |
| 70 | 47 | ||
| 71 | #define debug_spin_lock_restore(lock, flags) \ | 48 | #define spin_unlock_mutex(lock, flags) \ |
| 72 | do { \ | 49 | do { \ |
| 73 | if (debug_mutex_on) \ | 50 | __raw_spin_unlock(&(lock)->raw_lock); \ |
| 74 | spin_unlock(lock); \ | ||
| 75 | local_irq_restore(flags); \ | 51 | local_irq_restore(flags); \ |
| 76 | preempt_check_resched(); \ | 52 | preempt_check_resched(); \ |
| 77 | } while (0) | 53 | } while (0) |
| 78 | |||
| 79 | #define spin_lock_mutex(lock) \ | ||
| 80 | do { \ | ||
| 81 | struct mutex *l = container_of(lock, struct mutex, wait_lock); \ | ||
| 82 | \ | ||
| 83 | DEBUG_WARN_ON(in_interrupt()); \ | ||
| 84 | debug_spin_lock(&debug_mutex_lock); \ | ||
| 85 | spin_lock(lock); \ | ||
| 86 | DEBUG_WARN_ON(l->magic != l); \ | ||
| 87 | } while (0) | ||
| 88 | |||
| 89 | #define spin_unlock_mutex(lock) \ | ||
| 90 | do { \ | ||
| 91 | spin_unlock(lock); \ | ||
| 92 | debug_spin_unlock(&debug_mutex_lock); \ | ||
| 93 | } while (0) | ||
| 94 | |||
| 95 | #define DEBUG_OFF() \ | ||
| 96 | do { \ | ||
| 97 | if (debug_mutex_on) { \ | ||
| 98 | debug_mutex_on = 0; \ | ||
| 99 | console_verbose(); \ | ||
| 100 | if (spin_is_locked(&debug_mutex_lock)) \ | ||
| 101 | spin_unlock(&debug_mutex_lock); \ | ||
| 102 | } \ | ||
| 103 | } while (0) | ||
| 104 | |||
| 105 | #define DEBUG_BUG() \ | ||
| 106 | do { \ | ||
| 107 | if (debug_mutex_on) { \ | ||
| 108 | DEBUG_OFF(); \ | ||
| 109 | BUG(); \ | ||
| 110 | } \ | ||
| 111 | } while (0) | ||
| 112 | |||
| 113 | #define DEBUG_WARN_ON(c) \ | ||
| 114 | do { \ | ||
| 115 | if (unlikely(c && debug_mutex_on)) { \ | ||
| 116 | DEBUG_OFF(); \ | ||
| 117 | WARN_ON(1); \ | ||
| 118 | } \ | ||
| 119 | } while (0) | ||
| 120 | |||
| 121 | # define DEBUG_BUG_ON(c) \ | ||
| 122 | do { \ | ||
| 123 | if (unlikely(c)) \ | ||
| 124 | DEBUG_BUG(); \ | ||
| 125 | } while (0) | ||
| 126 | |||
| 127 | #ifdef CONFIG_SMP | ||
| 128 | # define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c) | ||
| 129 | # define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c) | ||
| 130 | #else | ||
| 131 | # define SMP_DEBUG_WARN_ON(c) do { } while (0) | ||
| 132 | # define SMP_DEBUG_BUG_ON(c) do { } while (0) | ||
| 133 | #endif | ||
| 134 | |||
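Two details of the new locking macros above are worth noting: the wait_lock is taken through __raw_spin_lock()/__raw_spin_unlock(), presumably so that lockdep does not recurse into the mutex's own internal spinlock, and the flags argument exists so that the debug and non-debug variants share one calling convention. In the non-debug header (kernel/mutex.h, not shown in this diff) the same macros are expected to reduce to plain spin_lock()/spin_unlock() with the flags unused, roughly:

	/*
	 * Sketch of the presumed non-debug counterparts in kernel/mutex.h
	 * (not part of this diff): no irq games, flags deliberately ignored.
	 */
	#define spin_lock_mutex(lock, flags) \
			do { spin_lock(lock); (void)(flags); } while (0)
	#define spin_unlock_mutex(lock, flags) \
			do { spin_unlock(lock); (void)(flags); } while (0)
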
diff --git a/kernel/mutex.c b/kernel/mutex.c index 5449b210d9ed..8c71cf72a497 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -17,6 +17,7 @@ | |||
| 17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
| 18 | #include <linux/spinlock.h> | 18 | #include <linux/spinlock.h> |
| 19 | #include <linux/interrupt.h> | 19 | #include <linux/interrupt.h> |
| 20 | #include <linux/debug_locks.h> | ||
| 20 | 21 | ||
| 21 | /* | 22 | /* |
| 22 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, | 23 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, |
| @@ -38,13 +39,14 @@ | |||
| 38 | * | 39 | * |
| 39 | * It is not allowed to initialize an already locked mutex. | 40 | * It is not allowed to initialize an already locked mutex. |
| 40 | */ | 41 | */ |
| 41 | void fastcall __mutex_init(struct mutex *lock, const char *name) | 42 | void |
| 43 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | ||
| 42 | { | 44 | { |
| 43 | atomic_set(&lock->count, 1); | 45 | atomic_set(&lock->count, 1); |
| 44 | spin_lock_init(&lock->wait_lock); | 46 | spin_lock_init(&lock->wait_lock); |
| 45 | INIT_LIST_HEAD(&lock->wait_list); | 47 | INIT_LIST_HEAD(&lock->wait_list); |
| 46 | 48 | ||
| 47 | debug_mutex_init(lock, name); | 49 | debug_mutex_init(lock, name, key); |
| 48 | } | 50 | } |
| 49 | 51 | ||
| 50 | EXPORT_SYMBOL(__mutex_init); | 52 | EXPORT_SYMBOL(__mutex_init); |
| @@ -56,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init); | |||
| 56 | * branch is predicted by the CPU as default-untaken. | 58 | * branch is predicted by the CPU as default-untaken. |
| 57 | */ | 59 | */ |
| 58 | static void fastcall noinline __sched | 60 | static void fastcall noinline __sched |
| 59 | __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); | 61 | __mutex_lock_slowpath(atomic_t *lock_count); |
| 60 | 62 | ||
| 61 | /*** | 63 | /*** |
| 62 | * mutex_lock - acquire the mutex | 64 | * mutex_lock - acquire the mutex |
| @@ -79,7 +81,7 @@ __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); | |||
| 79 | * | 81 | * |
| 80 | * This function is similar to (but not equivalent to) down(). | 82 | * This function is similar to (but not equivalent to) down(). |
| 81 | */ | 83 | */ |
| 82 | void fastcall __sched mutex_lock(struct mutex *lock) | 84 | void inline fastcall __sched mutex_lock(struct mutex *lock) |
| 83 | { | 85 | { |
| 84 | might_sleep(); | 86 | might_sleep(); |
| 85 | /* | 87 | /* |
| @@ -92,7 +94,7 @@ void fastcall __sched mutex_lock(struct mutex *lock) | |||
| 92 | EXPORT_SYMBOL(mutex_lock); | 94 | EXPORT_SYMBOL(mutex_lock); |
| 93 | 95 | ||
| 94 | static void fastcall noinline __sched | 96 | static void fastcall noinline __sched |
| 95 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__); | 97 | __mutex_unlock_slowpath(atomic_t *lock_count); |
| 96 | 98 | ||
| 97 | /*** | 99 | /*** |
| 98 | * mutex_unlock - release the mutex | 100 | * mutex_unlock - release the mutex |
| @@ -120,17 +122,18 @@ EXPORT_SYMBOL(mutex_unlock); | |||
| 120 | * Lock a mutex (possibly interruptible), slowpath: | 122 | * Lock a mutex (possibly interruptible), slowpath: |
| 121 | */ | 123 | */ |
| 122 | static inline int __sched | 124 | static inline int __sched |
| 123 | __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | 125 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) |
| 124 | { | 126 | { |
| 125 | struct task_struct *task = current; | 127 | struct task_struct *task = current; |
| 126 | struct mutex_waiter waiter; | 128 | struct mutex_waiter waiter; |
| 127 | unsigned int old_val; | 129 | unsigned int old_val; |
| 130 | unsigned long flags; | ||
| 128 | 131 | ||
| 129 | debug_mutex_init_waiter(&waiter); | 132 | spin_lock_mutex(&lock->wait_lock, flags); |
| 130 | 133 | ||
| 131 | spin_lock_mutex(&lock->wait_lock); | 134 | debug_mutex_lock_common(lock, &waiter); |
| 132 | 135 | mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | |
| 133 | debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); | 136 | debug_mutex_add_waiter(lock, &waiter, task->thread_info); |
| 134 | 137 | ||
| 135 | /* add waiting tasks to the end of the waitqueue (FIFO): */ | 138 | /* add waiting tasks to the end of the waitqueue (FIFO): */ |
| 136 | list_add_tail(&waiter.list, &lock->wait_list); | 139 | list_add_tail(&waiter.list, &lock->wait_list); |
| @@ -157,7 +160,8 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
| 157 | if (unlikely(state == TASK_INTERRUPTIBLE && | 160 | if (unlikely(state == TASK_INTERRUPTIBLE && |
| 158 | signal_pending(task))) { | 161 | signal_pending(task))) { |
| 159 | mutex_remove_waiter(lock, &waiter, task->thread_info); | 162 | mutex_remove_waiter(lock, &waiter, task->thread_info); |
| 160 | spin_unlock_mutex(&lock->wait_lock); | 163 | mutex_release(&lock->dep_map, 1, _RET_IP_); |
| 164 | spin_unlock_mutex(&lock->wait_lock, flags); | ||
| 161 | 165 | ||
| 162 | debug_mutex_free_waiter(&waiter); | 166 | debug_mutex_free_waiter(&waiter); |
| 163 | return -EINTR; | 167 | return -EINTR; |
| @@ -165,48 +169,57 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
| 165 | __set_task_state(task, state); | 169 | __set_task_state(task, state); |
| 166 | 170 | ||
| 167 | /* didnt get the lock, go to sleep: */ | 171 | /* didnt get the lock, go to sleep: */ |
| 168 | spin_unlock_mutex(&lock->wait_lock); | 172 | spin_unlock_mutex(&lock->wait_lock, flags); |
| 169 | schedule(); | 173 | schedule(); |
| 170 | spin_lock_mutex(&lock->wait_lock); | 174 | spin_lock_mutex(&lock->wait_lock, flags); |
| 171 | } | 175 | } |
| 172 | 176 | ||
| 173 | /* got the lock - rejoice! */ | 177 | /* got the lock - rejoice! */ |
| 174 | mutex_remove_waiter(lock, &waiter, task->thread_info); | 178 | mutex_remove_waiter(lock, &waiter, task->thread_info); |
| 175 | debug_mutex_set_owner(lock, task->thread_info __IP__); | 179 | debug_mutex_set_owner(lock, task->thread_info); |
| 176 | 180 | ||
| 177 | /* set it to 0 if there are no waiters left: */ | 181 | /* set it to 0 if there are no waiters left: */ |
| 178 | if (likely(list_empty(&lock->wait_list))) | 182 | if (likely(list_empty(&lock->wait_list))) |
| 179 | atomic_set(&lock->count, 0); | 183 | atomic_set(&lock->count, 0); |
| 180 | 184 | ||
| 181 | spin_unlock_mutex(&lock->wait_lock); | 185 | spin_unlock_mutex(&lock->wait_lock, flags); |
| 182 | 186 | ||
| 183 | debug_mutex_free_waiter(&waiter); | 187 | debug_mutex_free_waiter(&waiter); |
| 184 | 188 | ||
| 185 | DEBUG_WARN_ON(list_empty(&lock->held_list)); | ||
| 186 | DEBUG_WARN_ON(lock->owner != task->thread_info); | ||
| 187 | |||
| 188 | return 0; | 189 | return 0; |
| 189 | } | 190 | } |
| 190 | 191 | ||
| 191 | static void fastcall noinline __sched | 192 | static void fastcall noinline __sched |
| 192 | __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__) | 193 | __mutex_lock_slowpath(atomic_t *lock_count) |
| 193 | { | 194 | { |
| 194 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 195 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 195 | 196 | ||
| 196 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__); | 197 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); |
| 198 | } | ||
| 199 | |||
| 200 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 201 | void __sched | ||
| 202 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) | ||
| 203 | { | ||
| 204 | might_sleep(); | ||
| 205 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); | ||
| 197 | } | 206 | } |
| 198 | 207 | ||
| 208 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | ||
| 209 | #endif | ||
| 210 | |||
| 199 | /* | 211 | /* |
| 200 | * Release the lock, slowpath: | 212 | * Release the lock, slowpath: |
| 201 | */ | 213 | */ |
| 202 | static fastcall noinline void | 214 | static fastcall inline void |
| 203 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | 215 | __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) |
| 204 | { | 216 | { |
| 205 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 217 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 218 | unsigned long flags; | ||
| 206 | 219 | ||
| 207 | DEBUG_WARN_ON(lock->owner != current_thread_info()); | 220 | spin_lock_mutex(&lock->wait_lock, flags); |
| 208 | 221 | mutex_release(&lock->dep_map, nested, _RET_IP_); | |
| 209 | spin_lock_mutex(&lock->wait_lock); | 222 | debug_mutex_unlock(lock); |
| 210 | 223 | ||
| 211 | /* | 224 | /* |
| 212 | * some architectures leave the lock unlocked in the fastpath failure | 225 | * some architectures leave the lock unlocked in the fastpath failure |
| @@ -216,8 +229,6 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | |||
| 216 | if (__mutex_slowpath_needs_to_unlock()) | 229 | if (__mutex_slowpath_needs_to_unlock()) |
| 217 | atomic_set(&lock->count, 1); | 230 | atomic_set(&lock->count, 1); |
| 218 | 231 | ||
| 219 | debug_mutex_unlock(lock); | ||
| 220 | |||
| 221 | if (!list_empty(&lock->wait_list)) { | 232 | if (!list_empty(&lock->wait_list)) { |
| 222 | /* get the first entry from the wait-list: */ | 233 | /* get the first entry from the wait-list: */ |
| 223 | struct mutex_waiter *waiter = | 234 | struct mutex_waiter *waiter = |
| @@ -231,7 +242,16 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | |||
| 231 | 242 | ||
| 232 | debug_mutex_clear_owner(lock); | 243 | debug_mutex_clear_owner(lock); |
| 233 | 244 | ||
| 234 | spin_unlock_mutex(&lock->wait_lock); | 245 | spin_unlock_mutex(&lock->wait_lock, flags); |
| 246 | } | ||
| 247 | |||
| 248 | /* | ||
| 249 | * Release the lock, slowpath: | ||
| 250 | */ | ||
| 251 | static fastcall noinline void | ||
| 252 | __mutex_unlock_slowpath(atomic_t *lock_count) | ||
| 253 | { | ||
| 254 | __mutex_unlock_common_slowpath(lock_count, 1); | ||
| 235 | } | 255 | } |
| 236 | 256 | ||
| 237 | /* | 257 | /* |
| @@ -239,7 +259,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | |||
| 239 | * mutex_lock_interruptible() and mutex_trylock(). | 259 | * mutex_lock_interruptible() and mutex_trylock(). |
| 240 | */ | 260 | */ |
| 241 | static int fastcall noinline __sched | 261 | static int fastcall noinline __sched |
| 242 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__); | 262 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); |
| 243 | 263 | ||
| 244 | /*** | 264 | /*** |
| 245 | * mutex_lock_interruptible - acquire the mutex, interruptable | 265 | * mutex_lock_interruptible - acquire the mutex, interruptable |
| @@ -262,11 +282,11 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock) | |||
| 262 | EXPORT_SYMBOL(mutex_lock_interruptible); | 282 | EXPORT_SYMBOL(mutex_lock_interruptible); |
| 263 | 283 | ||
| 264 | static int fastcall noinline __sched | 284 | static int fastcall noinline __sched |
| 265 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) | 285 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count) |
| 266 | { | 286 | { |
| 267 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 287 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 268 | 288 | ||
| 269 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__); | 289 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); |
| 270 | } | 290 | } |
| 271 | 291 | ||
| 272 | /* | 292 | /* |
| @@ -276,18 +296,21 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) | |||
| 276 | static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | 296 | static inline int __mutex_trylock_slowpath(atomic_t *lock_count) |
| 277 | { | 297 | { |
| 278 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 298 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 299 | unsigned long flags; | ||
| 279 | int prev; | 300 | int prev; |
| 280 | 301 | ||
| 281 | spin_lock_mutex(&lock->wait_lock); | 302 | spin_lock_mutex(&lock->wait_lock, flags); |
| 282 | 303 | ||
| 283 | prev = atomic_xchg(&lock->count, -1); | 304 | prev = atomic_xchg(&lock->count, -1); |
| 284 | if (likely(prev == 1)) | 305 | if (likely(prev == 1)) { |
| 285 | debug_mutex_set_owner(lock, current_thread_info() __RET_IP__); | 306 | debug_mutex_set_owner(lock, current_thread_info()); |
| 307 | mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); | ||
| 308 | } | ||
| 286 | /* Set it back to 0 if there are no waiters: */ | 309 | /* Set it back to 0 if there are no waiters: */ |
| 287 | if (likely(list_empty(&lock->wait_list))) | 310 | if (likely(list_empty(&lock->wait_list))) |
| 288 | atomic_set(&lock->count, 0); | 311 | atomic_set(&lock->count, 0); |
| 289 | 312 | ||
| 290 | spin_unlock_mutex(&lock->wait_lock); | 313 | spin_unlock_mutex(&lock->wait_lock, flags); |
| 291 | 314 | ||
| 292 | return prev == 1; | 315 | return prev == 1; |
| 293 | } | 316 | } |
| @@ -306,7 +329,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | |||
| 306 | * This function must not be used in interrupt context. The | 329 | * This function must not be used in interrupt context. The |
| 307 | * mutex must be released by the same task that acquired it. | 330 | * mutex must be released by the same task that acquired it. |
| 308 | */ | 331 | */ |
| 309 | int fastcall mutex_trylock(struct mutex *lock) | 332 | int fastcall __sched mutex_trylock(struct mutex *lock) |
| 310 | { | 333 | { |
| 311 | return __mutex_fastpath_trylock(&lock->count, | 334 | return __mutex_fastpath_trylock(&lock->count, |
| 312 | __mutex_trylock_slowpath); | 335 | __mutex_trylock_slowpath); |
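The mutex_lock_nested() entry point added above is for callers that legitimately take two mutexes of the same lock class, e.g. two objects of the same type locked in a fixed order; the subclass tells the lock validator that the nesting is intentional rather than a self-deadlock. A minimal sketch of such a caller, assuming CONFIG_DEBUG_LOCK_ALLOC and the SINGLE_DEPTH_NESTING constant from the lockdep headers; the struct and helper names are illustrative, not part of this patch:

#include <linux/mutex.h>
#include <linux/lockdep.h>

struct my_node {
	struct mutex lock;
	int value;
};

/* Callers are assumed to hand in the two nodes in a fixed global order. */
static void lock_node_pair(struct my_node *a, struct my_node *b)
{
	mutex_lock(&a->lock);
	/*
	 * Second acquisition of the same lock class: annotate the nesting
	 * level so the validator does not report a false deadlock between
	 * the two node locks.
	 */
	mutex_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
}

static void unlock_node_pair(struct my_node *a, struct my_node *b)
{
	mutex_unlock(&b->lock);
	mutex_unlock(&a->lock);
}

Without CONFIG_DEBUG_LOCK_ALLOC the nested variant is expected to fall back to a plain mutex_lock(), so the annotation costs nothing in production builds.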
diff --git a/kernel/mutex.h b/kernel/mutex.h index 00fe84e7b672..a075dafbb290 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h | |||
| @@ -9,27 +9,22 @@ | |||
| 9 | * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: | 9 | * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: |
| 10 | */ | 10 | */ |
| 11 | 11 | ||
| 12 | #define spin_lock_mutex(lock) spin_lock(lock) | 12 | #define spin_lock_mutex(lock, flags) \ |
| 13 | #define spin_unlock_mutex(lock) spin_unlock(lock) | 13 | do { spin_lock(lock); (void)(flags); } while (0) |
| 14 | #define spin_unlock_mutex(lock, flags) \ | ||
| 15 | do { spin_unlock(lock); (void)(flags); } while (0) | ||
| 14 | #define mutex_remove_waiter(lock, waiter, ti) \ | 16 | #define mutex_remove_waiter(lock, waiter, ti) \ |
| 15 | __list_del((waiter)->list.prev, (waiter)->list.next) | 17 | __list_del((waiter)->list.prev, (waiter)->list.next) |
| 16 | 18 | ||
| 17 | #define DEBUG_WARN_ON(c) do { } while (0) | ||
| 18 | #define debug_mutex_set_owner(lock, new_owner) do { } while (0) | 19 | #define debug_mutex_set_owner(lock, new_owner) do { } while (0) |
| 19 | #define debug_mutex_clear_owner(lock) do { } while (0) | 20 | #define debug_mutex_clear_owner(lock) do { } while (0) |
| 20 | #define debug_mutex_init_waiter(waiter) do { } while (0) | ||
| 21 | #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) | 21 | #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) |
| 22 | #define debug_mutex_free_waiter(waiter) do { } while (0) | 22 | #define debug_mutex_free_waiter(waiter) do { } while (0) |
| 23 | #define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0) | 23 | #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) |
| 24 | #define debug_mutex_unlock(lock) do { } while (0) | 24 | #define debug_mutex_unlock(lock) do { } while (0) |
| 25 | #define debug_mutex_init(lock, name) do { } while (0) | 25 | #define debug_mutex_init(lock, name, key) do { } while (0) |
| 26 | |||
| 27 | /* | ||
| 28 | * Return-address parameters/declarations. They are very useful for | ||
| 29 | * debugging, but add overhead in the !DEBUG case - so we go the | ||
| 30 | * trouble of using this not too elegant but zero-cost solution: | ||
| 31 | */ | ||
| 32 | #define __IP_DECL__ | ||
| 33 | #define __IP__ | ||
| 34 | #define __RET_IP__ | ||
| 35 | 26 | ||
| 27 | static inline void | ||
| 28 | debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) | ||
| 29 | { | ||
| 30 | } | ||
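In the !CONFIG_DEBUG_MUTEXES header above, spin_lock_mutex() gains a flags argument only so that it keeps the same signature as the debug variant (which, in mutex-debug.h and not shown in this hunk, saves and restores interrupt state); here the flags are merely evaluated and discarded. The same uniform-signature idiom in isolation, with hypothetical macro names:

#include <linux/spinlock.h>

#ifdef MY_IRQSAFE_VARIANT
# define my_lock(lock, flags)		spin_lock_irqsave(lock, flags)
# define my_unlock(lock, flags)		spin_unlock_irqrestore(lock, flags)
#else
/* Non-IRQ variant: consume 'flags' so call sites compile unchanged. */
# define my_lock(lock, flags) \
		do { spin_lock(lock); (void)(flags); } while (0)
# define my_unlock(lock, flags) \
		do { spin_unlock(lock); (void)(flags); } while (0)
#endif

Call sites then always declare an unsigned long flags and pass it through, exactly as __mutex_lock_common() does above.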
diff --git a/kernel/panic.c b/kernel/panic.c index cc2a4c9c36ac..525e365f7239 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -8,7 +8,6 @@ | |||
| 8 | * This function is used through-out the kernel (including mm and fs) | 8 | * This function is used through-out the kernel (including mm and fs) |
| 9 | * to indicate a major problem. | 9 | * to indicate a major problem. |
| 10 | */ | 10 | */ |
| 11 | #include <linux/config.h> | ||
| 12 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 13 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
| 14 | #include <linux/delay.h> | 13 | #include <linux/delay.h> |
| @@ -19,6 +18,7 @@ | |||
| 19 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
| 20 | #include <linux/nmi.h> | 19 | #include <linux/nmi.h> |
| 21 | #include <linux/kexec.h> | 20 | #include <linux/kexec.h> |
| 21 | #include <linux/debug_locks.h> | ||
| 22 | 22 | ||
| 23 | int panic_on_oops; | 23 | int panic_on_oops; |
| 24 | int tainted; | 24 | int tainted; |
| @@ -173,6 +173,7 @@ const char *print_tainted(void) | |||
| 173 | 173 | ||
| 174 | void add_taint(unsigned flag) | 174 | void add_taint(unsigned flag) |
| 175 | { | 175 | { |
| 176 | debug_locks = 0; /* can't trust the integrity of the kernel anymore */ | ||
| 176 | tainted |= flag; | 177 | tainted |= flag; |
| 177 | } | 178 | } |
| 178 | EXPORT_SYMBOL(add_taint); | 179 | EXPORT_SYMBOL(add_taint); |
| @@ -257,6 +258,7 @@ int oops_may_print(void) | |||
| 257 | */ | 258 | */ |
| 258 | void oops_enter(void) | 259 | void oops_enter(void) |
| 259 | { | 260 | { |
| 261 | debug_locks_off(); /* can't trust the integrity of the kernel anymore */ | ||
| 260 | do_oops_enter_exit(); | 262 | do_oops_enter_exit(); |
| 261 | } | 263 | } |
| 262 | 264 | ||
| @@ -268,3 +270,15 @@ void oops_exit(void) | |||
| 268 | { | 270 | { |
| 269 | do_oops_enter_exit(); | 271 | do_oops_enter_exit(); |
| 270 | } | 272 | } |
| 273 | |||
| 274 | #ifdef CONFIG_CC_STACKPROTECTOR | ||
| 275 | /* | ||
| 276 | * Called when gcc's -fstack-protector feature is used, and | ||
| 277 | * gcc detects corruption of the on-stack canary value | ||
| 278 | */ | ||
| 279 | void __stack_chk_fail(void) | ||
| 280 | { | ||
| 281 | panic("stack-protector: Kernel stack is corrupted"); | ||
| 282 | } | ||
| 283 | EXPORT_SYMBOL(__stack_chk_fail); | ||
| 284 | #endif | ||
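__stack_chk_fail() is the routine gcc emits a call to from a protected function's epilogue when -fstack-protector finds the on-stack canary overwritten; the kernel's version panics because the stack can no longer be trusted. A user-space illustration of the kind of bug that trips the check (compile with gcc -fstack-protector-all; the overflow is deliberate and the function is purely illustrative):

#include <string.h>

/*
 * With stack protection enabled the compiler places a canary between
 * buf and the saved return address; the overflow below corrupts it,
 * and the epilogue check calls __stack_chk_fail() instead of
 * returning through a smashed frame.
 */
void copy_name(const char *src)
{
	char buf[8];

	strcpy(buf, src);	/* overflows whenever src is 8 bytes or longer */
}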
diff --git a/kernel/params.c b/kernel/params.c index af43ecdc8d9b..f406655d6653 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -15,7 +15,6 @@ | |||
| 15 | along with this program; if not, write to the Free Software | 15 | along with this program; if not, write to the Free Software |
| 16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 17 | */ | 17 | */ |
| 18 | #include <linux/config.h> | ||
| 19 | #include <linux/moduleparam.h> | 18 | #include <linux/moduleparam.h> |
| 20 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
| 21 | #include <linux/string.h> | 20 | #include <linux/string.h> |
| @@ -548,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name, | |||
| 548 | unsigned int name_skip) | 547 | unsigned int name_skip) |
| 549 | { | 548 | { |
| 550 | struct module_kobject *mk; | 549 | struct module_kobject *mk; |
| 550 | int ret; | ||
| 551 | 551 | ||
| 552 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); | 552 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); |
| 553 | BUG_ON(!mk); | 553 | BUG_ON(!mk); |
| @@ -555,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name, | |||
| 555 | mk->mod = THIS_MODULE; | 555 | mk->mod = THIS_MODULE; |
| 556 | kobj_set_kset_s(mk, module_subsys); | 556 | kobj_set_kset_s(mk, module_subsys); |
| 557 | kobject_set_name(&mk->kobj, name); | 557 | kobject_set_name(&mk->kobj, name); |
| 558 | kobject_register(&mk->kobj); | 558 | ret = kobject_register(&mk->kobj); |
| 559 | BUG_ON(ret < 0); | ||
| 559 | 560 | ||
| 560 | /* no need to keep the kobject if no parameter is exported */ | 561 | /* no need to keep the kobject if no parameter is exported */ |
| 561 | if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { | 562 | if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { |
| @@ -685,13 +686,20 @@ decl_subsys(module, &module_ktype, NULL); | |||
| 685 | */ | 686 | */ |
| 686 | static int __init param_sysfs_init(void) | 687 | static int __init param_sysfs_init(void) |
| 687 | { | 688 | { |
| 688 | subsystem_register(&module_subsys); | 689 | int ret; |
| 690 | |||
| 691 | ret = subsystem_register(&module_subsys); | ||
| 692 | if (ret < 0) { | ||
| 693 | printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", | ||
| 694 | __FILE__, __LINE__, ret); | ||
| 695 | return ret; | ||
| 696 | } | ||
| 689 | 697 | ||
| 690 | param_sysfs_builtin(); | 698 | param_sysfs_builtin(); |
| 691 | 699 | ||
| 692 | return 0; | 700 | return 0; |
| 693 | } | 701 | } |
| 694 | __initcall(param_sysfs_init); | 702 | subsys_initcall(param_sysfs_init); |
| 695 | 703 | ||
| 696 | EXPORT_SYMBOL(param_set_byte); | 704 | EXPORT_SYMBOL(param_set_byte); |
| 697 | EXPORT_SYMBOL(param_get_byte); | 705 | EXPORT_SYMBOL(param_get_byte); |
diff --git a/kernel/pid.c b/kernel/pid.c index eeb836b65ca4..8387e8c68193 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -218,14 +218,11 @@ struct pid * fastcall find_pid(int nr) | |||
| 218 | return NULL; | 218 | return NULL; |
| 219 | } | 219 | } |
| 220 | 220 | ||
| 221 | int fastcall attach_pid(task_t *task, enum pid_type type, int nr) | 221 | int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) |
| 222 | { | 222 | { |
| 223 | struct pid_link *link; | 223 | struct pid_link *link; |
| 224 | struct pid *pid; | 224 | struct pid *pid; |
| 225 | 225 | ||
| 226 | WARN_ON(!task->pid); /* to be removed soon */ | ||
| 227 | WARN_ON(!nr); /* to be removed soon */ | ||
| 228 | |||
| 229 | link = &task->pids[type]; | 226 | link = &task->pids[type]; |
| 230 | link->pid = pid = find_pid(nr); | 227 | link->pid = pid = find_pid(nr); |
| 231 | hlist_add_head_rcu(&link->node, &pid->tasks[type]); | 228 | hlist_add_head_rcu(&link->node, &pid->tasks[type]); |
| @@ -233,7 +230,7 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) | |||
| 233 | return 0; | 230 | return 0; |
| 234 | } | 231 | } |
| 235 | 232 | ||
| 236 | void fastcall detach_pid(task_t *task, enum pid_type type) | 233 | void fastcall detach_pid(struct task_struct *task, enum pid_type type) |
| 237 | { | 234 | { |
| 238 | struct pid_link *link; | 235 | struct pid_link *link; |
| 239 | struct pid *pid; | 236 | struct pid *pid; |
| @@ -252,6 +249,15 @@ void fastcall detach_pid(task_t *task, enum pid_type type) | |||
| 252 | free_pid(pid); | 249 | free_pid(pid); |
| 253 | } | 250 | } |
| 254 | 251 | ||
| 252 | /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ | ||
| 253 | void fastcall transfer_pid(struct task_struct *old, struct task_struct *new, | ||
| 254 | enum pid_type type) | ||
| 255 | { | ||
| 256 | new->pids[type].pid = old->pids[type].pid; | ||
| 257 | hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); | ||
| 258 | old->pids[type].pid = NULL; | ||
| 259 | } | ||
| 260 | |||
| 255 | struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) | 261 | struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) |
| 256 | { | 262 | { |
| 257 | struct task_struct *result = NULL; | 263 | struct task_struct *result = NULL; |
| @@ -267,7 +273,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) | |||
| 267 | /* | 273 | /* |
| 268 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. | 274 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. |
| 269 | */ | 275 | */ |
| 270 | task_t *find_task_by_pid_type(int type, int nr) | 276 | struct task_struct *find_task_by_pid_type(int type, int nr) |
| 271 | { | 277 | { |
| 272 | return pid_task(find_pid(nr), type); | 278 | return pid_task(find_pid(nr), type); |
| 273 | } | 279 | } |
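transfer_pid() relies on hlist_replace_rcu(), so RCU readers walking the pid hash see a single pointer switch from the old task to the new one rather than the empty window a detach_pid()/attach_pid() pair would leave. A sketch of a caller; the real user is the exec/de_thread() path, which is not part of this hunk, and the helper below is hypothetical. As with attach_pid()/detach_pid(), it is assumed to run with the tasklist lock held for writing:

#include <linux/sched.h>
#include <linux/pid.h>

/* Hypothetical helper: move the group-leadership links over in one step. */
static void hand_over_leadership(struct task_struct *old_leader,
				 struct task_struct *new_leader)
{
	transfer_pid(old_leader, new_leader, PIDTYPE_PGID);
	transfer_pid(old_leader, new_leader, PIDTYPE_SID);
}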
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index d38d9ec3276c..479b16b44f79 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -1393,25 +1393,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
| 1393 | } | 1393 | } |
| 1394 | } | 1394 | } |
| 1395 | 1395 | ||
| 1396 | static long posix_cpu_clock_nanosleep_restart(struct restart_block *); | 1396 | static int do_cpu_nanosleep(const clockid_t which_clock, int flags, |
| 1397 | 1397 | struct timespec *rqtp, struct itimerspec *it) | |
| 1398 | int posix_cpu_nsleep(const clockid_t which_clock, int flags, | ||
| 1399 | struct timespec *rqtp, struct timespec __user *rmtp) | ||
| 1400 | { | 1398 | { |
| 1401 | struct restart_block *restart_block = | ||
| 1402 | &current_thread_info()->restart_block; | ||
| 1403 | struct k_itimer timer; | 1399 | struct k_itimer timer; |
| 1404 | int error; | 1400 | int error; |
| 1405 | 1401 | ||
| 1406 | /* | 1402 | /* |
| 1407 | * Diagnose required errors first. | ||
| 1408 | */ | ||
| 1409 | if (CPUCLOCK_PERTHREAD(which_clock) && | ||
| 1410 | (CPUCLOCK_PID(which_clock) == 0 || | ||
| 1411 | CPUCLOCK_PID(which_clock) == current->pid)) | ||
| 1412 | return -EINVAL; | ||
| 1413 | |||
| 1414 | /* | ||
| 1415 | * Set up a temporary timer and then wait for it to go off. | 1403 | * Set up a temporary timer and then wait for it to go off. |
| 1416 | */ | 1404 | */ |
| 1417 | memset(&timer, 0, sizeof timer); | 1405 | memset(&timer, 0, sizeof timer); |
| @@ -1422,11 +1410,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
| 1422 | timer.it_process = current; | 1410 | timer.it_process = current; |
| 1423 | if (!error) { | 1411 | if (!error) { |
| 1424 | static struct itimerspec zero_it; | 1412 | static struct itimerspec zero_it; |
| 1425 | struct itimerspec it = { .it_value = *rqtp, | 1413 | |
| 1426 | .it_interval = {} }; | 1414 | memset(it, 0, sizeof *it); |
| 1415 | it->it_value = *rqtp; | ||
| 1427 | 1416 | ||
| 1428 | spin_lock_irq(&timer.it_lock); | 1417 | spin_lock_irq(&timer.it_lock); |
| 1429 | error = posix_cpu_timer_set(&timer, flags, &it, NULL); | 1418 | error = posix_cpu_timer_set(&timer, flags, it, NULL); |
| 1430 | if (error) { | 1419 | if (error) { |
| 1431 | spin_unlock_irq(&timer.it_lock); | 1420 | spin_unlock_irq(&timer.it_lock); |
| 1432 | return error; | 1421 | return error; |
| @@ -1454,49 +1443,89 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
| 1454 | * We were interrupted by a signal. | 1443 | * We were interrupted by a signal. |
| 1455 | */ | 1444 | */ |
| 1456 | sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); | 1445 | sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); |
| 1457 | posix_cpu_timer_set(&timer, 0, &zero_it, &it); | 1446 | posix_cpu_timer_set(&timer, 0, &zero_it, it); |
| 1458 | spin_unlock_irq(&timer.it_lock); | 1447 | spin_unlock_irq(&timer.it_lock); |
| 1459 | 1448 | ||
| 1460 | if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { | 1449 | if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { |
| 1461 | /* | 1450 | /* |
| 1462 | * It actually did fire already. | 1451 | * It actually did fire already. |
| 1463 | */ | 1452 | */ |
| 1464 | return 0; | 1453 | return 0; |
| 1465 | } | 1454 | } |
| 1466 | 1455 | ||
| 1456 | error = -ERESTART_RESTARTBLOCK; | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | return error; | ||
| 1460 | } | ||
| 1461 | |||
| 1462 | int posix_cpu_nsleep(const clockid_t which_clock, int flags, | ||
| 1463 | struct timespec *rqtp, struct timespec __user *rmtp) | ||
| 1464 | { | ||
| 1465 | struct restart_block *restart_block = | ||
| 1466 | &current_thread_info()->restart_block; | ||
| 1467 | struct itimerspec it; | ||
| 1468 | int error; | ||
| 1469 | |||
| 1470 | /* | ||
| 1471 | * Diagnose required errors first. | ||
| 1472 | */ | ||
| 1473 | if (CPUCLOCK_PERTHREAD(which_clock) && | ||
| 1474 | (CPUCLOCK_PID(which_clock) == 0 || | ||
| 1475 | CPUCLOCK_PID(which_clock) == current->pid)) | ||
| 1476 | return -EINVAL; | ||
| 1477 | |||
| 1478 | error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); | ||
| 1479 | |||
| 1480 | if (error == -ERESTART_RESTARTBLOCK) { | ||
| 1481 | |||
| 1482 | if (flags & TIMER_ABSTIME) | ||
| 1483 | return -ERESTARTNOHAND; | ||
| 1467 | /* | 1484 | /* |
| 1468 | * Report back to the user the time still remaining. | 1485 | * Report back to the user the time still remaining. |
| 1469 | */ | 1486 | */ |
| 1470 | if (rmtp != NULL && !(flags & TIMER_ABSTIME) && | 1487 | if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) |
| 1471 | copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | ||
| 1472 | return -EFAULT; | 1488 | return -EFAULT; |
| 1473 | 1489 | ||
| 1474 | restart_block->fn = posix_cpu_clock_nanosleep_restart; | 1490 | restart_block->fn = posix_cpu_nsleep_restart; |
| 1475 | /* Caller already set restart_block->arg1 */ | ||
| 1476 | restart_block->arg0 = which_clock; | 1491 | restart_block->arg0 = which_clock; |
| 1477 | restart_block->arg1 = (unsigned long) rmtp; | 1492 | restart_block->arg1 = (unsigned long) rmtp; |
| 1478 | restart_block->arg2 = rqtp->tv_sec; | 1493 | restart_block->arg2 = rqtp->tv_sec; |
| 1479 | restart_block->arg3 = rqtp->tv_nsec; | 1494 | restart_block->arg3 = rqtp->tv_nsec; |
| 1480 | |||
| 1481 | error = -ERESTART_RESTARTBLOCK; | ||
| 1482 | } | 1495 | } |
| 1483 | |||
| 1484 | return error; | 1496 | return error; |
| 1485 | } | 1497 | } |
| 1486 | 1498 | ||
| 1487 | static long | 1499 | long posix_cpu_nsleep_restart(struct restart_block *restart_block) |
| 1488 | posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) | ||
| 1489 | { | 1500 | { |
| 1490 | clockid_t which_clock = restart_block->arg0; | 1501 | clockid_t which_clock = restart_block->arg0; |
| 1491 | struct timespec __user *rmtp; | 1502 | struct timespec __user *rmtp; |
| 1492 | struct timespec t; | 1503 | struct timespec t; |
| 1504 | struct itimerspec it; | ||
| 1505 | int error; | ||
| 1493 | 1506 | ||
| 1494 | rmtp = (struct timespec __user *) restart_block->arg1; | 1507 | rmtp = (struct timespec __user *) restart_block->arg1; |
| 1495 | t.tv_sec = restart_block->arg2; | 1508 | t.tv_sec = restart_block->arg2; |
| 1496 | t.tv_nsec = restart_block->arg3; | 1509 | t.tv_nsec = restart_block->arg3; |
| 1497 | 1510 | ||
| 1498 | restart_block->fn = do_no_restart_syscall; | 1511 | restart_block->fn = do_no_restart_syscall; |
| 1499 | return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); | 1512 | error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); |
| 1513 | |||
| 1514 | if (error == -ERESTART_RESTARTBLOCK) { | ||
| 1515 | /* | ||
| 1516 | * Report back to the user the time still remaining. | ||
| 1517 | */ | ||
| 1518 | if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | ||
| 1519 | return -EFAULT; | ||
| 1520 | |||
| 1521 | restart_block->fn = posix_cpu_nsleep_restart; | ||
| 1522 | restart_block->arg0 = which_clock; | ||
| 1523 | restart_block->arg1 = (unsigned long) rmtp; | ||
| 1524 | restart_block->arg2 = t.tv_sec; | ||
| 1525 | restart_block->arg3 = t.tv_nsec; | ||
| 1526 | } | ||
| 1527 | return error; | ||
| 1528 | |||
| 1500 | } | 1529 | } |
| 1501 | 1530 | ||
| 1502 | 1531 | ||
| @@ -1524,6 +1553,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags, | |||
| 1524 | { | 1553 | { |
| 1525 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); | 1554 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); |
| 1526 | } | 1555 | } |
| 1556 | static long process_cpu_nsleep_restart(struct restart_block *restart_block) | ||
| 1557 | { | ||
| 1558 | return -EINVAL; | ||
| 1559 | } | ||
| 1527 | static int thread_cpu_clock_getres(const clockid_t which_clock, | 1560 | static int thread_cpu_clock_getres(const clockid_t which_clock, |
| 1528 | struct timespec *tp) | 1561 | struct timespec *tp) |
| 1529 | { | 1562 | { |
| @@ -1544,6 +1577,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags, | |||
| 1544 | { | 1577 | { |
| 1545 | return -EINVAL; | 1578 | return -EINVAL; |
| 1546 | } | 1579 | } |
| 1580 | static long thread_cpu_nsleep_restart(struct restart_block *restart_block) | ||
| 1581 | { | ||
| 1582 | return -EINVAL; | ||
| 1583 | } | ||
| 1547 | 1584 | ||
| 1548 | static __init int init_posix_cpu_timers(void) | 1585 | static __init int init_posix_cpu_timers(void) |
| 1549 | { | 1586 | { |
| @@ -1553,6 +1590,7 @@ static __init int init_posix_cpu_timers(void) | |||
| 1553 | .clock_set = do_posix_clock_nosettime, | 1590 | .clock_set = do_posix_clock_nosettime, |
| 1554 | .timer_create = process_cpu_timer_create, | 1591 | .timer_create = process_cpu_timer_create, |
| 1555 | .nsleep = process_cpu_nsleep, | 1592 | .nsleep = process_cpu_nsleep, |
| 1593 | .nsleep_restart = process_cpu_nsleep_restart, | ||
| 1556 | }; | 1594 | }; |
| 1557 | struct k_clock thread = { | 1595 | struct k_clock thread = { |
| 1558 | .clock_getres = thread_cpu_clock_getres, | 1596 | .clock_getres = thread_cpu_clock_getres, |
| @@ -1560,6 +1598,7 @@ static __init int init_posix_cpu_timers(void) | |||
| 1560 | .clock_set = do_posix_clock_nosettime, | 1598 | .clock_set = do_posix_clock_nosettime, |
| 1561 | .timer_create = thread_cpu_timer_create, | 1599 | .timer_create = thread_cpu_timer_create, |
| 1562 | .nsleep = thread_cpu_nsleep, | 1600 | .nsleep = thread_cpu_nsleep, |
| 1601 | .nsleep_restart = thread_cpu_nsleep_restart, | ||
| 1563 | }; | 1602 | }; |
| 1564 | 1603 | ||
| 1565 | register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); | 1604 | register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); |
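The reshuffle above pulls the actual sleep into do_cpu_nanosleep() so that both the syscall path (posix_cpu_nsleep) and the signal-restart path (posix_cpu_nsleep_restart) share it: when the sleep is interrupted, the remaining time is copied back to the user and the restart block is re-armed with the data needed to resume. A condensed sketch of that restart pattern; do_sleep() and the my_ names are hypothetical, the field usage follows the code above:

#include <linux/sched.h>
#include <linux/errno.h>

static int do_sleep(const clockid_t which_clock, struct timespec *rqtp);
static long my_nsleep_restart(struct restart_block *restart_block);

long my_nsleep(const clockid_t which_clock, struct timespec *rqtp,
	       struct timespec __user *rmtp)
{
	struct restart_block *restart_block =
		&current_thread_info()->restart_block;
	int error;

	error = do_sleep(which_clock, rqtp);
	if (error == -ERESTART_RESTARTBLOCK) {
		/* arm the restart; the signal code invokes restart_block->fn */
		restart_block->fn = my_nsleep_restart;
		restart_block->arg0 = which_clock;
		restart_block->arg1 = (unsigned long) rmtp;
		restart_block->arg2 = rqtp->tv_sec;
		restart_block->arg3 = rqtp->tv_nsec;
	}
	return error;
}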
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ac6dc8744429..e5ebcc1ec3a0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags, | |||
| 973 | return CLOCK_DISPATCH(which_clock, nsleep, | 973 | return CLOCK_DISPATCH(which_clock, nsleep, |
| 974 | (which_clock, flags, &t, rmtp)); | 974 | (which_clock, flags, &t, rmtp)); |
| 975 | } | 975 | } |
| 976 | |||
| 977 | /* | ||
| 978 | * nanosleep_restart for monotonic and realtime clocks | ||
| 979 | */ | ||
| 980 | static int common_nsleep_restart(struct restart_block *restart_block) | ||
| 981 | { | ||
| 982 | return hrtimer_nanosleep_restart(restart_block); | ||
| 983 | } | ||
| 984 | |||
| 985 | /* | ||
| 986 | * This will restart clock_nanosleep. This is required only by | ||
| 987 | * compat_clock_nanosleep_restart for now. | ||
| 988 | */ | ||
| 989 | long | ||
| 990 | clock_nanosleep_restart(struct restart_block *restart_block) | ||
| 991 | { | ||
| 992 | clockid_t which_clock = restart_block->arg0; | ||
| 993 | |||
| 994 | return CLOCK_DISPATCH(which_clock, nsleep_restart, | ||
| 995 | (restart_block)); | ||
| 996 | } | ||
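clock_nanosleep_restart() simply re-dispatches on the clock id stashed in the restart block, so every k_clock can supply its own ->nsleep_restart handler: common_nsleep_restart for the monotonic/realtime clocks here, and the CPU-clock variants in the previous file. A sketch of how a clock would register both halves; the clock id and example_* handlers are hypothetical, while do_posix_clock_nosettime is the stock helper seen in the CPU-timer hunk above:

#include <linux/posix-timers.h>

static int example_clock_getres(const clockid_t which_clock, struct timespec *tp);
static int example_clock_get(const clockid_t which_clock, struct timespec *tp);
static int example_nsleep(const clockid_t which_clock, int flags,
			  struct timespec *rqtp, struct timespec __user *rmtp);
static long example_nsleep_restart(struct restart_block *restart_block);

static struct k_clock example_clock = {
	.clock_getres	= example_clock_getres,
	.clock_get	= example_clock_get,
	.clock_set	= do_posix_clock_nosettime,
	.nsleep		= example_nsleep,
	.nsleep_restart	= example_nsleep_restart,
};

static __init int init_example_clock(void)
{
	register_posix_clock(EXAMPLE_CLOCK_ID, &example_clock);
	return 0;
}
__initcall(init_example_clock);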
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ce0dfb8f4a4e..825068ca3479 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -36,9 +36,49 @@ config PM_DEBUG | |||
| 36 | code. This is helpful when debugging and reporting various PM bugs, | 36 | code. This is helpful when debugging and reporting various PM bugs, |
| 37 | like suspend support. | 37 | like suspend support. |
| 38 | 38 | ||
| 39 | config DISABLE_CONSOLE_SUSPEND | ||
| 40 | bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" | ||
| 41 | depends on PM && PM_DEBUG | ||
| 42 | default n | ||
| 43 | ---help--- | ||
| 44 | This option turns off the console suspend mechanism that prevents | ||
| 45 | debug messages from reaching the console during the suspend/resume | ||
| 46 | operations. This may be helpful when debugging device drivers' | ||
| 47 | suspend/resume routines, but may itself lead to problems, for example | ||
| 48 | if netconsole is used. | ||
| 49 | |||
| 50 | config PM_TRACE | ||
| 51 | bool "Suspend/resume event tracing" | ||
| 52 | depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL | ||
| 53 | default n | ||
| 54 | ---help--- | ||
| 55 | This enables some cheesy code to save the last PM event point in the | ||
| 56 | RTC across reboots, so that you can debug a machine that just hangs | ||
| 57 | during suspend (or more commonly, during resume). | ||
| 58 | |||
| 59 | To use this debugging feature you should attempt to suspend the machine, | ||
| 60 | then reboot it, then run | ||
| 61 | |||
| 62 | dmesg -s 1000000 | grep 'hash matches' | ||
| 63 | |||
| 64 | CAUTION: this option will cause your machine's real-time clock to be | ||
| 65 | set to an invalid time after a resume. | ||
| 66 | |||
| 67 | config PM_SYSFS_DEPRECATED | ||
| 68 | bool "Driver model /sys/devices/.../power/state files (DEPRECATED)" | ||
| 69 | depends on PM && SYSFS | ||
| 70 | default n | ||
| 71 | help | ||
| 72 | The driver model started out with a sysfs file intended to provide | ||
| 73 | a userspace hook for device power management. This feature has never | ||
| 74 | worked very well, except for limited testing purposes, and so it will | ||
| 75 | be removed. It's not clear that a generic mechanism could really | ||
| 76 | handle the wide variability of device power states; any replacements | ||
| 77 | are likely to be bus or driver specific. | ||
| 78 | |||
| 39 | config SOFTWARE_SUSPEND | 79 | config SOFTWARE_SUSPEND |
| 40 | bool "Software Suspend" | 80 | bool "Software Suspend" |
| 41 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) | 81 | depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) |
| 42 | ---help--- | 82 | ---help--- |
| 43 | Enable the possibility of suspending the machine. | 83 | Enable the possibility of suspending the machine. |
| 44 | It doesn't need ACPI or APM. | 84 | It doesn't need ACPI or APM. |
| @@ -60,6 +100,10 @@ config SOFTWARE_SUSPEND | |||
| 60 | 100 | ||
| 61 | For more information take a look at <file:Documentation/power/swsusp.txt>. | 101 | For more information take a look at <file:Documentation/power/swsusp.txt>. |
| 62 | 102 | ||
| 103 | (For now, swsusp is incompatible with PAE aka HIGHMEM_64G on i386. | ||
| 104 | we need identity mapping for resume to work, and that is trivial | ||
| 105 | to get with 4MB pages, but less than trivial on PAE). | ||
| 106 | |||
| 63 | config PM_STD_PARTITION | 107 | config PM_STD_PARTITION |
| 64 | string "Default resume partition" | 108 | string "Default resume partition" |
| 65 | depends on SOFTWARE_SUSPEND | 109 | depends on SOFTWARE_SUSPEND |
| @@ -82,18 +126,6 @@ config PM_STD_PARTITION | |||
| 82 | suspended image to. It will simply pick the first available swap | 126 | suspended image to. It will simply pick the first available swap |
| 83 | device. | 127 | device. |
| 84 | 128 | ||
| 85 | config SWSUSP_ENCRYPT | ||
| 86 | bool "Encrypt suspend image" | ||
| 87 | depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y) | ||
| 88 | default "" | ||
| 89 | ---help--- | ||
| 90 | To prevent data gathering from swap after resume you can encrypt | ||
| 91 | the suspend image with a temporary key that is deleted on | ||
| 92 | resume. | ||
| 93 | |||
| 94 | Note that the temporary key is stored unencrypted on disk while the | ||
| 95 | system is suspended. | ||
| 96 | |||
| 97 | config SUSPEND_SMP | 129 | config SUSPEND_SMP |
| 98 | bool | 130 | bool |
| 99 | depends on HOTPLUG_CPU && X86 && PM | 131 | depends on HOTPLUG_CPU && X86 && PM |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 8d0af3d37a4b..38725f526afc 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
| @@ -7,6 +7,4 @@ obj-y := main.o process.o console.o | |||
| 7 | obj-$(CONFIG_PM_LEGACY) += pm.o | 7 | obj-$(CONFIG_PM_LEGACY) += pm.o |
| 8 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o | 8 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o |
| 9 | 9 | ||
| 10 | obj-$(CONFIG_SUSPEND_SMP) += smp.o | ||
| 11 | |||
| 12 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 10 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 81d4d982f3f0..d72234942798 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
| 19 | #include <linux/mount.h> | 19 | #include <linux/mount.h> |
| 20 | #include <linux/pm.h> | 20 | #include <linux/pm.h> |
| 21 | #include <linux/cpu.h> | ||
| 21 | 22 | ||
| 22 | #include "power.h" | 23 | #include "power.h" |
| 23 | 24 | ||
| @@ -72,7 +73,10 @@ static int prepare_processes(void) | |||
| 72 | int error; | 73 | int error; |
| 73 | 74 | ||
| 74 | pm_prepare_console(); | 75 | pm_prepare_console(); |
| 75 | disable_nonboot_cpus(); | 76 | |
| 77 | error = disable_nonboot_cpus(); | ||
| 78 | if (error) | ||
| 79 | goto enable_cpus; | ||
| 76 | 80 | ||
| 77 | if (freeze_processes()) { | 81 | if (freeze_processes()) { |
| 78 | error = -EBUSY; | 82 | error = -EBUSY; |
| @@ -84,6 +88,7 @@ static int prepare_processes(void) | |||
| 84 | return 0; | 88 | return 0; |
| 85 | thaw: | 89 | thaw: |
| 86 | thaw_processes(); | 90 | thaw_processes(); |
| 91 | enable_cpus: | ||
| 87 | enable_nonboot_cpus(); | 92 | enable_nonboot_cpus(); |
| 88 | pm_restore_console(); | 93 | pm_restore_console(); |
| 89 | return error; | 94 | return error; |
| @@ -98,7 +103,7 @@ static void unprepare_processes(void) | |||
| 98 | } | 103 | } |
| 99 | 104 | ||
| 100 | /** | 105 | /** |
| 101 | * pm_suspend_disk - The granpappy of power management. | 106 | * pm_suspend_disk - The granpappy of hibernation power management. |
| 102 | * | 107 | * |
| 103 | * If we're going through the firmware, then get it over with quickly. | 108 | * If we're going through the firmware, then get it over with quickly. |
| 104 | * | 109 | * |
| @@ -207,7 +212,7 @@ static int software_resume(void) | |||
| 207 | 212 | ||
| 208 | pr_debug("PM: Preparing devices for restore.\n"); | 213 | pr_debug("PM: Preparing devices for restore.\n"); |
| 209 | 214 | ||
| 210 | if ((error = device_suspend(PMSG_FREEZE))) { | 215 | if ((error = device_suspend(PMSG_PRETHAW))) { |
| 211 | printk("Some devices failed to suspend\n"); | 216 | printk("Some devices failed to suspend\n"); |
| 212 | swsusp_free(); | 217 | swsusp_free(); |
| 213 | goto Thaw; | 218 | goto Thaw; |
| @@ -231,7 +236,7 @@ static int software_resume(void) | |||
| 231 | late_initcall(software_resume); | 236 | late_initcall(software_resume); |
| 232 | 237 | ||
| 233 | 238 | ||
| 234 | static char * pm_disk_modes[] = { | 239 | static const char * const pm_disk_modes[] = { |
| 235 | [PM_DISK_FIRMWARE] = "firmware", | 240 | [PM_DISK_FIRMWARE] = "firmware", |
| 236 | [PM_DISK_PLATFORM] = "platform", | 241 | [PM_DISK_PLATFORM] = "platform", |
| 237 | [PM_DISK_SHUTDOWN] = "shutdown", | 242 | [PM_DISK_SHUTDOWN] = "shutdown", |
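prepare_processes() now checks the result of disable_nonboot_cpus() and, on failure, unwinds through the new enable_cpus label before restoring the console. The same unwind-by-label idiom in isolation, with all step functions hypothetical:

static int step_one(void), step_two(void), step_three(void);
static void undo_step_one(void), undo_step_two(void);

static int bring_up(void)
{
	int error;

	error = step_one();
	if (error)
		return error;

	error = step_two();
	if (error)
		goto undo_one;

	error = step_three();
	if (error)
		goto undo_two;

	return 0;	/* everything succeeded */

 undo_two:
	undo_step_two();
 undo_one:
	undo_step_one();	/* undo only what actually succeeded, in reverse */
	return error;
}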
diff --git a/kernel/power/main.c b/kernel/power/main.c index cdf0f07af92f..873228c71dab 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -16,6 +16,8 @@ | |||
| 16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
| 17 | #include <linux/pm.h> | 17 | #include <linux/pm.h> |
| 18 | #include <linux/console.h> | 18 | #include <linux/console.h> |
| 19 | #include <linux/cpu.h> | ||
| 20 | #include <linux/resume-trace.h> | ||
| 19 | 21 | ||
| 20 | #include "power.h" | 22 | #include "power.h" |
| 21 | 23 | ||
| @@ -51,7 +53,7 @@ void pm_set_ops(struct pm_ops * ops) | |||
| 51 | 53 | ||
| 52 | static int suspend_prepare(suspend_state_t state) | 54 | static int suspend_prepare(suspend_state_t state) |
| 53 | { | 55 | { |
| 54 | int error = 0; | 56 | int error; |
| 55 | unsigned int free_pages; | 57 | unsigned int free_pages; |
| 56 | 58 | ||
| 57 | if (!pm_ops || !pm_ops->enter) | 59 | if (!pm_ops || !pm_ops->enter) |
| @@ -59,12 +61,9 @@ static int suspend_prepare(suspend_state_t state) | |||
| 59 | 61 | ||
| 60 | pm_prepare_console(); | 62 | pm_prepare_console(); |
| 61 | 63 | ||
| 62 | disable_nonboot_cpus(); | 64 | error = disable_nonboot_cpus(); |
| 63 | 65 | if (error) | |
| 64 | if (num_online_cpus() != 1) { | ||
| 65 | error = -EPERM; | ||
| 66 | goto Enable_cpu; | 66 | goto Enable_cpu; |
| 67 | } | ||
| 68 | 67 | ||
| 69 | if (freeze_processes()) { | 68 | if (freeze_processes()) { |
| 70 | error = -EAGAIN; | 69 | error = -EAGAIN; |
| @@ -145,7 +144,7 @@ static void suspend_finish(suspend_state_t state) | |||
| 145 | 144 | ||
| 146 | 145 | ||
| 147 | 146 | ||
| 148 | static char *pm_states[PM_SUSPEND_MAX] = { | 147 | static const char * const pm_states[PM_SUSPEND_MAX] = { |
| 149 | [PM_SUSPEND_STANDBY] = "standby", | 148 | [PM_SUSPEND_STANDBY] = "standby", |
| 150 | [PM_SUSPEND_MEM] = "mem", | 149 | [PM_SUSPEND_MEM] = "mem", |
| 151 | #ifdef CONFIG_SOFTWARE_SUSPEND | 150 | #ifdef CONFIG_SOFTWARE_SUSPEND |
| @@ -262,7 +261,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) | |||
| 262 | static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) | 261 | static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) |
| 263 | { | 262 | { |
| 264 | suspend_state_t state = PM_SUSPEND_STANDBY; | 263 | suspend_state_t state = PM_SUSPEND_STANDBY; |
| 265 | char ** s; | 264 | const char * const *s; |
| 266 | char *p; | 265 | char *p; |
| 267 | int error; | 266 | int error; |
| 268 | int len; | 267 | int len; |
| @@ -283,10 +282,39 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n | |||
| 283 | 282 | ||
| 284 | power_attr(state); | 283 | power_attr(state); |
| 285 | 284 | ||
| 285 | #ifdef CONFIG_PM_TRACE | ||
| 286 | int pm_trace_enabled; | ||
| 287 | |||
| 288 | static ssize_t pm_trace_show(struct subsystem * subsys, char * buf) | ||
| 289 | { | ||
| 290 | return sprintf(buf, "%d\n", pm_trace_enabled); | ||
| 291 | } | ||
| 292 | |||
| 293 | static ssize_t | ||
| 294 | pm_trace_store(struct subsystem * subsys, const char * buf, size_t n) | ||
| 295 | { | ||
| 296 | int val; | ||
| 297 | |||
| 298 | if (sscanf(buf, "%d", &val) == 1) { | ||
| 299 | pm_trace_enabled = !!val; | ||
| 300 | return n; | ||
| 301 | } | ||
| 302 | return -EINVAL; | ||
| 303 | } | ||
| 304 | |||
| 305 | power_attr(pm_trace); | ||
| 306 | |||
| 307 | static struct attribute * g[] = { | ||
| 308 | &state_attr.attr, | ||
| 309 | &pm_trace_attr.attr, | ||
| 310 | NULL, | ||
| 311 | }; | ||
| 312 | #else | ||
| 286 | static struct attribute * g[] = { | 313 | static struct attribute * g[] = { |
| 287 | &state_attr.attr, | 314 | &state_attr.attr, |
| 288 | NULL, | 315 | NULL, |
| 289 | }; | 316 | }; |
| 317 | #endif /* CONFIG_PM_TRACE */ | ||
| 290 | 318 | ||
| 291 | static struct attribute_group attr_group = { | 319 | static struct attribute_group attr_group = { |
| 292 | .attrs = g, | 320 | .attrs = g, |
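The new pm_trace file follows the standard pattern for attributes under /sys/power: a show routine that prints the current value, a store routine that parses and normalizes the input, and a power_attr() declaration (the helper from kernel/power/power.h, also used for 'state' above) tying them together. A minimal attribute in the same style; the pm_verbose knob is hypothetical:

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/kobject.h>
#include "power.h"

static int pm_verbose;

static ssize_t pm_verbose_show(struct subsystem *subsys, char *buf)
{
	return sprintf(buf, "%d\n", pm_verbose);
}

static ssize_t
pm_verbose_store(struct subsystem *subsys, const char *buf, size_t n)
{
	int val;

	if (sscanf(buf, "%d", &val) != 1)
		return -EINVAL;
	pm_verbose = !!val;	/* normalize to 0/1 */
	return n;		/* consumed the whole write */
}

power_attr(pm_verbose);

The attribute still has to be added to the attribute group (the g[] array above) to actually show up in sysfs.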
diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 84063ac8fcfc..c50d15266c10 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c | |||
| @@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type, | |||
| 75 | return dev; | 75 | return dev; |
| 76 | } | 76 | } |
| 77 | 77 | ||
| 78 | static void __pm_unregister(struct pm_dev *dev) | ||
| 79 | { | ||
| 80 | if (dev) { | ||
| 81 | list_del(&dev->entry); | ||
| 82 | kfree(dev); | ||
| 83 | } | ||
| 84 | } | ||
| 85 | |||
| 86 | /** | ||
| 87 | * pm_unregister_all - unregister all devices with matching callback | ||
| 88 | * @callback: callback function pointer | ||
| 89 | * | ||
| 90 | * Unregister every device that would call the callback passed. This | ||
| 91 | * is primarily meant as a helper function for loadable modules. It | ||
| 92 | * enables a module to give up all its managed devices without keeping | ||
| 93 | * its own private list. | ||
| 94 | */ | ||
| 95 | |||
| 96 | void pm_unregister_all(pm_callback callback) | ||
| 97 | { | ||
| 98 | struct list_head *entry; | ||
| 99 | |||
| 100 | if (!callback) | ||
| 101 | return; | ||
| 102 | |||
| 103 | mutex_lock(&pm_devs_lock); | ||
| 104 | entry = pm_devs.next; | ||
| 105 | while (entry != &pm_devs) { | ||
| 106 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | ||
| 107 | entry = entry->next; | ||
| 108 | if (dev->callback == callback) | ||
| 109 | __pm_unregister(dev); | ||
| 110 | } | ||
| 111 | mutex_unlock(&pm_devs_lock); | ||
| 112 | } | ||
| 113 | |||
| 114 | /** | 78 | /** |
| 115 | * pm_send - send request to a single device | 79 | * pm_send - send request to a single device |
| 116 | * @dev: device to send to | 80 | * @dev: device to send to |
| @@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data) | |||
| 239 | } | 203 | } |
| 240 | 204 | ||
| 241 | EXPORT_SYMBOL(pm_register); | 205 | EXPORT_SYMBOL(pm_register); |
| 242 | EXPORT_SYMBOL(pm_unregister_all); | ||
| 243 | EXPORT_SYMBOL(pm_send_all); | 206 | EXPORT_SYMBOL(pm_send_all); |
| 244 | EXPORT_SYMBOL(pm_active); | 207 | EXPORT_SYMBOL(pm_active); |
| 245 | 208 | ||
diff --git a/kernel/power/power.h b/kernel/power/power.h index 98c41423f3b1..bfe999f7b272 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -38,8 +38,6 @@ extern struct subsystem power_subsys; | |||
| 38 | /* References to section boundaries */ | 38 | /* References to section boundaries */ |
| 39 | extern const void __nosave_begin, __nosave_end; | 39 | extern const void __nosave_begin, __nosave_end; |
| 40 | 40 | ||
| 41 | extern struct pbe *pagedir_nosave; | ||
| 42 | |||
| 43 | /* Preferred image size in bytes (default 500 MB) */ | 41 | /* Preferred image size in bytes (default 500 MB) */ |
| 44 | extern unsigned long image_size; | 42 | extern unsigned long image_size; |
| 45 | extern int in_suspend; | 43 | extern int in_suspend; |
| @@ -50,21 +48,62 @@ extern asmlinkage int swsusp_arch_resume(void); | |||
| 50 | 48 | ||
| 51 | extern unsigned int count_data_pages(void); | 49 | extern unsigned int count_data_pages(void); |
| 52 | 50 | ||
| 51 | /** | ||
| 52 | * Auxiliary structure used for reading the snapshot image data and | ||
| 53 | * metadata from and writing them to the list of page backup entries | ||
| 54 | * (PBEs) which is the main data structure of swsusp. | ||
| 55 | * | ||
| 56 | * Using struct snapshot_handle we can transfer the image, including its | ||
| 57 | * metadata, as a continuous sequence of bytes with the help of | ||
| 58 | * snapshot_read_next() and snapshot_write_next(). | ||
| 59 | * | ||
| 60 | * The code that writes the image to a storage or transfers it to | ||
| 61 | * the user land is required to use snapshot_read_next() for this | ||
| 62 | * purpose and it should not make any assumptions regarding the internal | ||
| 63 | * structure of the image. Similarly, the code that reads the image from | ||
| 64 | * a storage or transfers it from the user land is required to use | ||
| 65 | * snapshot_write_next(). | ||
| 66 | * | ||
| 67 | * This may allow us to change the internal structure of the image | ||
| 68 | * in the future with considerably less effort. | ||
| 69 | */ | ||
| 70 | |||
| 53 | struct snapshot_handle { | 71 | struct snapshot_handle { |
| 54 | loff_t offset; | 72 | loff_t offset; /* number of the last byte ready for reading |
| 55 | unsigned int page; | 73 | * or writing in the sequence |
| 56 | unsigned int page_offset; | 74 | */ |
| 57 | unsigned int prev; | 75 | unsigned int cur; /* number of the block of PAGE_SIZE bytes the |
| 58 | struct pbe *pbe, *last_pbe; | 76 | * next operation will refer to (ie. current) |
| 59 | void *buffer; | 77 | */ |
| 60 | unsigned int buf_offset; | 78 | unsigned int cur_offset; /* offset with respect to the current |
| 79 | * block (for the next operation) | ||
| 80 | */ | ||
| 81 | unsigned int prev; /* number of the block of PAGE_SIZE bytes that | ||
| 82 | * was the current one previously | ||
| 83 | */ | ||
| 84 | void *buffer; /* address of the block to read from | ||
| 85 | * or write to | ||
| 86 | */ | ||
| 87 | unsigned int buf_offset; /* location to read from or write to, | ||
| 88 | * given as a displacement from 'buffer' | ||
| 89 | */ | ||
| 90 | int sync_read; /* Set to one to notify the caller of | ||
| 91 | * snapshot_write_next() that it may | ||
| 92 | * need to call wait_on_bio_chain() | ||
| 93 | */ | ||
| 61 | }; | 94 | }; |
| 62 | 95 | ||
| 96 | /* This macro returns the address from/to which the caller of | ||
| 97 | * snapshot_read_next()/snapshot_write_next() is allowed to | ||
| 98 | * read/write data after the function returns | ||
| 99 | */ | ||
| 63 | #define data_of(handle) ((handle).buffer + (handle).buf_offset) | 100 | #define data_of(handle) ((handle).buffer + (handle).buf_offset) |
| 64 | 101 | ||
| 102 | extern unsigned int snapshot_additional_pages(struct zone *zone); | ||
| 65 | extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); | 103 | extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); |
| 66 | extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); | 104 | extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); |
| 67 | int snapshot_image_loaded(struct snapshot_handle *handle); | 105 | extern int snapshot_image_loaded(struct snapshot_handle *handle); |
| 106 | extern void snapshot_free_unused_memory(struct snapshot_handle *handle); | ||
| 68 | 107 | ||
| 69 | #define SNAPSHOT_IOC_MAGIC '3' | 108 | #define SNAPSHOT_IOC_MAGIC '3' |
| 70 | #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) | 109 | #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) |
| @@ -105,10 +144,6 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); | |||
| 105 | extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); | 144 | extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); |
| 106 | extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); | 145 | extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); |
| 107 | 146 | ||
| 108 | extern unsigned int count_special_pages(void); | ||
| 109 | extern int save_special_mem(void); | ||
| 110 | extern int restore_special_mem(void); | ||
| 111 | |||
| 112 | extern int swsusp_check(void); | 147 | extern int swsusp_check(void); |
| 113 | extern int swsusp_shrink_memory(void); | 148 | extern int swsusp_shrink_memory(void); |
| 114 | extern void swsusp_free(void); | 149 | extern void swsusp_free(void); |
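The expanded comments describe snapshot_handle as a byte-stream view of the image: a consumer repeatedly calls snapshot_read_next() and only ever touches data_of(handle), without knowing anything about PBEs or the image layout. A sketch of the save-side loop under that contract, assuming the declarations from this header; the write_page() sink is hypothetical (the real consumers are the swap and userland writers):

#include <linux/mm.h>
#include <linux/string.h>
#include "power.h"

static int write_page(void *buf);	/* hypothetical sink */

static int stream_image_out(void)
{
	struct snapshot_handle snapshot;
	int error;

	memset(&snapshot, 0, sizeof(struct snapshot_handle));

	/* each successful call exposes the next chunk at data_of(snapshot) */
	while ((error = snapshot_read_next(&snapshot, PAGE_SIZE)) > 0) {
		error = write_page(data_of(snapshot));
		if (error)
			return error;
	}
	return error;	/* 0 once the whole image has been handed out */
}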
diff --git a/kernel/power/process.c b/kernel/power/process.c index b2a5f671d6cd..72e72d2c61e6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -66,13 +66,25 @@ static inline void freeze_process(struct task_struct *p) | |||
| 66 | } | 66 | } |
| 67 | } | 67 | } |
| 68 | 68 | ||
| 69 | static void cancel_freezing(struct task_struct *p) | ||
| 70 | { | ||
| 71 | unsigned long flags; | ||
| 72 | |||
| 73 | if (freezing(p)) { | ||
| 74 | pr_debug(" clean up: %s\n", p->comm); | ||
| 75 | do_not_freeze(p); | ||
| 76 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
| 77 | recalc_sigpending_tsk(p); | ||
| 78 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
| 79 | } | ||
| 80 | } | ||
| 81 | |||
| 69 | /* 0 = success, else # of processes that we failed to stop */ | 82 | /* 0 = success, else # of processes that we failed to stop */ |
| 70 | int freeze_processes(void) | 83 | int freeze_processes(void) |
| 71 | { | 84 | { |
| 72 | int todo, nr_user, user_frozen; | 85 | int todo, nr_user, user_frozen; |
| 73 | unsigned long start_time; | 86 | unsigned long start_time; |
| 74 | struct task_struct *g, *p; | 87 | struct task_struct *g, *p; |
| 75 | unsigned long flags; | ||
| 76 | 88 | ||
| 77 | printk( "Stopping tasks: " ); | 89 | printk( "Stopping tasks: " ); |
| 78 | start_time = jiffies; | 90 | start_time = jiffies; |
| @@ -85,6 +97,10 @@ int freeze_processes(void) | |||
| 85 | continue; | 97 | continue; |
| 86 | if (frozen(p)) | 98 | if (frozen(p)) |
| 87 | continue; | 99 | continue; |
| 100 | if (p->state == TASK_TRACED && frozen(p->parent)) { | ||
| 101 | cancel_freezing(p); | ||
| 102 | continue; | ||
| 103 | } | ||
| 88 | if (p->mm && !(p->flags & PF_BORROWED_MM)) { | 104 | if (p->mm && !(p->flags & PF_BORROWED_MM)) { |
| 89 | /* The task is a user-space one. | 105 | /* The task is a user-space one. |
| 90 | * Freeze it unless there's a vfork completion | 106 | * Freeze it unless there's a vfork completion |
| @@ -126,13 +142,7 @@ int freeze_processes(void) | |||
| 126 | do_each_thread(g, p) { | 142 | do_each_thread(g, p) { |
| 127 | if (freezeable(p) && !frozen(p)) | 143 | if (freezeable(p) && !frozen(p)) |
| 128 | printk(KERN_ERR " %s\n", p->comm); | 144 | printk(KERN_ERR " %s\n", p->comm); |
| 129 | if (freezing(p)) { | 145 | cancel_freezing(p); |
| 130 | pr_debug(" clean up: %s\n", p->comm); | ||
| 131 | p->flags &= ~PF_FREEZE; | ||
| 132 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
| 133 | recalc_sigpending_tsk(p); | ||
| 134 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
| 135 | } | ||
| 136 | } while_each_thread(g, p); | 146 | } while_each_thread(g, p); |
| 137 | read_unlock(&tasklist_lock); | 147 | read_unlock(&tasklist_lock); |
| 138 | return todo; | 148 | return todo; |
diff --git a/kernel/power/smp.c b/kernel/power/smp.c deleted file mode 100644 index 5957312b2d68..000000000000 --- a/kernel/power/smp.c +++ /dev/null | |||
| @@ -1,62 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * drivers/power/smp.c - Functions for stopping other CPUs. | ||
| 3 | * | ||
| 4 | * Copyright 2004 Pavel Machek <pavel@suse.cz> | ||
| 5 | * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz> | ||
| 6 | * | ||
| 7 | * This file is released under the GPLv2. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #undef DEBUG | ||
| 11 | |||
| 12 | #include <linux/smp_lock.h> | ||
| 13 | #include <linux/interrupt.h> | ||
| 14 | #include <linux/suspend.h> | ||
| 15 | #include <linux/module.h> | ||
| 16 | #include <linux/cpu.h> | ||
| 17 | #include <asm/atomic.h> | ||
| 18 | #include <asm/tlbflush.h> | ||
| 19 | |||
| 20 | /* This is protected by pm_sem semaphore */ | ||
| 21 | static cpumask_t frozen_cpus; | ||
| 22 | |||
| 23 | void disable_nonboot_cpus(void) | ||
| 24 | { | ||
| 25 | int cpu, error; | ||
| 26 | |||
| 27 | error = 0; | ||
| 28 | cpus_clear(frozen_cpus); | ||
| 29 | printk("Freezing cpus ...\n"); | ||
| 30 | for_each_online_cpu(cpu) { | ||
| 31 | if (cpu == 0) | ||
| 32 | continue; | ||
| 33 | error = cpu_down(cpu); | ||
| 34 | if (!error) { | ||
| 35 | cpu_set(cpu, frozen_cpus); | ||
| 36 | printk("CPU%d is down\n", cpu); | ||
| 37 | continue; | ||
| 38 | } | ||
| 39 | printk("Error taking cpu %d down: %d\n", cpu, error); | ||
| 40 | } | ||
| 41 | BUG_ON(raw_smp_processor_id() != 0); | ||
| 42 | if (error) | ||
| 43 | panic("cpus not sleeping"); | ||
| 44 | } | ||
| 45 | |||
| 46 | void enable_nonboot_cpus(void) | ||
| 47 | { | ||
| 48 | int cpu, error; | ||
| 49 | |||
| 50 | printk("Thawing cpus ...\n"); | ||
| 51 | for_each_cpu_mask(cpu, frozen_cpus) { | ||
| 52 | error = cpu_up(cpu); | ||
| 53 | if (!error) { | ||
| 54 | printk("CPU%d is up\n", cpu); | ||
| 55 | continue; | ||
| 56 | } | ||
| 57 | printk("Error taking cpu %d up: %d\n", cpu, error); | ||
| 58 | panic("Not enough cpus"); | ||
| 59 | } | ||
| 60 | cpus_clear(frozen_cpus); | ||
| 61 | } | ||
| 62 | |||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3d9284100b22..1b84313cbab5 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -34,95 +34,15 @@ | |||
| 34 | 34 | ||
| 35 | #include "power.h" | 35 | #include "power.h" |
| 36 | 36 | ||
| 37 | struct pbe *pagedir_nosave; | 37 | /* List of PBEs used for creating and restoring the suspend image */ |
| 38 | struct pbe *restore_pblist; | ||
| 39 | |||
| 38 | static unsigned int nr_copy_pages; | 40 | static unsigned int nr_copy_pages; |
| 39 | static unsigned int nr_meta_pages; | 41 | static unsigned int nr_meta_pages; |
| 40 | static unsigned long *buffer; | 42 | static void *buffer; |
| 41 | |||
| 42 | struct arch_saveable_page { | ||
| 43 | unsigned long start; | ||
| 44 | unsigned long end; | ||
| 45 | char *data; | ||
| 46 | struct arch_saveable_page *next; | ||
| 47 | }; | ||
| 48 | static struct arch_saveable_page *arch_pages; | ||
| 49 | |||
| 50 | int swsusp_add_arch_pages(unsigned long start, unsigned long end) | ||
| 51 | { | ||
| 52 | struct arch_saveable_page *tmp; | ||
| 53 | |||
| 54 | while (start < end) { | ||
| 55 | tmp = kzalloc(sizeof(struct arch_saveable_page), GFP_KERNEL); | ||
| 56 | if (!tmp) | ||
| 57 | return -ENOMEM; | ||
| 58 | tmp->start = start; | ||
| 59 | tmp->end = ((start >> PAGE_SHIFT) + 1) << PAGE_SHIFT; | ||
| 60 | if (tmp->end > end) | ||
| 61 | tmp->end = end; | ||
| 62 | tmp->next = arch_pages; | ||
| 63 | start = tmp->end; | ||
| 64 | arch_pages = tmp; | ||
| 65 | } | ||
| 66 | return 0; | ||
| 67 | } | ||
| 68 | |||
| 69 | static unsigned int count_arch_pages(void) | ||
| 70 | { | ||
| 71 | unsigned int count = 0; | ||
| 72 | struct arch_saveable_page *tmp = arch_pages; | ||
| 73 | while (tmp) { | ||
| 74 | count++; | ||
| 75 | tmp = tmp->next; | ||
| 76 | } | ||
| 77 | return count; | ||
| 78 | } | ||
| 79 | |||
| 80 | static int save_arch_mem(void) | ||
| 81 | { | ||
| 82 | char *kaddr; | ||
| 83 | struct arch_saveable_page *tmp = arch_pages; | ||
| 84 | int offset; | ||
| 85 | |||
| 86 | pr_debug("swsusp: Saving arch specific memory"); | ||
| 87 | while (tmp) { | ||
| 88 | tmp->data = (char *)__get_free_page(GFP_ATOMIC); | ||
| 89 | if (!tmp->data) | ||
| 90 | return -ENOMEM; | ||
| 91 | offset = tmp->start - (tmp->start & PAGE_MASK); | ||
| 92 | /* arch pages might haven't a 'struct page' */ | ||
| 93 | kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); | ||
| 94 | memcpy(tmp->data + offset, kaddr + offset, | ||
| 95 | tmp->end - tmp->start); | ||
| 96 | kunmap_atomic(kaddr, KM_USER0); | ||
| 97 | |||
| 98 | tmp = tmp->next; | ||
| 99 | } | ||
| 100 | return 0; | ||
| 101 | } | ||
| 102 | |||
| 103 | static int restore_arch_mem(void) | ||
| 104 | { | ||
| 105 | char *kaddr; | ||
| 106 | struct arch_saveable_page *tmp = arch_pages; | ||
| 107 | int offset; | ||
| 108 | |||
| 109 | while (tmp) { | ||
| 110 | if (!tmp->data) | ||
| 111 | continue; | ||
| 112 | offset = tmp->start - (tmp->start & PAGE_MASK); | ||
| 113 | kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); | ||
| 114 | memcpy(kaddr + offset, tmp->data + offset, | ||
| 115 | tmp->end - tmp->start); | ||
| 116 | kunmap_atomic(kaddr, KM_USER0); | ||
| 117 | free_page((long)tmp->data); | ||
| 118 | tmp->data = NULL; | ||
| 119 | tmp = tmp->next; | ||
| 120 | } | ||
| 121 | return 0; | ||
| 122 | } | ||
| 123 | 43 | ||
| 124 | #ifdef CONFIG_HIGHMEM | 44 | #ifdef CONFIG_HIGHMEM |
| 125 | static unsigned int count_highmem_pages(void) | 45 | unsigned int count_highmem_pages(void) |
| 126 | { | 46 | { |
| 127 | struct zone *zone; | 47 | struct zone *zone; |
| 128 | unsigned long zone_pfn; | 48 | unsigned long zone_pfn; |
| @@ -199,7 +119,7 @@ static int save_highmem_zone(struct zone *zone) | |||
| 199 | return 0; | 119 | return 0; |
| 200 | } | 120 | } |
| 201 | 121 | ||
| 202 | static int save_highmem(void) | 122 | int save_highmem(void) |
| 203 | { | 123 | { |
| 204 | struct zone *zone; | 124 | struct zone *zone; |
| 205 | int res = 0; | 125 | int res = 0; |
| @@ -216,7 +136,7 @@ static int save_highmem(void) | |||
| 216 | return 0; | 136 | return 0; |
| 217 | } | 137 | } |
| 218 | 138 | ||
| 219 | static int restore_highmem(void) | 139 | int restore_highmem(void) |
| 220 | { | 140 | { |
| 221 | printk("swsusp: Restoring Highmem\n"); | 141 | printk("swsusp: Restoring Highmem\n"); |
| 222 | while (highmem_copy) { | 142 | while (highmem_copy) { |
| @@ -238,256 +158,637 @@ static inline int save_highmem(void) {return 0;} | |||
| 238 | static inline int restore_highmem(void) {return 0;} | 158 | static inline int restore_highmem(void) {return 0;} |
| 239 | #endif | 159 | #endif |
| 240 | 160 | ||
| 241 | unsigned int count_special_pages(void) | 161 | /** |
| 162 | * @safe_needed - on resume, for storing the PBE list and the image, | ||
| 163 | * we can only use memory pages that do not conflict with the pages | ||
| 164 | * used before suspend. | ||
| 165 | * | ||
| 166 | * The unsafe pages are marked with the PG_nosave_free flag | ||
| 167 | * and we count them using allocated_unsafe_pages | ||
| 168 | */ | ||
| 169 | |||
| 170 | #define PG_ANY 0 | ||
| 171 | #define PG_SAFE 1 | ||
| 172 | #define PG_UNSAFE_CLEAR 1 | ||
| 173 | #define PG_UNSAFE_KEEP 0 | ||
| 174 | |||
| 175 | static unsigned int allocated_unsafe_pages; | ||
| 176 | |||
| 177 | static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | ||
| 242 | { | 178 | { |
| 243 | return count_arch_pages() + count_highmem_pages(); | 179 | void *res; |
| 180 | |||
| 181 | res = (void *)get_zeroed_page(gfp_mask); | ||
| 182 | if (safe_needed) | ||
| 183 | while (res && PageNosaveFree(virt_to_page(res))) { | ||
| 184 | /* The page is unsafe, mark it for swsusp_free() */ | ||
| 185 | SetPageNosave(virt_to_page(res)); | ||
| 186 | allocated_unsafe_pages++; | ||
| 187 | res = (void *)get_zeroed_page(gfp_mask); | ||
| 188 | } | ||
| 189 | if (res) { | ||
| 190 | SetPageNosave(virt_to_page(res)); | ||
| 191 | SetPageNosaveFree(virt_to_page(res)); | ||
| 192 | } | ||
| 193 | return res; | ||
| 244 | } | 194 | } |
| 245 | 195 | ||
| 246 | int save_special_mem(void) | 196 | unsigned long get_safe_page(gfp_t gfp_mask) |
| 247 | { | 197 | { |
| 248 | int ret; | 198 | return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); |
| 249 | ret = save_arch_mem(); | 199 | } |
| 250 | if (!ret) | 200 | |
| 251 | ret = save_highmem(); | 201 | /** |
| 252 | return ret; | 202 | * free_image_page - free page represented by @addr, allocated with |
| 203 | * alloc_image_page (page flags set by it must be cleared) | ||
| 204 | */ | ||
| 205 | |||
| 206 | static inline void free_image_page(void *addr, int clear_nosave_free) | ||
| 207 | { | ||
| 208 | ClearPageNosave(virt_to_page(addr)); | ||
| 209 | if (clear_nosave_free) | ||
| 210 | ClearPageNosaveFree(virt_to_page(addr)); | ||
| 211 | free_page((unsigned long)addr); | ||
| 212 | } | ||
| 213 | |||
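Editor's note: the point of the safe_needed path above is that a rejected ("unsafe") page is not freed but kept allocated, so the allocator can never hand it out again; it is released later together with everything else by swsusp_free(). The sketch below models that idea in plain userspace C. It is an illustration only: calloc()/free() and the is_forbidden() predicate are assumed stand-ins for get_zeroed_page() and the PageNosaveFree() test, and the explicit rejected_list replaces the kernel's page-flag bookkeeping.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct rejected {
	struct rejected *next;
	void *block;
};

static struct rejected *rejected_list;		/* rejected-but-held blocks */
static unsigned int allocated_unsafe_blocks;	/* models allocated_unsafe_pages */

/* Hypothetical predicate standing in for PageNosaveFree(): "would this
 * block collide with data the image has to restore?"  Always false in
 * this toy, so the first allocation is accepted. */
static bool is_forbidden(void *block)
{
	(void)block;
	return false;
}

static void *alloc_safe_block(size_t size)
{
	void *res = calloc(1, size);

	while (res && is_forbidden(res)) {
		/* Keep the unsafe block allocated so it cannot be handed
		 * out again; it is released with everything else later. */
		struct rejected *r = malloc(sizeof(*r));

		if (!r) {
			free(res);
			return NULL;
		}
		r->block = res;
		r->next = rejected_list;
		rejected_list = r;
		allocated_unsafe_blocks++;
		res = calloc(1, size);
	}
	return res;
}

static void free_all_blocks(void)	/* rough analogue of swsusp_free() */
{
	while (rejected_list) {
		struct rejected *next = rejected_list->next;

		free(rejected_list->block);
		free(rejected_list);
		rejected_list = next;
	}
}

int main(void)
{
	void *p = alloc_safe_block(4096);

	printf("safe block at %p, %u unsafe blocks held\n",
	       p, allocated_unsafe_blocks);
	free(p);
	free_all_blocks();
	return 0;
}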
| 214 | /* struct linked_page is used to build chains of pages */ | ||
| 215 | |||
| 216 | #define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) | ||
| 217 | |||
| 218 | struct linked_page { | ||
| 219 | struct linked_page *next; | ||
| 220 | char data[LINKED_PAGE_DATA_SIZE]; | ||
| 221 | } __attribute__((packed)); | ||
| 222 | |||
| 223 | static inline void | ||
| 224 | free_list_of_pages(struct linked_page *list, int clear_page_nosave) | ||
| 225 | { | ||
| 226 | while (list) { | ||
| 227 | struct linked_page *lp = list->next; | ||
| 228 | |||
| 229 | free_image_page(list, clear_page_nosave); | ||
| 230 | list = lp; | ||
| 231 | } | ||
| 232 | } | ||
| 233 | |||
| 234 | /** | ||
| 235 | * struct chain_allocator is used for allocating small objects out of | ||
| 236 | * a linked list of pages called 'the chain'. | ||
| 237 | * | ||
| 238 | * The chain grows each time when there is no room for a new object in | ||
| 239 | * the current page. The allocated objects cannot be freed individually. | ||
| 240 | * It is only possible to free them all at once, by freeing the entire | ||
| 241 | * chain. | ||
| 242 | * | ||
| 243 | * NOTE: The chain allocator may be inefficient if the allocated objects | ||
| 244 | * are not much smaller than PAGE_SIZE. | ||
| 245 | */ | ||
| 246 | |||
| 247 | struct chain_allocator { | ||
| 248 | struct linked_page *chain; /* the chain */ | ||
| 249 | unsigned int used_space; /* total size of objects allocated out | ||
| 250 | * of the current page | ||
| 251 | */ | ||
| 252 | gfp_t gfp_mask; /* mask for allocating pages */ | ||
| 253 | int safe_needed; /* if set, only "safe" pages are allocated */ | ||
| 254 | }; | ||
| 255 | |||
| 256 | static void | ||
| 257 | chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) | ||
| 258 | { | ||
| 259 | ca->chain = NULL; | ||
| 260 | ca->used_space = LINKED_PAGE_DATA_SIZE; | ||
| 261 | ca->gfp_mask = gfp_mask; | ||
| 262 | ca->safe_needed = safe_needed; | ||
| 253 | } | 263 | } |
| 254 | 264 | ||
| 255 | int restore_special_mem(void) | 265 | static void *chain_alloc(struct chain_allocator *ca, unsigned int size) |
| 256 | { | 266 | { |
| 257 | int ret; | 267 | void *ret; |
| 258 | ret = restore_arch_mem(); | 268 | |
| 259 | if (!ret) | 269 | if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { |
| 260 | ret = restore_highmem(); | 270 | struct linked_page *lp; |
| 271 | |||
| 272 | lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); | ||
| 273 | if (!lp) | ||
| 274 | return NULL; | ||
| 275 | |||
| 276 | lp->next = ca->chain; | ||
| 277 | ca->chain = lp; | ||
| 278 | ca->used_space = 0; | ||
| 279 | } | ||
| 280 | ret = ca->chain->data + ca->used_space; | ||
| 281 | ca->used_space += size; | ||
| 261 | return ret; | 282 | return ret; |
| 262 | } | 283 | } |
| 263 | 284 | ||
| 264 | static int pfn_is_nosave(unsigned long pfn) | 285 | static void chain_free(struct chain_allocator *ca, int clear_page_nosave) |
| 265 | { | 286 | { |
| 266 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; | 287 | free_list_of_pages(ca->chain, clear_page_nosave); |
| 267 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | 288 | memset(ca, 0, sizeof(struct chain_allocator)); |
| 268 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
| 269 | } | 289 | } |
| 270 | 290 | ||
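Editor's note: the chain allocator above is essentially a bump allocator over a linked list of page-sized blocks: objects are carved out of the current block, a new block is prepended when the current one is full, and everything is released in one pass. The standalone userspace sketch below reproduces that idea so it can be compiled and experimented with; PAGE_SIZE, calloc() and free() are assumptions of the sketch standing in for the kernel page allocator, not part of the kernel interface.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096
#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))

struct linked_page {
	struct linked_page *next;
	char data[LINKED_PAGE_DATA_SIZE];
};

struct chain_allocator {
	struct linked_page *chain;	/* head of the block list */
	size_t used_space;		/* bytes used in the current block */
};

/* Objects must be (much) smaller than LINKED_PAGE_DATA_SIZE. */
static void *chain_alloc(struct chain_allocator *ca, size_t size)
{
	if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
		struct linked_page *lp = calloc(1, sizeof(*lp));

		if (!lp)
			return NULL;
		lp->next = ca->chain;	/* new block becomes the chain head */
		ca->chain = lp;
		ca->used_space = 0;
	}
	void *ret = ca->chain->data + ca->used_space;
	ca->used_space += size;
	return ret;
}

static void chain_free(struct chain_allocator *ca)
{
	while (ca->chain) {
		struct linked_page *next = ca->chain->next;

		free(ca->chain);
		ca->chain = next;
	}
	ca->used_space = LINKED_PAGE_DATA_SIZE;	/* force a fresh block next time */
}

int main(void)
{
	/* used_space starts "full" so the first allocation grabs a block */
	struct chain_allocator ca = { NULL, LINKED_PAGE_DATA_SIZE };
	int *a = chain_alloc(&ca, sizeof(int));
	int *b = chain_alloc(&ca, sizeof(int));

	*a = 1;
	*b = 2;
	printf("%d %d\n", *a, *b);	/* objects live until chain_free() */
	chain_free(&ca);
	return 0;
}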
| 271 | /** | 291 | /** |
| 272 | * saveable - Determine whether a page should be cloned or not. | 292 | * Data types related to memory bitmaps. |
| 273 | * @pfn: The page | 293 | * |
| 294 | * Memory bitmap is a structure consisting of many linked lists of | ||
| 295 | * objects. The main list's elements are of type struct zone_bitmap | ||
| 296 | * and each of them corresponds to one zone. For each zone bitmap | ||
| 297 | * object there is a list of objects of type struct bm_block that | ||
| 298 | * represent the blocks of bit chunks in which information is | ||
| 299 | * stored. | ||
| 300 | * | ||
| 301 | * struct memory_bitmap contains a pointer to the main list of zone | ||
| 302 | * bitmap objects, a struct bm_position used for browsing the bitmap, | ||
| 303 | * and a pointer to the list of pages used for allocating all of the | ||
| 304 | * zone bitmap objects and bitmap block objects. | ||
| 305 | * | ||
| 306 | * NOTE: It has to be possible to lay out the bitmap in memory | ||
| 307 | * using only allocations of order 0. Additionally, the bitmap is | ||
| 308 | * designed to work with an arbitrary number of zones (this is over the | ||
| 309 | * top for now, but let's avoid making unnecessary assumptions ;-). | ||
| 274 | * | 310 | * |
| 275 | * We save a page if it's Reserved, and not in the range of pages | 311 | * struct zone_bitmap contains a pointer to a list of bitmap block |
| 276 | * statically defined as 'unsaveable', or if it isn't reserved, and | 312 | * objects and a pointer to the bitmap block object that has been |
| 277 | * isn't part of a free chunk of pages. | 313 | * most recently used for setting bits. Additionally, it contains the |
| 314 | * pfns that correspond to the start and end of the represented zone. | ||
| 315 | * | ||
| 316 | * struct bm_block contains a pointer to the memory page in which | ||
| 317 | * information is stored (in the form of a block of bit chunks | ||
| 318 | * of type unsigned long each). It also contains the pfns that | ||
| 319 | * correspond to the start and end of the represented memory area and | ||
| 320 | * the number of bit chunks in the block. | ||
| 321 | * | ||
| 322 | * NOTE: Memory bitmaps are used for two types of operations only: | ||
| 323 | * "set a bit" and "find the next bit set". Moreover, the searching | ||
| 324 | * is always carried out after all of the "set a bit" operations | ||
| 325 | * on given bitmap. | ||
| 278 | */ | 326 | */ |
| 279 | 327 | ||
| 280 | static int saveable(struct zone *zone, unsigned long *zone_pfn) | 328 | #define BM_END_OF_MAP (~0UL) |
| 329 | |||
| 330 | #define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long)) | ||
| 331 | #define BM_BITS_PER_CHUNK (sizeof(long) << 3) | ||
| 332 | #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) | ||
| 333 | |||
| 334 | struct bm_block { | ||
| 335 | struct bm_block *next; /* next element of the list */ | ||
| 336 | unsigned long start_pfn; /* pfn represented by the first bit */ | ||
| 337 | unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ | ||
| 338 | unsigned int size; /* number of bit chunks */ | ||
| 339 | unsigned long *data; /* chunks of bits representing pages */ | ||
| 340 | }; | ||
| 341 | |||
| 342 | struct zone_bitmap { | ||
| 343 | struct zone_bitmap *next; /* next element of the list */ | ||
| 344 | unsigned long start_pfn; /* minimal pfn in this zone */ | ||
| 345 | unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ | ||
| 346 | struct bm_block *bm_blocks; /* list of bitmap blocks */ | ||
| 347 | struct bm_block *cur_block; /* recently used bitmap block */ | ||
| 348 | }; | ||
| 349 | |||
| 350 | /* struct bm_position is used for browsing memory bitmaps */ | ||
| 351 | |||
| 352 | struct bm_position { | ||
| 353 | struct zone_bitmap *zone_bm; | ||
| 354 | struct bm_block *block; | ||
| 355 | int chunk; | ||
| 356 | int bit; | ||
| 357 | }; | ||
| 358 | |||
| 359 | struct memory_bitmap { | ||
| 360 | struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ | ||
| 361 | struct linked_page *p_list; /* list of pages used to store zone | ||
| 362 | * bitmap objects and bitmap block | ||
| 363 | * objects | ||
| 364 | */ | ||
| 365 | struct bm_position cur; /* most recently used bit position */ | ||
| 366 | }; | ||
| 367 | |||
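Editor's note: to make the layout above concrete, here is a small standalone computation of where a given pfn lands in the structure, assuming 4 KiB pages and 64-bit longs (so BM_BITS_PER_CHUNK is 64, BM_CHUNKS_PER_BLOCK is 512 and BM_BITS_PER_BLOCK is 32768). The zone start and the pfn are made-up values for illustration.

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define BM_CHUNKS_PER_BLOCK	(PAGE_SIZE / sizeof(long))	/* 512, shown for completeness */
#define BM_BITS_PER_CHUNK	(sizeof(long) << 3)		/* 64 */
#define BM_BITS_PER_BLOCK	(PAGE_SIZE << 3)		/* 32768 */

int main(void)
{
	unsigned long zone_start_pfn = 0x1000;		/* hypothetical zone start */
	unsigned long pfn = zone_start_pfn + 70000;	/* hypothetical pfn to mark */
	unsigned long off = pfn - zone_start_pfn;

	/* which bm_block in the zone's list, and which chunk/bit inside it */
	unsigned long block = off / BM_BITS_PER_BLOCK;
	unsigned long chunk = (off % BM_BITS_PER_BLOCK) / BM_BITS_PER_CHUNK;
	unsigned long bit   = off % BM_BITS_PER_CHUNK;

	printf("block %lu, chunk %lu, bit %lu\n", block, chunk, bit);
	/* With the assumed constants: block 2, chunk 69, bit 48,
	 * i.e. 2*32768 + 69*64 + 48 == 70000, matching the
	 * start_pfn + chunk*BM_BITS_PER_CHUNK + bit reconstruction
	 * used by memory_bm_next_pfn() below. */
	return 0;
}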
| 368 | /* Functions that operate on memory bitmaps */ | ||
| 369 | |||
| 370 | static inline void memory_bm_reset_chunk(struct memory_bitmap *bm) | ||
| 281 | { | 371 | { |
| 282 | unsigned long pfn = *zone_pfn + zone->zone_start_pfn; | 372 | bm->cur.chunk = 0; |
| 283 | struct page *page; | 373 | bm->cur.bit = -1; |
| 374 | } | ||
| 284 | 375 | ||
| 285 | if (!pfn_valid(pfn)) | 376 | static void memory_bm_position_reset(struct memory_bitmap *bm) |
| 286 | return 0; | 377 | { |
| 378 | struct zone_bitmap *zone_bm; | ||
| 287 | 379 | ||
| 288 | page = pfn_to_page(pfn); | 380 | zone_bm = bm->zone_bm_list; |
| 289 | if (PageNosave(page)) | 381 | bm->cur.zone_bm = zone_bm; |
| 290 | return 0; | 382 | bm->cur.block = zone_bm->bm_blocks; |
| 291 | if (PageReserved(page) && pfn_is_nosave(pfn)) | 383 | memory_bm_reset_chunk(bm); |
| 292 | return 0; | 384 | } |
| 293 | if (PageNosaveFree(page)) | 385 | |
| 294 | return 0; | 386 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); |
| 295 | 387 | ||
| 296 | return 1; | 388 | /** |
| 389 | * create_bm_block_list - create a list of block bitmap objects | ||
| 390 | */ | ||
| 391 | |||
| 392 | static inline struct bm_block * | ||
| 393 | create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) | ||
| 394 | { | ||
| 395 | struct bm_block *bblist = NULL; | ||
| 396 | |||
| 397 | while (nr_blocks-- > 0) { | ||
| 398 | struct bm_block *bb; | ||
| 399 | |||
| 400 | bb = chain_alloc(ca, sizeof(struct bm_block)); | ||
| 401 | if (!bb) | ||
| 402 | return NULL; | ||
| 403 | |||
| 404 | bb->next = bblist; | ||
| 405 | bblist = bb; | ||
| 406 | } | ||
| 407 | return bblist; | ||
| 297 | } | 408 | } |
| 298 | 409 | ||
| 299 | unsigned int count_data_pages(void) | 410 | /** |
| 411 | * create_zone_bm_list - create a list of zone bitmap objects | ||
| 412 | */ | ||
| 413 | |||
| 414 | static inline struct zone_bitmap * | ||
| 415 | create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) | ||
| 300 | { | 416 | { |
| 301 | struct zone *zone; | 417 | struct zone_bitmap *zbmlist = NULL; |
| 302 | unsigned long zone_pfn; | ||
| 303 | unsigned int n = 0; | ||
| 304 | 418 | ||
| 305 | for_each_zone (zone) { | 419 | while (nr_zones-- > 0) { |
| 306 | if (is_highmem(zone)) | 420 | struct zone_bitmap *zbm; |
| 307 | continue; | 421 | |
| 308 | mark_free_pages(zone); | 422 | zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); |
| 309 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 423 | if (!zbm) |
| 310 | n += saveable(zone, &zone_pfn); | 424 | return NULL; |
| 425 | |||
| 426 | zbm->next = zbmlist; | ||
| 427 | zbmlist = zbm; | ||
| 311 | } | 428 | } |
| 312 | return n; | 429 | return zbmlist; |
| 313 | } | 430 | } |
| 314 | 431 | ||
| 315 | static void copy_data_pages(struct pbe *pblist) | 432 | /** |
| 433 | * memory_bm_create - allocate memory for a memory bitmap | ||
| 434 | */ | ||
| 435 | |||
| 436 | static int | ||
| 437 | memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | ||
| 316 | { | 438 | { |
| 439 | struct chain_allocator ca; | ||
| 317 | struct zone *zone; | 440 | struct zone *zone; |
| 318 | unsigned long zone_pfn; | 441 | struct zone_bitmap *zone_bm; |
| 319 | struct pbe *pbe, *p; | 442 | struct bm_block *bb; |
| 443 | unsigned int nr; | ||
| 444 | |||
| 445 | chain_init(&ca, gfp_mask, safe_needed); | ||
| 446 | |||
| 447 | /* Compute the number of zones */ | ||
| 448 | nr = 0; | ||
| 449 | for_each_zone (zone) | ||
| 450 | if (populated_zone(zone) && !is_highmem(zone)) | ||
| 451 | nr++; | ||
| 452 | |||
| 453 | /* Allocate the list of zones bitmap objects */ | ||
| 454 | zone_bm = create_zone_bm_list(nr, &ca); | ||
| 455 | bm->zone_bm_list = zone_bm; | ||
| 456 | if (!zone_bm) { | ||
| 457 | chain_free(&ca, PG_UNSAFE_CLEAR); | ||
| 458 | return -ENOMEM; | ||
| 459 | } | ||
| 320 | 460 | ||
| 321 | pbe = pblist; | 461 | /* Initialize the zone bitmap objects */ |
| 322 | for_each_zone (zone) { | 462 | for_each_zone (zone) { |
| 323 | if (is_highmem(zone)) | 463 | unsigned long pfn; |
| 464 | |||
| 465 | if (!populated_zone(zone) || is_highmem(zone)) | ||
| 324 | continue; | 466 | continue; |
| 325 | mark_free_pages(zone); | 467 | |
| 326 | /* This is necessary for swsusp_free() */ | 468 | zone_bm->start_pfn = zone->zone_start_pfn; |
| 327 | for_each_pb_page (p, pblist) | 469 | zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; |
| 328 | SetPageNosaveFree(virt_to_page(p)); | 470 | /* Allocate the list of bitmap block objects */ |
| 329 | for_each_pbe (p, pblist) | 471 | nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); |
| 330 | SetPageNosaveFree(virt_to_page(p->address)); | 472 | bb = create_bm_block_list(nr, &ca); |
| 331 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | 473 | zone_bm->bm_blocks = bb; |
| 332 | if (saveable(zone, &zone_pfn)) { | 474 | zone_bm->cur_block = bb; |
| 333 | struct page *page; | 475 | if (!bb) |
| 334 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | 476 | goto Free; |
| 335 | BUG_ON(!pbe); | 477 | |
| 336 | pbe->orig_address = (unsigned long)page_address(page); | 478 | nr = zone->spanned_pages; |
| 337 | /* copy_page is not usable for copying task structs. */ | 479 | pfn = zone->zone_start_pfn; |
| 338 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | 480 | /* Initialize the bitmap block objects */ |
| 339 | pbe = pbe->next; | 481 | while (bb) { |
| 482 | unsigned long *ptr; | ||
| 483 | |||
| 484 | ptr = alloc_image_page(gfp_mask, safe_needed); | ||
| 485 | bb->data = ptr; | ||
| 486 | if (!ptr) | ||
| 487 | goto Free; | ||
| 488 | |||
| 489 | bb->start_pfn = pfn; | ||
| 490 | if (nr >= BM_BITS_PER_BLOCK) { | ||
| 491 | pfn += BM_BITS_PER_BLOCK; | ||
| 492 | bb->size = BM_CHUNKS_PER_BLOCK; | ||
| 493 | nr -= BM_BITS_PER_BLOCK; | ||
| 494 | } else { | ||
| 495 | /* This is executed only once in the loop */ | ||
| 496 | pfn += nr; | ||
| 497 | bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK); | ||
| 340 | } | 498 | } |
| 499 | bb->end_pfn = pfn; | ||
| 500 | bb = bb->next; | ||
| 341 | } | 501 | } |
| 502 | zone_bm = zone_bm->next; | ||
| 342 | } | 503 | } |
| 343 | BUG_ON(pbe); | 504 | bm->p_list = ca.chain; |
| 344 | } | 505 | memory_bm_position_reset(bm); |
| 506 | return 0; | ||
| 345 | 507 | ||
| 508 | Free: | ||
| 509 | bm->p_list = ca.chain; | ||
| 510 | memory_bm_free(bm, PG_UNSAFE_CLEAR); | ||
| 511 | return -ENOMEM; | ||
| 512 | } | ||
| 346 | 513 | ||
| 347 | /** | 514 | /** |
| 348 | * free_pagedir - free pages allocated with alloc_pagedir() | 515 | * memory_bm_free - free memory occupied by the memory bitmap @bm |
| 349 | */ | 516 | */ |
| 350 | 517 | ||
| 351 | static void free_pagedir(struct pbe *pblist, int clear_nosave_free) | 518 | static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) |
| 352 | { | 519 | { |
| 353 | struct pbe *pbe; | 520 | struct zone_bitmap *zone_bm; |
| 354 | 521 | ||
| 355 | while (pblist) { | 522 | /* Free the list of bit blocks for each zone_bitmap object */ |
| 356 | pbe = (pblist + PB_PAGE_SKIP)->next; | 523 | zone_bm = bm->zone_bm_list; |
| 357 | ClearPageNosave(virt_to_page(pblist)); | 524 | while (zone_bm) { |
| 358 | if (clear_nosave_free) | 525 | struct bm_block *bb; |
| 359 | ClearPageNosaveFree(virt_to_page(pblist)); | 526 | |
| 360 | free_page((unsigned long)pblist); | 527 | bb = zone_bm->bm_blocks; |
| 361 | pblist = pbe; | 528 | while (bb) { |
| 529 | if (bb->data) | ||
| 530 | free_image_page(bb->data, clear_nosave_free); | ||
| 531 | bb = bb->next; | ||
| 532 | } | ||
| 533 | zone_bm = zone_bm->next; | ||
| 362 | } | 534 | } |
| 535 | free_list_of_pages(bm->p_list, clear_nosave_free); | ||
| 536 | bm->zone_bm_list = NULL; | ||
| 363 | } | 537 | } |
| 364 | 538 | ||
| 365 | /** | 539 | /** |
| 366 | * fill_pb_page - Create a list of PBEs on a given memory page | 540 | * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds |
| 541 | * to the given pfn. The cur_zone_bm member of @bm and the cur_block member | ||
| 542 | * of @bm->cur_zone_bm are updated. | ||
| 543 | * | ||
| 544 | * If the bit cannot be set, the function returns -EINVAL. | ||
| 367 | */ | 545 | */ |
| 368 | 546 | ||
| 369 | static inline void fill_pb_page(struct pbe *pbpage) | 547 | static int |
| 548 | memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) | ||
| 370 | { | 549 | { |
| 371 | struct pbe *p; | 550 | struct zone_bitmap *zone_bm; |
| 372 | 551 | struct bm_block *bb; | |
| 373 | p = pbpage; | 552 | |
| 374 | pbpage += PB_PAGE_SKIP; | 553 | /* Check if the pfn is from the current zone */ |
| 375 | do | 554 | zone_bm = bm->cur.zone_bm; |
| 376 | p->next = p + 1; | 555 | if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { |
| 377 | while (++p < pbpage); | 556 | zone_bm = bm->zone_bm_list; |
| 557 | /* We don't assume that the zones are sorted by pfns */ | ||
| 558 | while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { | ||
| 559 | zone_bm = zone_bm->next; | ||
| 560 | if (unlikely(!zone_bm)) | ||
| 561 | return -EINVAL; | ||
| 562 | } | ||
| 563 | bm->cur.zone_bm = zone_bm; | ||
| 564 | } | ||
| 565 | /* Check if the pfn corresponds to the current bitmap block */ | ||
| 566 | bb = zone_bm->cur_block; | ||
| 567 | if (pfn < bb->start_pfn) | ||
| 568 | bb = zone_bm->bm_blocks; | ||
| 569 | |||
| 570 | while (pfn >= bb->end_pfn) { | ||
| 571 | bb = bb->next; | ||
| 572 | if (unlikely(!bb)) | ||
| 573 | return -EINVAL; | ||
| 574 | } | ||
| 575 | zone_bm->cur_block = bb; | ||
| 576 | pfn -= bb->start_pfn; | ||
| 577 | set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK); | ||
| 578 | return 0; | ||
| 378 | } | 579 | } |
| 379 | 580 | ||
| 380 | /** | 581 | /* Two auxiliary functions for memory_bm_next_pfn */ |
| 381 | * create_pbe_list - Create a list of PBEs on top of a given chain | ||
| 382 | * of memory pages allocated with alloc_pagedir() | ||
| 383 | */ | ||
| 384 | 582 | ||
| 385 | static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) | 583 | /* Find the first set bit in the given chunk, if there is one */ |
| 386 | { | ||
| 387 | struct pbe *pbpage, *p; | ||
| 388 | unsigned int num = PBES_PER_PAGE; | ||
| 389 | 584 | ||
| 390 | for_each_pb_page (pbpage, pblist) { | 585 | static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p) |
| 391 | if (num >= nr_pages) | 586 | { |
| 392 | break; | 587 | bit++; |
| 588 | while (bit < BM_BITS_PER_CHUNK) { | ||
| 589 | if (test_bit(bit, chunk_p)) | ||
| 590 | return bit; | ||
| 393 | 591 | ||
| 394 | fill_pb_page(pbpage); | 592 | bit++; |
| 395 | num += PBES_PER_PAGE; | ||
| 396 | } | ||
| 397 | if (pbpage) { | ||
| 398 | for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) | ||
| 399 | p->next = p + 1; | ||
| 400 | p->next = NULL; | ||
| 401 | } | 593 | } |
| 594 | return -1; | ||
| 402 | } | 595 | } |
| 403 | 596 | ||
| 404 | static unsigned int unsafe_pages; | 597 | /* Find a chunk containing some bits set in given block of bits */ |
| 598 | |||
| 599 | static inline int next_chunk_in_block(int n, struct bm_block *bb) | ||
| 600 | { | ||
| 601 | n++; | ||
| 602 | while (n < bb->size) { | ||
| 603 | if (bb->data[n]) | ||
| 604 | return n; | ||
| 605 | |||
| 606 | n++; | ||
| 607 | } | ||
| 608 | return -1; | ||
| 609 | } | ||
| 405 | 610 | ||
| 406 | /** | 611 | /** |
| 407 | * @safe_needed - on resume, for storing the PBE list and the image, | 612 | * memory_bm_next_pfn - find the pfn that corresponds to the next set bit |
| 408 | * we can only use memory pages that do not conflict with the pages | 613 | * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is |
| 409 | * used before suspend. | 614 | * returned. |
| 410 | * | 615 | * |
| 411 | * The unsafe pages are marked with the PG_nosave_free flag | 616 | * It is required to run memory_bm_position_reset() before the first call to |
| 412 | * and we count them using unsafe_pages | 617 | * this function. |
| 413 | */ | 618 | */ |
| 414 | 619 | ||
| 415 | static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | 620 | static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) |
| 416 | { | 621 | { |
| 417 | void *res; | 622 | struct zone_bitmap *zone_bm; |
| 418 | 623 | struct bm_block *bb; | |
| 419 | res = (void *)get_zeroed_page(gfp_mask); | 624 | int chunk; |
| 420 | if (safe_needed) | 625 | int bit; |
| 421 | while (res && PageNosaveFree(virt_to_page(res))) { | 626 | |
| 422 | /* The page is unsafe, mark it for swsusp_free() */ | 627 | do { |
| 423 | SetPageNosave(virt_to_page(res)); | 628 | bb = bm->cur.block; |
| 424 | unsafe_pages++; | 629 | do { |
| 425 | res = (void *)get_zeroed_page(gfp_mask); | 630 | chunk = bm->cur.chunk; |
| 631 | bit = bm->cur.bit; | ||
| 632 | do { | ||
| 633 | bit = next_bit_in_chunk(bit, bb->data + chunk); | ||
| 634 | if (bit >= 0) | ||
| 635 | goto Return_pfn; | ||
| 636 | |||
| 637 | chunk = next_chunk_in_block(chunk, bb); | ||
| 638 | bit = -1; | ||
| 639 | } while (chunk >= 0); | ||
| 640 | bb = bb->next; | ||
| 641 | bm->cur.block = bb; | ||
| 642 | memory_bm_reset_chunk(bm); | ||
| 643 | } while (bb); | ||
| 644 | zone_bm = bm->cur.zone_bm->next; | ||
| 645 | if (zone_bm) { | ||
| 646 | bm->cur.zone_bm = zone_bm; | ||
| 647 | bm->cur.block = zone_bm->bm_blocks; | ||
| 648 | memory_bm_reset_chunk(bm); | ||
| 426 | } | 649 | } |
| 427 | if (res) { | 650 | } while (zone_bm); |
| 428 | SetPageNosave(virt_to_page(res)); | 651 | memory_bm_position_reset(bm); |
| 429 | SetPageNosaveFree(virt_to_page(res)); | 652 | return BM_END_OF_MAP; |
| 430 | } | 653 | |
| 654 | Return_pfn: | ||
| 655 | bm->cur.chunk = chunk; | ||
| 656 | bm->cur.bit = bit; | ||
| 657 | return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; | ||
| 658 | } | ||
| 659 | |||
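Editor's note: as the NOTE earlier in this file states, the snapshot code only ever performs two operations on a bitmap: "set a bit", and, after all bits have been set, "walk the set bits in order" via memory_bm_position_reset() and memory_bm_next_pfn() until BM_END_OF_MAP. The sketch below models just that usage protocol with a flat array in userspace; the flat array, MAX_PFN and the function names are simplifications of the zone/block structure, not the kernel API.

#include <stdio.h>

#define MAX_PFN		1024UL
#define BM_END_OF_MAP	(~0UL)
#define BITS_PER_LONG	(8 * sizeof(unsigned long))

static unsigned long bits[MAX_PFN / BITS_PER_LONG];
static unsigned long cur;	/* models struct bm_position */

static void bm_set_bit(unsigned long pfn)
{
	bits[pfn / BITS_PER_LONG] |= 1UL << (pfn % BITS_PER_LONG);
}

static void bm_position_reset(void)
{
	cur = 0;
}

static unsigned long bm_next_pfn(void)
{
	while (cur < MAX_PFN) {
		unsigned long pfn = cur++;

		if (bits[pfn / BITS_PER_LONG] & (1UL << (pfn % BITS_PER_LONG)))
			return pfn;
	}
	return BM_END_OF_MAP;
}

int main(void)
{
	unsigned long pfn;

	/* phase 1: mark pages */
	bm_set_bit(3);
	bm_set_bit(200);
	bm_set_bit(777);

	/* phase 2: walk the marked pages in order */
	bm_position_reset();
	while ((pfn = bm_next_pfn()) != BM_END_OF_MAP)
		printf("pfn %lu\n", pfn);	/* prints 3, 200, 777 */
	return 0;
}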
| 660 | /** | ||
| 661 | * snapshot_additional_pages - estimate the number of additional pages | ||
| 662 | * that will be needed for setting up the suspend image data structures for the given | ||
| 663 | * zone (usually the returned value is greater than the exact number) | ||
| 664 | */ | ||
| 665 | |||
| 666 | unsigned int snapshot_additional_pages(struct zone *zone) | ||
| 667 | { | ||
| 668 | unsigned int res; | ||
| 669 | |||
| 670 | res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); | ||
| 671 | res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); | ||
| 431 | return res; | 672 | return res; |
| 432 | } | 673 | } |
| 433 | 674 | ||
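Editor's note: a rough worked example of the estimate above, under the assumptions of 4 KiB pages, 64-bit longs and roughly 40 bytes for struct bm_block (all assumptions of this sketch): a zone spanning 262144 pages needs 8 bitmap-block pages plus one page for the bm_block objects themselves, so 9 additional pages in total.

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define BM_BITS_PER_BLOCK	(PAGE_SIZE << 3)	/* 32768 */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long spanned_pages = 262144;	/* hypothetical zone size (1 GiB of 4 KiB pages) */
	unsigned long bm_block_size = 40;	/* approx. sizeof(struct bm_block) on 64-bit */

	unsigned long res = DIV_ROUND_UP(spanned_pages, BM_BITS_PER_BLOCK);
	res += DIV_ROUND_UP(res * bm_block_size, PAGE_SIZE);
	printf("%lu additional pages\n", res);	/* prints 9: 8 data pages + 1 */
	return 0;
}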
| 434 | unsigned long get_safe_page(gfp_t gfp_mask) | 675 | /** |
| 676 | * pfn_is_nosave - check if given pfn is in the 'nosave' section | ||
| 677 | */ | ||
| 678 | |||
| 679 | static inline int pfn_is_nosave(unsigned long pfn) | ||
| 435 | { | 680 | { |
| 436 | return (unsigned long)alloc_image_page(gfp_mask, 1); | 681 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; |
| 682 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | ||
| 683 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
| 437 | } | 684 | } |
| 438 | 685 | ||
| 439 | /** | 686 | /** |
| 440 | * alloc_pagedir - Allocate the page directory. | 687 | * saveable - Determine whether a page should be cloned or not. |
| 441 | * | 688 | * @pfn: The page |
| 442 | * First, determine exactly how many pages we need and | ||
| 443 | * allocate them. | ||
| 444 | * | ||
| 445 | * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE | ||
| 446 | * struct pbe elements (pbes) and the last element in the page points | ||
| 447 | * to the next page. | ||
| 448 | * | 689 | * |
| 449 | * On each page we set up a list of struct_pbe elements. | 690 | * We save a page if it isn't Nosave, and is not in the range of pages |
| 691 | * statically defined as 'unsaveable', and it | ||
| 692 | * isn't a part of a free chunk of pages. | ||
| 450 | */ | 693 | */ |
| 451 | 694 | ||
| 452 | static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, | 695 | static struct page *saveable_page(unsigned long pfn) |
| 453 | int safe_needed) | ||
| 454 | { | 696 | { |
| 455 | unsigned int num; | 697 | struct page *page; |
| 456 | struct pbe *pblist, *pbe; | 698 | |
| 699 | if (!pfn_valid(pfn)) | ||
| 700 | return NULL; | ||
| 457 | 701 | ||
| 458 | if (!nr_pages) | 702 | page = pfn_to_page(pfn); |
| 703 | |||
| 704 | if (PageNosave(page)) | ||
| 705 | return NULL; | ||
| 706 | if (PageReserved(page) && pfn_is_nosave(pfn)) | ||
| 459 | return NULL; | 707 | return NULL; |
| 708 | if (PageNosaveFree(page)) | ||
| 709 | return NULL; | ||
| 710 | |||
| 711 | return page; | ||
| 712 | } | ||
| 713 | |||
| 714 | unsigned int count_data_pages(void) | ||
| 715 | { | ||
| 716 | struct zone *zone; | ||
| 717 | unsigned long pfn, max_zone_pfn; | ||
| 718 | unsigned int n = 0; | ||
| 719 | |||
| 720 | for_each_zone (zone) { | ||
| 721 | if (is_highmem(zone)) | ||
| 722 | continue; | ||
| 723 | mark_free_pages(zone); | ||
| 724 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
| 725 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | ||
| 726 | n += !!saveable_page(pfn); | ||
| 727 | } | ||
| 728 | return n; | ||
| 729 | } | ||
| 730 | |||
| 731 | static inline void copy_data_page(long *dst, long *src) | ||
| 732 | { | ||
| 733 | int n; | ||
| 734 | |||
| 735 | /* copy_page and memcpy are not usable for copying task structs. */ | ||
| 736 | for (n = PAGE_SIZE / sizeof(long); n; n--) | ||
| 737 | *dst++ = *src++; | ||
| 738 | } | ||
| 739 | |||
| 740 | static void | ||
| 741 | copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) | ||
| 742 | { | ||
| 743 | struct zone *zone; | ||
| 744 | unsigned long pfn; | ||
| 745 | |||
| 746 | for_each_zone (zone) { | ||
| 747 | unsigned long max_zone_pfn; | ||
| 460 | 748 | ||
| 461 | pblist = alloc_image_page(gfp_mask, safe_needed); | 749 | if (is_highmem(zone)) |
| 462 | /* FIXME: rewrite this ugly loop */ | 750 | continue; |
| 463 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | 751 | |
| 464 | pbe = pbe->next, num += PBES_PER_PAGE) { | 752 | mark_free_pages(zone); |
| 465 | pbe += PB_PAGE_SKIP; | 753 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
| 466 | pbe->next = alloc_image_page(gfp_mask, safe_needed); | 754 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 755 | if (saveable_page(pfn)) | ||
| 756 | memory_bm_set_bit(orig_bm, pfn); | ||
| 467 | } | 757 | } |
| 468 | if (!pbe) { /* get_zeroed_page() failed */ | 758 | memory_bm_position_reset(orig_bm); |
| 469 | free_pagedir(pblist, 1); | 759 | memory_bm_position_reset(copy_bm); |
| 470 | pblist = NULL; | 760 | do { |
| 471 | } else | 761 | pfn = memory_bm_next_pfn(orig_bm); |
| 472 | create_pbe_list(pblist, nr_pages); | 762 | if (likely(pfn != BM_END_OF_MAP)) { |
| 473 | return pblist; | 763 | struct page *page; |
| 764 | void *src; | ||
| 765 | |||
| 766 | page = pfn_to_page(pfn); | ||
| 767 | src = page_address(page); | ||
| 768 | page = pfn_to_page(memory_bm_next_pfn(copy_bm)); | ||
| 769 | copy_data_page(page_address(page), src); | ||
| 770 | } | ||
| 771 | } while (pfn != BM_END_OF_MAP); | ||
| 474 | } | 772 | } |
| 475 | 773 | ||
| 476 | /** | 774 | /** |
| 477 | * Free pages we allocated for suspend. Suspend pages are alocated | 775 | * swsusp_free - free pages allocated for the suspend. |
| 478 | * before atomic copy, so we need to free them after resume. | 776 | * |
| 777 | * Suspend pages are allocated before the atomic copy is made, so we | ||
| 778 | * need to release them after the resume. | ||
| 479 | */ | 779 | */ |
| 480 | 780 | ||
| 481 | void swsusp_free(void) | 781 | void swsusp_free(void) |
| 482 | { | 782 | { |
| 483 | struct zone *zone; | 783 | struct zone *zone; |
| 484 | unsigned long zone_pfn; | 784 | unsigned long pfn, max_zone_pfn; |
| 485 | 785 | ||
| 486 | for_each_zone(zone) { | 786 | for_each_zone(zone) { |
| 487 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 787 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
| 488 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { | 788 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 489 | struct page *page; | 789 | if (pfn_valid(pfn)) { |
| 490 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | 790 | struct page *page = pfn_to_page(pfn); |
| 791 | |||
| 491 | if (PageNosave(page) && PageNosaveFree(page)) { | 792 | if (PageNosave(page) && PageNosaveFree(page)) { |
| 492 | ClearPageNosave(page); | 793 | ClearPageNosave(page); |
| 493 | ClearPageNosaveFree(page); | 794 | ClearPageNosaveFree(page); |
| @@ -497,7 +798,7 @@ void swsusp_free(void) | |||
| 497 | } | 798 | } |
| 498 | nr_copy_pages = 0; | 799 | nr_copy_pages = 0; |
| 499 | nr_meta_pages = 0; | 800 | nr_meta_pages = 0; |
| 500 | pagedir_nosave = NULL; | 801 | restore_pblist = NULL; |
| 501 | buffer = NULL; | 802 | buffer = NULL; |
| 502 | } | 803 | } |
| 503 | 804 | ||
| @@ -512,46 +813,57 @@ void swsusp_free(void) | |||
| 512 | static int enough_free_mem(unsigned int nr_pages) | 813 | static int enough_free_mem(unsigned int nr_pages) |
| 513 | { | 814 | { |
| 514 | struct zone *zone; | 815 | struct zone *zone; |
| 515 | unsigned int n = 0; | 816 | unsigned int free = 0, meta = 0; |
| 516 | 817 | ||
| 517 | for_each_zone (zone) | 818 | for_each_zone (zone) |
| 518 | if (!is_highmem(zone)) | 819 | if (!is_highmem(zone)) { |
| 519 | n += zone->free_pages; | 820 | free += zone->free_pages; |
| 520 | pr_debug("swsusp: available memory: %u pages\n", n); | 821 | meta += snapshot_additional_pages(zone); |
| 521 | return n > (nr_pages + PAGES_FOR_IO + | 822 | } |
| 522 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
| 523 | } | ||
| 524 | 823 | ||
| 525 | static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) | 824 | pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", |
| 526 | { | 825 | nr_pages, PAGES_FOR_IO, meta, free); |
| 527 | struct pbe *p; | ||
| 528 | 826 | ||
| 529 | for_each_pbe (p, pblist) { | 827 | return free > nr_pages + PAGES_FOR_IO + meta; |
| 530 | p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed); | ||
| 531 | if (!p->address) | ||
| 532 | return -ENOMEM; | ||
| 533 | } | ||
| 534 | return 0; | ||
| 535 | } | 828 | } |
| 536 | 829 | ||
| 537 | static struct pbe *swsusp_alloc(unsigned int nr_pages) | 830 | static int |
| 831 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | ||
| 832 | unsigned int nr_pages) | ||
| 538 | { | 833 | { |
| 539 | struct pbe *pblist; | 834 | int error; |
| 540 | 835 | ||
| 541 | if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) { | 836 | error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); |
| 542 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | 837 | if (error) |
| 543 | return NULL; | 838 | goto Free; |
| 544 | } | ||
| 545 | 839 | ||
| 546 | if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { | 840 | error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY); |
| 547 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | 841 | if (error) |
| 548 | swsusp_free(); | 842 | goto Free; |
| 549 | return NULL; | 843 | |
| 844 | while (nr_pages-- > 0) { | ||
| 845 | struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); | ||
| 846 | if (!page) | ||
| 847 | goto Free; | ||
| 848 | |||
| 849 | SetPageNosave(page); | ||
| 850 | SetPageNosaveFree(page); | ||
| 851 | memory_bm_set_bit(copy_bm, page_to_pfn(page)); | ||
| 550 | } | 852 | } |
| 853 | return 0; | ||
| 551 | 854 | ||
| 552 | return pblist; | 855 | Free: |
| 856 | swsusp_free(); | ||
| 857 | return -ENOMEM; | ||
| 553 | } | 858 | } |
| 554 | 859 | ||
| 860 | /* Memory bitmap used for marking saveable pages */ | ||
| 861 | static struct memory_bitmap orig_bm; | ||
| 862 | /* Memory bitmap used for marking allocated pages that will contain the copies | ||
| 863 | * of saveable pages | ||
| 864 | */ | ||
| 865 | static struct memory_bitmap copy_bm; | ||
| 866 | |||
| 555 | asmlinkage int swsusp_save(void) | 867 | asmlinkage int swsusp_save(void) |
| 556 | { | 868 | { |
| 557 | unsigned int nr_pages; | 869 | unsigned int nr_pages; |
| @@ -562,25 +874,19 @@ asmlinkage int swsusp_save(void) | |||
| 562 | nr_pages = count_data_pages(); | 874 | nr_pages = count_data_pages(); |
| 563 | printk("swsusp: Need to copy %u pages\n", nr_pages); | 875 | printk("swsusp: Need to copy %u pages\n", nr_pages); |
| 564 | 876 | ||
| 565 | pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", | ||
| 566 | nr_pages, | ||
| 567 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, | ||
| 568 | PAGES_FOR_IO, nr_free_pages()); | ||
| 569 | |||
| 570 | if (!enough_free_mem(nr_pages)) { | 877 | if (!enough_free_mem(nr_pages)) { |
| 571 | printk(KERN_ERR "swsusp: Not enough free memory\n"); | 878 | printk(KERN_ERR "swsusp: Not enough free memory\n"); |
| 572 | return -ENOMEM; | 879 | return -ENOMEM; |
| 573 | } | 880 | } |
| 574 | 881 | ||
| 575 | pagedir_nosave = swsusp_alloc(nr_pages); | 882 | if (swsusp_alloc(&orig_bm, ©_bm, nr_pages)) |
| 576 | if (!pagedir_nosave) | ||
| 577 | return -ENOMEM; | 883 | return -ENOMEM; |
| 578 | 884 | ||
| 579 | /* During allocating of suspend pagedir, new cold pages may appear. | 885 | /* During allocating of suspend pagedir, new cold pages may appear. |
| 580 | * Kill them. | 886 | * Kill them. |
| 581 | */ | 887 | */ |
| 582 | drain_local_pages(); | 888 | drain_local_pages(); |
| 583 | copy_data_pages(pagedir_nosave); | 889 | copy_data_pages(©_bm, &orig_bm); |
| 584 | 890 | ||
| 585 | /* | 891 | /* |
| 586 | * End of critical section. From now on, we can write to memory, | 892 | * End of critical section. From now on, we can write to memory, |
| @@ -609,22 +915,20 @@ static void init_header(struct swsusp_info *info) | |||
| 609 | } | 915 | } |
| 610 | 916 | ||
| 611 | /** | 917 | /** |
| 612 | * pack_orig_addresses - the .orig_address fields of the PBEs from the | 918 | * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm |
| 613 | * list starting at @pbe are stored in the array @buf[] (1 page) | 919 | * are stored in the array @buf[] (1 page at a time) |
| 614 | */ | 920 | */ |
| 615 | 921 | ||
| 616 | static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe) | 922 | static inline void |
| 923 | pack_pfns(unsigned long *buf, struct memory_bitmap *bm) | ||
| 617 | { | 924 | { |
| 618 | int j; | 925 | int j; |
| 619 | 926 | ||
| 620 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { | 927 | for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { |
| 621 | buf[j] = pbe->orig_address; | 928 | buf[j] = memory_bm_next_pfn(bm); |
| 622 | pbe = pbe->next; | 929 | if (unlikely(buf[j] == BM_END_OF_MAP)) |
| 930 | break; | ||
| 623 | } | 931 | } |
| 624 | if (!pbe) | ||
| 625 | for (; j < PAGE_SIZE / sizeof(long); j++) | ||
| 626 | buf[j] = 0; | ||
| 627 | return pbe; | ||
| 628 | } | 932 | } |
| 629 | 933 | ||
| 630 | /** | 934 | /** |
| @@ -651,37 +955,39 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb | |||
| 651 | 955 | ||
| 652 | int snapshot_read_next(struct snapshot_handle *handle, size_t count) | 956 | int snapshot_read_next(struct snapshot_handle *handle, size_t count) |
| 653 | { | 957 | { |
| 654 | if (handle->page > nr_meta_pages + nr_copy_pages) | 958 | if (handle->cur > nr_meta_pages + nr_copy_pages) |
| 655 | return 0; | 959 | return 0; |
| 960 | |||
| 656 | if (!buffer) { | 961 | if (!buffer) { |
| 657 | /* This makes the buffer be freed by swsusp_free() */ | 962 | /* This makes the buffer be freed by swsusp_free() */ |
| 658 | buffer = alloc_image_page(GFP_ATOMIC, 0); | 963 | buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); |
| 659 | if (!buffer) | 964 | if (!buffer) |
| 660 | return -ENOMEM; | 965 | return -ENOMEM; |
| 661 | } | 966 | } |
| 662 | if (!handle->offset) { | 967 | if (!handle->offset) { |
| 663 | init_header((struct swsusp_info *)buffer); | 968 | init_header((struct swsusp_info *)buffer); |
| 664 | handle->buffer = buffer; | 969 | handle->buffer = buffer; |
| 665 | handle->pbe = pagedir_nosave; | 970 | memory_bm_position_reset(&orig_bm); |
| 971 | memory_bm_position_reset(©_bm); | ||
| 666 | } | 972 | } |
| 667 | if (handle->prev < handle->page) { | 973 | if (handle->prev < handle->cur) { |
| 668 | if (handle->page <= nr_meta_pages) { | 974 | if (handle->cur <= nr_meta_pages) { |
| 669 | handle->pbe = pack_orig_addresses(buffer, handle->pbe); | 975 | memset(buffer, 0, PAGE_SIZE); |
| 670 | if (!handle->pbe) | 976 | pack_pfns(buffer, &orig_bm); |
| 671 | handle->pbe = pagedir_nosave; | ||
| 672 | } else { | 977 | } else { |
| 673 | handle->buffer = (void *)handle->pbe->address; | 978 | unsigned long pfn = memory_bm_next_pfn(©_bm); |
| 674 | handle->pbe = handle->pbe->next; | 979 | |
| 980 | handle->buffer = page_address(pfn_to_page(pfn)); | ||
| 675 | } | 981 | } |
| 676 | handle->prev = handle->page; | 982 | handle->prev = handle->cur; |
| 677 | } | 983 | } |
| 678 | handle->buf_offset = handle->page_offset; | 984 | handle->buf_offset = handle->cur_offset; |
| 679 | if (handle->page_offset + count >= PAGE_SIZE) { | 985 | if (handle->cur_offset + count >= PAGE_SIZE) { |
| 680 | count = PAGE_SIZE - handle->page_offset; | 986 | count = PAGE_SIZE - handle->cur_offset; |
| 681 | handle->page_offset = 0; | 987 | handle->cur_offset = 0; |
| 682 | handle->page++; | 988 | handle->cur++; |
| 683 | } else { | 989 | } else { |
| 684 | handle->page_offset += count; | 990 | handle->cur_offset += count; |
| 685 | } | 991 | } |
| 686 | handle->offset += count; | 992 | handle->offset += count; |
| 687 | return count; | 993 | return count; |
| @@ -693,47 +999,50 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) | |||
| 693 | * had been used before suspend | 999 | * had been used before suspend |
| 694 | */ | 1000 | */ |
| 695 | 1001 | ||
| 696 | static int mark_unsafe_pages(struct pbe *pblist) | 1002 | static int mark_unsafe_pages(struct memory_bitmap *bm) |
| 697 | { | 1003 | { |
| 698 | struct zone *zone; | 1004 | struct zone *zone; |
| 699 | unsigned long zone_pfn; | 1005 | unsigned long pfn, max_zone_pfn; |
| 700 | struct pbe *p; | ||
| 701 | |||
| 702 | if (!pblist) /* a sanity check */ | ||
| 703 | return -EINVAL; | ||
| 704 | 1006 | ||
| 705 | /* Clear page flags */ | 1007 | /* Clear page flags */ |
| 706 | for_each_zone (zone) { | 1008 | for_each_zone (zone) { |
| 707 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 1009 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
| 708 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) | 1010 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 709 | ClearPageNosaveFree(pfn_to_page(zone_pfn + | 1011 | if (pfn_valid(pfn)) |
| 710 | zone->zone_start_pfn)); | 1012 | ClearPageNosaveFree(pfn_to_page(pfn)); |
| 711 | } | 1013 | } |
| 712 | 1014 | ||
| 713 | /* Mark orig addresses */ | 1015 | /* Mark pages that correspond to the "original" pfns as "unsafe" */ |
| 714 | for_each_pbe (p, pblist) { | 1016 | memory_bm_position_reset(bm); |
| 715 | if (virt_addr_valid(p->orig_address)) | 1017 | do { |
| 716 | SetPageNosaveFree(virt_to_page(p->orig_address)); | 1018 | pfn = memory_bm_next_pfn(bm); |
| 717 | else | 1019 | if (likely(pfn != BM_END_OF_MAP)) { |
| 718 | return -EFAULT; | 1020 | if (likely(pfn_valid(pfn))) |
| 719 | } | 1021 | SetPageNosaveFree(pfn_to_page(pfn)); |
| 1022 | else | ||
| 1023 | return -EFAULT; | ||
| 1024 | } | ||
| 1025 | } while (pfn != BM_END_OF_MAP); | ||
| 720 | 1026 | ||
| 721 | unsafe_pages = 0; | 1027 | allocated_unsafe_pages = 0; |
| 722 | 1028 | ||
| 723 | return 0; | 1029 | return 0; |
| 724 | } | 1030 | } |
| 725 | 1031 | ||
| 726 | static void copy_page_backup_list(struct pbe *dst, struct pbe *src) | 1032 | static void |
| 1033 | duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) | ||
| 727 | { | 1034 | { |
| 728 | /* We assume both lists contain the same number of elements */ | 1035 | unsigned long pfn; |
| 729 | while (src) { | 1036 | |
| 730 | dst->orig_address = src->orig_address; | 1037 | memory_bm_position_reset(src); |
| 731 | dst = dst->next; | 1038 | pfn = memory_bm_next_pfn(src); |
| 732 | src = src->next; | 1039 | while (pfn != BM_END_OF_MAP) { |
| 1040 | memory_bm_set_bit(dst, pfn); | ||
| 1041 | pfn = memory_bm_next_pfn(src); | ||
| 733 | } | 1042 | } |
| 734 | } | 1043 | } |
| 735 | 1044 | ||
| 736 | static int check_header(struct swsusp_info *info) | 1045 | static inline int check_header(struct swsusp_info *info) |
| 737 | { | 1046 | { |
| 738 | char *reason = NULL; | 1047 | char *reason = NULL; |
| 739 | 1048 | ||
| @@ -760,19 +1069,14 @@ static int check_header(struct swsusp_info *info) | |||
| 760 | * load header - check the image header and copy data from it | 1069 | * load header - check the image header and copy data from it |
| 761 | */ | 1070 | */ |
| 762 | 1071 | ||
| 763 | static int load_header(struct snapshot_handle *handle, | 1072 | static int |
| 764 | struct swsusp_info *info) | 1073 | load_header(struct swsusp_info *info) |
| 765 | { | 1074 | { |
| 766 | int error; | 1075 | int error; |
| 767 | struct pbe *pblist; | ||
| 768 | 1076 | ||
| 1077 | restore_pblist = NULL; | ||
| 769 | error = check_header(info); | 1078 | error = check_header(info); |
| 770 | if (!error) { | 1079 | if (!error) { |
| 771 | pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0); | ||
| 772 | if (!pblist) | ||
| 773 | return -ENOMEM; | ||
| 774 | pagedir_nosave = pblist; | ||
| 775 | handle->pbe = pblist; | ||
| 776 | nr_copy_pages = info->image_pages; | 1080 | nr_copy_pages = info->image_pages; |
| 777 | nr_meta_pages = info->pages - info->image_pages - 1; | 1081 | nr_meta_pages = info->pages - info->image_pages - 1; |
| 778 | } | 1082 | } |
| @@ -780,113 +1084,137 @@ static int load_header(struct snapshot_handle *handle, | |||
| 780 | } | 1084 | } |
| 781 | 1085 | ||
| 782 | /** | 1086 | /** |
| 783 | * unpack_orig_addresses - copy the elements of @buf[] (1 page) to | 1087 | * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set |
| 784 | * the PBEs in the list starting at @pbe | 1088 | * the corresponding bit in the memory bitmap @bm |
| 785 | */ | 1089 | */ |
| 786 | 1090 | ||
| 787 | static inline struct pbe *unpack_orig_addresses(unsigned long *buf, | 1091 | static inline void |
| 788 | struct pbe *pbe) | 1092 | unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) |
| 789 | { | 1093 | { |
| 790 | int j; | 1094 | int j; |
| 791 | 1095 | ||
| 792 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { | 1096 | for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { |
| 793 | pbe->orig_address = buf[j]; | 1097 | if (unlikely(buf[j] == BM_END_OF_MAP)) |
| 794 | pbe = pbe->next; | 1098 | break; |
| 1099 | |||
| 1100 | memory_bm_set_bit(bm, buf[j]); | ||
| 795 | } | 1101 | } |
| 796 | return pbe; | ||
| 797 | } | 1102 | } |
| 798 | 1103 | ||
| 799 | /** | 1104 | /** |
| 800 | * prepare_image - use metadata contained in the PBE list | 1105 | * prepare_image - use the memory bitmap @bm to mark the pages that will |
| 801 | * pointed to by pagedir_nosave to mark the pages that will | 1106 | * be overwritten in the process of restoring the system memory state |
| 802 | * be overwritten in the process of restoring the system | 1107 | * from the suspend image ("unsafe" pages) and allocate memory for the |
| 803 | * memory state from the image ("unsafe" pages) and allocate | 1108 | * image. |
| 804 | * memory for the image | ||
| 805 | * | 1109 | * |
| 806 | * The idea is to allocate the PBE list first and then | 1110 | * The idea is to allocate a new memory bitmap first and then allocate |
| 807 | * allocate as many pages as it's needed for the image data, | 1111 | * as many pages as needed for the image data, but not to assign these |
| 808 | * but not to assign these pages to the PBEs initially. | 1112 | * pages to specific tasks initially. Instead, we just mark them as |
| 809 | * Instead, we just mark them as allocated and create a list | 1113 | * allocated and create a list of "safe" pages that will be used later. |
| 810 | * of "safe" which will be used later | ||
| 811 | */ | 1114 | */ |
| 812 | 1115 | ||
| 813 | struct safe_page { | 1116 | #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) |
| 814 | struct safe_page *next; | ||
| 815 | char padding[PAGE_SIZE - sizeof(void *)]; | ||
| 816 | }; | ||
| 817 | 1117 | ||
| 818 | static struct safe_page *safe_pages; | 1118 | static struct linked_page *safe_pages_list; |
| 819 | 1119 | ||
| 820 | static int prepare_image(struct snapshot_handle *handle) | 1120 | static int |
| 1121 | prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | ||
| 821 | { | 1122 | { |
| 822 | int error = 0; | 1123 | unsigned int nr_pages; |
| 823 | unsigned int nr_pages = nr_copy_pages; | 1124 | struct linked_page *sp_list, *lp; |
| 824 | struct pbe *p, *pblist = NULL; | 1125 | int error; |
| 825 | 1126 | ||
| 826 | p = pagedir_nosave; | 1127 | error = mark_unsafe_pages(bm); |
| 827 | error = mark_unsafe_pages(p); | 1128 | if (error) |
| 828 | if (!error) { | 1129 | goto Free; |
| 829 | pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); | 1130 | |
| 830 | if (pblist) | 1131 | error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE); |
| 831 | copy_page_backup_list(pblist, p); | 1132 | if (error) |
| 832 | free_pagedir(p, 0); | 1133 | goto Free; |
| 833 | if (!pblist) | 1134 | |
| 1135 | duplicate_memory_bitmap(new_bm, bm); | ||
| 1136 | memory_bm_free(bm, PG_UNSAFE_KEEP); | ||
| 1137 | /* Reserve some safe pages for potential later use. | ||
| 1138 | * | ||
| 1139 | * NOTE: This way we make sure there will be enough safe pages for the | ||
| 1140 | * chain_alloc() in get_buffer(). It is a bit wasteful, but | ||
| 1141 | * nr_copy_pages cannot be greater than 50% of the memory anyway. | ||
| 1142 | */ | ||
| 1143 | sp_list = NULL; | ||
| 1144 | /* nr_copy_pages cannot be less than allocated_unsafe_pages */ | ||
| 1145 | nr_pages = nr_copy_pages - allocated_unsafe_pages; | ||
| 1146 | nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); | ||
| 1147 | while (nr_pages > 0) { | ||
| 1148 | lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); | ||
| 1149 | if (!lp) { | ||
| 834 | error = -ENOMEM; | 1150 | error = -ENOMEM; |
| 1151 | goto Free; | ||
| 1152 | } | ||
| 1153 | lp->next = sp_list; | ||
| 1154 | sp_list = lp; | ||
| 1155 | nr_pages--; | ||
| 835 | } | 1156 | } |
| 836 | safe_pages = NULL; | 1157 | /* Preallocate memory for the image */ |
| 837 | if (!error && nr_pages > unsafe_pages) { | 1158 | safe_pages_list = NULL; |
| 838 | nr_pages -= unsafe_pages; | 1159 | nr_pages = nr_copy_pages - allocated_unsafe_pages; |
| 839 | while (nr_pages--) { | 1160 | while (nr_pages > 0) { |
| 840 | struct safe_page *ptr; | 1161 | lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); |
| 841 | 1162 | if (!lp) { | |
| 842 | ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); | 1163 | error = -ENOMEM; |
| 843 | if (!ptr) { | 1164 | goto Free; |
| 844 | error = -ENOMEM; | 1165 | } |
| 845 | break; | 1166 | if (!PageNosaveFree(virt_to_page(lp))) { |
| 846 | } | 1167 | /* The page is "safe", add it to the list */ |
| 847 | if (!PageNosaveFree(virt_to_page(ptr))) { | 1168 | lp->next = safe_pages_list; |
| 848 | /* The page is "safe", add it to the list */ | 1169 | safe_pages_list = lp; |
| 849 | ptr->next = safe_pages; | ||
| 850 | safe_pages = ptr; | ||
| 851 | } | ||
| 852 | /* Mark the page as allocated */ | ||
| 853 | SetPageNosave(virt_to_page(ptr)); | ||
| 854 | SetPageNosaveFree(virt_to_page(ptr)); | ||
| 855 | } | 1170 | } |
| 1171 | /* Mark the page as allocated */ | ||
| 1172 | SetPageNosave(virt_to_page(lp)); | ||
| 1173 | SetPageNosaveFree(virt_to_page(lp)); | ||
| 1174 | nr_pages--; | ||
| 856 | } | 1175 | } |
| 857 | if (!error) { | 1176 | /* Free the reserved safe pages so that chain_alloc() can use them */ |
| 858 | pagedir_nosave = pblist; | 1177 | while (sp_list) { |
| 859 | } else { | 1178 | lp = sp_list->next; |
| 860 | handle->pbe = NULL; | 1179 | free_image_page(sp_list, PG_UNSAFE_CLEAR); |
| 861 | swsusp_free(); | 1180 | sp_list = lp; |
| 862 | } | 1181 | } |
| 1182 | return 0; | ||
| 1183 | |||
| 1184 | Free: | ||
| 1185 | swsusp_free(); | ||
| 863 | return error; | 1186 | return error; |
| 864 | } | 1187 | } |
| 865 | 1188 | ||
| 866 | static void *get_buffer(struct snapshot_handle *handle) | 1189 | /** |
| 1190 | * get_buffer - compute the address that snapshot_write_next() should | ||
| 1191 | * set for its caller to write to. | ||
| 1192 | */ | ||
| 1193 | |||
| 1194 | static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | ||
| 867 | { | 1195 | { |
| 868 | struct pbe *pbe = handle->pbe, *last = handle->last_pbe; | 1196 | struct pbe *pbe; |
| 869 | struct page *page = virt_to_page(pbe->orig_address); | 1197 | struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); |
| 870 | 1198 | ||
| 871 | if (PageNosave(page) && PageNosaveFree(page)) { | 1199 | if (PageNosave(page) && PageNosaveFree(page)) |
| 872 | /* | 1200 | /* We have allocated the "original" page frame and we can |
| 873 | * We have allocated the "original" page frame and we can | 1201 | * use it directly to store the loaded page. |
| 874 | * use it directly to store the read page | ||
| 875 | */ | 1202 | */ |
| 876 | pbe->address = 0; | 1203 | return page_address(page); |
| 877 | if (last && last->next) | 1204 | |
| 878 | last->next = NULL; | 1205 | /* The "original" page frame has not been allocated and we have to |
| 879 | return (void *)pbe->orig_address; | 1206 | * use a "safe" page frame to store the loaded page. |
| 880 | } | ||
| 881 | /* | ||
| 882 | * The "original" page frame has not been allocated and we have to | ||
| 883 | * use a "safe" page frame to store the read page | ||
| 884 | */ | 1207 | */ |
| 885 | pbe->address = (unsigned long)safe_pages; | 1208 | pbe = chain_alloc(ca, sizeof(struct pbe)); |
| 886 | safe_pages = safe_pages->next; | 1209 | if (!pbe) { |
| 887 | if (last) | 1210 | swsusp_free(); |
| 888 | last->next = pbe; | 1211 | return NULL; |
| 889 | handle->last_pbe = pbe; | 1212 | } |
| 1213 | pbe->orig_address = (unsigned long)page_address(page); | ||
| 1214 | pbe->address = (unsigned long)safe_pages_list; | ||
| 1215 | safe_pages_list = safe_pages_list->next; | ||
| 1216 | pbe->next = restore_pblist; | ||
| 1217 | restore_pblist = pbe; | ||
| 890 | return (void *)pbe->address; | 1218 | return (void *)pbe->address; |
| 891 | } | 1219 | } |
| 892 | 1220 | ||
| @@ -914,46 +1242,60 @@ static void *get_buffer(struct snapshot_handle *handle) | |||
| 914 | 1242 | ||
| 915 | int snapshot_write_next(struct snapshot_handle *handle, size_t count) | 1243 | int snapshot_write_next(struct snapshot_handle *handle, size_t count) |
| 916 | { | 1244 | { |
| 1245 | static struct chain_allocator ca; | ||
| 917 | int error = 0; | 1246 | int error = 0; |
| 918 | 1247 | ||
| 919 | if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) | 1248 | /* Check if we have already loaded the entire image */ |
| 1249 | if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) | ||
| 920 | return 0; | 1250 | return 0; |
| 1251 | |||
| 921 | if (!buffer) { | 1252 | if (!buffer) { |
| 922 | /* This makes the buffer be freed by swsusp_free() */ | 1253 | /* This makes the buffer be freed by swsusp_free() */ |
| 923 | buffer = alloc_image_page(GFP_ATOMIC, 0); | 1254 | buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); |
| 924 | if (!buffer) | 1255 | if (!buffer) |
| 925 | return -ENOMEM; | 1256 | return -ENOMEM; |
| 926 | } | 1257 | } |
| 927 | if (!handle->offset) | 1258 | if (!handle->offset) |
| 928 | handle->buffer = buffer; | 1259 | handle->buffer = buffer; |
| 929 | if (handle->prev < handle->page) { | 1260 | handle->sync_read = 1; |
| 930 | if (!handle->prev) { | 1261 | if (handle->prev < handle->cur) { |
| 931 | error = load_header(handle, (struct swsusp_info *)buffer); | 1262 | if (handle->prev == 0) { |
| 1263 | error = load_header(buffer); | ||
| 1264 | if (error) | ||
| 1265 | return error; | ||
| 1266 | |||
| 1267 | error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); | ||
| 932 | if (error) | 1268 | if (error) |
| 933 | return error; | 1269 | return error; |
| 1270 | |||
| 934 | } else if (handle->prev <= nr_meta_pages) { | 1271 | } else if (handle->prev <= nr_meta_pages) { |
| 935 | handle->pbe = unpack_orig_addresses(buffer, handle->pbe); | 1272 | unpack_orig_pfns(buffer, ©_bm); |
| 936 | if (!handle->pbe) { | 1273 | if (handle->prev == nr_meta_pages) { |
| 937 | error = prepare_image(handle); | 1274 | error = prepare_image(&orig_bm, ©_bm); |
| 938 | if (error) | 1275 | if (error) |
| 939 | return error; | 1276 | return error; |
| 940 | handle->pbe = pagedir_nosave; | 1277 | |
| 941 | handle->last_pbe = NULL; | 1278 | chain_init(&ca, GFP_ATOMIC, PG_SAFE); |
| 942 | handle->buffer = get_buffer(handle); | 1279 | memory_bm_position_reset(&orig_bm); |
| 1280 | restore_pblist = NULL; | ||
| 1281 | handle->buffer = get_buffer(&orig_bm, &ca); | ||
| 1282 | handle->sync_read = 0; | ||
| 1283 | if (!handle->buffer) | ||
| 1284 | return -ENOMEM; | ||
| 943 | } | 1285 | } |
| 944 | } else { | 1286 | } else { |
| 945 | handle->pbe = handle->pbe->next; | 1287 | handle->buffer = get_buffer(&orig_bm, &ca); |
| 946 | handle->buffer = get_buffer(handle); | 1288 | handle->sync_read = 0; |
| 947 | } | 1289 | } |
| 948 | handle->prev = handle->page; | 1290 | handle->prev = handle->cur; |
| 949 | } | 1291 | } |
| 950 | handle->buf_offset = handle->page_offset; | 1292 | handle->buf_offset = handle->cur_offset; |
| 951 | if (handle->page_offset + count >= PAGE_SIZE) { | 1293 | if (handle->cur_offset + count >= PAGE_SIZE) { |
| 952 | count = PAGE_SIZE - handle->page_offset; | 1294 | count = PAGE_SIZE - handle->cur_offset; |
| 953 | handle->page_offset = 0; | 1295 | handle->cur_offset = 0; |
| 954 | handle->page++; | 1296 | handle->cur++; |
| 955 | } else { | 1297 | } else { |
| 956 | handle->page_offset += count; | 1298 | handle->cur_offset += count; |
| 957 | } | 1299 | } |
| 958 | handle->offset += count; | 1300 | handle->offset += count; |
| 959 | return count; | 1301 | return count; |
| @@ -961,6 +1303,13 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) | |||
| 961 | 1303 | ||
| 962 | int snapshot_image_loaded(struct snapshot_handle *handle) | 1304 | int snapshot_image_loaded(struct snapshot_handle *handle) |
| 963 | { | 1305 | { |
| 964 | return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || | 1306 | return !(!nr_copy_pages || |
| 965 | handle->page <= nr_meta_pages + nr_copy_pages); | 1307 | handle->cur <= nr_meta_pages + nr_copy_pages); |
| 1308 | } | ||
| 1309 | |||
| 1310 | void snapshot_free_unused_memory(struct snapshot_handle *handle) | ||
| 1311 | { | ||
| 1312 | /* Free only if we have loaded the image entirely */ | ||
| 1313 | if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) | ||
| 1314 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); | ||
| 966 | } | 1315 | } |
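
The rewritten restore path above stops pre-building one big pbe list and instead pulls struct pbe entries from a chain allocator backed by "safe" pages, linking them into restore_pblist as image pages arrive. A minimal user-space model of that kind of allocator, with hypothetical names (this is not the kernel code), might look like this:

/* Hand out small objects from a list of page-sized buffers, adding a new
 * buffer when the current one is exhausted.  Nothing is freed individually;
 * the whole chain is released at once. */
#include <stdlib.h>

#define CHAIN_PAGE_SIZE 4096

struct chain_page {
	struct chain_page *next;
	char data[];
};

struct chain_allocator {
	struct chain_page *head;	/* most recently added buffer */
	size_t used;			/* bytes used in head->data */
};

static void *chain_alloc(struct chain_allocator *ca, size_t size)
{
	void *p;

	if (!ca->head ||
	    ca->used + size > CHAIN_PAGE_SIZE - sizeof(struct chain_page)) {
		struct chain_page *page = malloc(CHAIN_PAGE_SIZE);

		if (!page)
			return NULL;
		page->next = ca->head;
		ca->head = page;
		ca->used = 0;
	}
	p = ca->head->data + ca->used;
	ca->used += size;
	return p;
}

static void chain_free_all(struct chain_allocator *ca)
{
	while (ca->head) {
		struct chain_page *next = ca->head->next;

		free(ca->head);
		ca->head = next;
	}
	ca->used = 0;
}

int main(void)
{
	struct chain_allocator ca = { NULL, 0 };
	int i;

	/* Many small records without one malloc() per record. */
	for (i = 0; i < 1000; i++) {
		unsigned long *rec = chain_alloc(&ca, sizeof(*rec));

		if (!rec)
			return 1;
		*rec = i;
	}
	chain_free_all(&ca);
	return 0;
}

The appeal of the pattern is that many small, never-individually-freed objects come out of a handful of page-sized buffers that can all be dropped in one pass, which is how swsusp_free() can discard the restore metadata wholesale.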
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 044b8e0c1025..9b2ee5344dee 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/device.h> | 22 | #include <linux/device.h> |
| 23 | #include <linux/buffer_head.h> | 23 | #include <linux/buffer_head.h> |
| 24 | #include <linux/bio.h> | 24 | #include <linux/bio.h> |
| 25 | #include <linux/blkdev.h> | ||
| 25 | #include <linux/swap.h> | 26 | #include <linux/swap.h> |
| 26 | #include <linux/swapops.h> | 27 | #include <linux/swapops.h> |
| 27 | #include <linux/pm.h> | 28 | #include <linux/pm.h> |
| @@ -49,18 +50,16 @@ static int mark_swapfiles(swp_entry_t start) | |||
| 49 | { | 50 | { |
| 50 | int error; | 51 | int error; |
| 51 | 52 | ||
| 52 | rw_swap_page_sync(READ, | 53 | rw_swap_page_sync(READ, swp_entry(root_swap, 0), |
| 53 | swp_entry(root_swap, 0), | 54 | virt_to_page((unsigned long)&swsusp_header), NULL); |
| 54 | virt_to_page((unsigned long)&swsusp_header)); | ||
| 55 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || | 55 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || |
| 56 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | 56 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { |
| 57 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | 57 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); |
| 58 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | 58 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); |
| 59 | swsusp_header.image = start; | 59 | swsusp_header.image = start; |
| 60 | error = rw_swap_page_sync(WRITE, | 60 | error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), |
| 61 | swp_entry(root_swap, 0), | 61 | virt_to_page((unsigned long)&swsusp_header), |
| 62 | virt_to_page((unsigned long) | 62 | NULL); |
| 63 | &swsusp_header)); | ||
| 64 | } else { | 63 | } else { |
| 65 | pr_debug("swsusp: Partition is not swap space.\n"); | 64 | pr_debug("swsusp: Partition is not swap space.\n"); |
| 66 | error = -ENODEV; | 65 | error = -ENODEV; |
| @@ -88,16 +87,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */ | |||
| 88 | * write_page - Write one page to given swap location. | 87 | * write_page - Write one page to given swap location. |
| 89 | * @buf: Address we're writing. | 88 | * @buf: Address we're writing. |
| 90 | * @offset: Offset of the swap page we're writing to. | 89 | * @offset: Offset of the swap page we're writing to. |
| 90 | * @bio_chain: Link the next write BIO here | ||
| 91 | */ | 91 | */ |
| 92 | 92 | ||
| 93 | static int write_page(void *buf, unsigned long offset) | 93 | static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) |
| 94 | { | 94 | { |
| 95 | swp_entry_t entry; | 95 | swp_entry_t entry; |
| 96 | int error = -ENOSPC; | 96 | int error = -ENOSPC; |
| 97 | 97 | ||
| 98 | if (offset) { | 98 | if (offset) { |
| 99 | struct page *page = virt_to_page(buf); | ||
| 100 | |||
| 101 | if (bio_chain) { | ||
| 102 | /* | ||
| 103 | * Whether or not we successfully allocated a copy page, | ||
| 104 | * we take a ref on the page here. It gets undone in | ||
| 105 | * wait_on_bio_chain(). | ||
| 106 | */ | ||
| 107 | struct page *page_copy; | ||
| 108 | page_copy = alloc_page(GFP_ATOMIC); | ||
| 109 | if (page_copy == NULL) { | ||
| 110 | WARN_ON_ONCE(1); | ||
| 111 | bio_chain = NULL; /* Go synchronous */ | ||
| 112 | get_page(page); | ||
| 113 | } else { | ||
| 114 | memcpy(page_address(page_copy), | ||
| 115 | page_address(page), PAGE_SIZE); | ||
| 116 | page = page_copy; | ||
| 117 | } | ||
| 118 | } | ||
| 99 | entry = swp_entry(root_swap, offset); | 119 | entry = swp_entry(root_swap, offset); |
| 100 | error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf)); | 120 | error = rw_swap_page_sync(WRITE, entry, page, bio_chain); |
| 101 | } | 121 | } |
| 102 | return error; | 122 | return error; |
| 103 | } | 123 | } |
| @@ -146,6 +166,26 @@ static void release_swap_writer(struct swap_map_handle *handle) | |||
| 146 | handle->bitmap = NULL; | 166 | handle->bitmap = NULL; |
| 147 | } | 167 | } |
| 148 | 168 | ||
| 169 | static void show_speed(struct timeval *start, struct timeval *stop, | ||
| 170 | unsigned nr_pages, char *msg) | ||
| 171 | { | ||
| 172 | s64 elapsed_centisecs64; | ||
| 173 | int centisecs; | ||
| 174 | int k; | ||
| 175 | int kps; | ||
| 176 | |||
| 177 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); | ||
| 178 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); | ||
| 179 | centisecs = elapsed_centisecs64; | ||
| 180 | if (centisecs == 0) | ||
| 181 | centisecs = 1; /* avoid div-by-zero */ | ||
| 182 | k = nr_pages * (PAGE_SIZE / 1024); | ||
| 183 | kps = (k * 100) / centisecs; | ||
| 184 | printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, | ||
| 185 | centisecs / 100, centisecs % 100, | ||
| 186 | kps / 1000, (kps % 1000) / 10); | ||
| 187 | } | ||
| 188 | |||
| 149 | static int get_swap_writer(struct swap_map_handle *handle) | 189 | static int get_swap_writer(struct swap_map_handle *handle) |
| 150 | { | 190 | { |
| 151 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); | 191 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); |
| @@ -165,37 +205,70 @@ static int get_swap_writer(struct swap_map_handle *handle) | |||
| 165 | return 0; | 205 | return 0; |
| 166 | } | 206 | } |
| 167 | 207 | ||
| 168 | static int swap_write_page(struct swap_map_handle *handle, void *buf) | 208 | static int wait_on_bio_chain(struct bio **bio_chain) |
| 169 | { | 209 | { |
| 170 | int error; | 210 | struct bio *bio; |
| 211 | struct bio *next_bio; | ||
| 212 | int ret = 0; | ||
| 213 | |||
| 214 | if (bio_chain == NULL) | ||
| 215 | return 0; | ||
| 216 | |||
| 217 | bio = *bio_chain; | ||
| 218 | if (bio == NULL) | ||
| 219 | return 0; | ||
| 220 | while (bio) { | ||
| 221 | struct page *page; | ||
| 222 | |||
| 223 | next_bio = bio->bi_private; | ||
| 224 | page = bio->bi_io_vec[0].bv_page; | ||
| 225 | wait_on_page_locked(page); | ||
| 226 | if (!PageUptodate(page) || PageError(page)) | ||
| 227 | ret = -EIO; | ||
| 228 | put_page(page); | ||
| 229 | bio_put(bio); | ||
| 230 | bio = next_bio; | ||
| 231 | } | ||
| 232 | *bio_chain = NULL; | ||
| 233 | return ret; | ||
| 234 | } | ||
| 235 | |||
| 236 | static int swap_write_page(struct swap_map_handle *handle, void *buf, | ||
| 237 | struct bio **bio_chain) | ||
| 238 | { | ||
| 239 | int error = 0; | ||
| 171 | unsigned long offset; | 240 | unsigned long offset; |
| 172 | 241 | ||
| 173 | if (!handle->cur) | 242 | if (!handle->cur) |
| 174 | return -EINVAL; | 243 | return -EINVAL; |
| 175 | offset = alloc_swap_page(root_swap, handle->bitmap); | 244 | offset = alloc_swap_page(root_swap, handle->bitmap); |
| 176 | error = write_page(buf, offset); | 245 | error = write_page(buf, offset, bio_chain); |
| 177 | if (error) | 246 | if (error) |
| 178 | return error; | 247 | return error; |
| 179 | handle->cur->entries[handle->k++] = offset; | 248 | handle->cur->entries[handle->k++] = offset; |
| 180 | if (handle->k >= MAP_PAGE_ENTRIES) { | 249 | if (handle->k >= MAP_PAGE_ENTRIES) { |
| 250 | error = wait_on_bio_chain(bio_chain); | ||
| 251 | if (error) | ||
| 252 | goto out; | ||
| 181 | offset = alloc_swap_page(root_swap, handle->bitmap); | 253 | offset = alloc_swap_page(root_swap, handle->bitmap); |
| 182 | if (!offset) | 254 | if (!offset) |
| 183 | return -ENOSPC; | 255 | return -ENOSPC; |
| 184 | handle->cur->next_swap = offset; | 256 | handle->cur->next_swap = offset; |
| 185 | error = write_page(handle->cur, handle->cur_swap); | 257 | error = write_page(handle->cur, handle->cur_swap, NULL); |
| 186 | if (error) | 258 | if (error) |
| 187 | return error; | 259 | goto out; |
| 188 | memset(handle->cur, 0, PAGE_SIZE); | 260 | memset(handle->cur, 0, PAGE_SIZE); |
| 189 | handle->cur_swap = offset; | 261 | handle->cur_swap = offset; |
| 190 | handle->k = 0; | 262 | handle->k = 0; |
| 191 | } | 263 | } |
| 192 | return 0; | 264 | out: |
| 265 | return error; | ||
| 193 | } | 266 | } |
| 194 | 267 | ||
| 195 | static int flush_swap_writer(struct swap_map_handle *handle) | 268 | static int flush_swap_writer(struct swap_map_handle *handle) |
| 196 | { | 269 | { |
| 197 | if (handle->cur && handle->cur_swap) | 270 | if (handle->cur && handle->cur_swap) |
| 198 | return write_page(handle->cur, handle->cur_swap); | 271 | return write_page(handle->cur, handle->cur_swap, NULL); |
| 199 | else | 272 | else |
| 200 | return -EINVAL; | 273 | return -EINVAL; |
| 201 | } | 274 | } |
| @@ -206,21 +279,29 @@ static int flush_swap_writer(struct swap_map_handle *handle) | |||
| 206 | 279 | ||
| 207 | static int save_image(struct swap_map_handle *handle, | 280 | static int save_image(struct swap_map_handle *handle, |
| 208 | struct snapshot_handle *snapshot, | 281 | struct snapshot_handle *snapshot, |
| 209 | unsigned int nr_pages) | 282 | unsigned int nr_to_write) |
| 210 | { | 283 | { |
| 211 | unsigned int m; | 284 | unsigned int m; |
| 212 | int ret; | 285 | int ret; |
| 213 | int error = 0; | 286 | int error = 0; |
| 287 | int nr_pages; | ||
| 288 | int err2; | ||
| 289 | struct bio *bio; | ||
| 290 | struct timeval start; | ||
| 291 | struct timeval stop; | ||
| 214 | 292 | ||
| 215 | printk("Saving image data pages (%u pages) ... ", nr_pages); | 293 | printk("Saving image data pages (%u pages) ... ", nr_to_write); |
| 216 | m = nr_pages / 100; | 294 | m = nr_to_write / 100; |
| 217 | if (!m) | 295 | if (!m) |
| 218 | m = 1; | 296 | m = 1; |
| 219 | nr_pages = 0; | 297 | nr_pages = 0; |
| 298 | bio = NULL; | ||
| 299 | do_gettimeofday(&start); | ||
| 220 | do { | 300 | do { |
| 221 | ret = snapshot_read_next(snapshot, PAGE_SIZE); | 301 | ret = snapshot_read_next(snapshot, PAGE_SIZE); |
| 222 | if (ret > 0) { | 302 | if (ret > 0) { |
| 223 | error = swap_write_page(handle, data_of(*snapshot)); | 303 | error = swap_write_page(handle, data_of(*snapshot), |
| 304 | &bio); | ||
| 224 | if (error) | 305 | if (error) |
| 225 | break; | 306 | break; |
| 226 | if (!(nr_pages % m)) | 307 | if (!(nr_pages % m)) |
| @@ -228,8 +309,13 @@ static int save_image(struct swap_map_handle *handle, | |||
| 228 | nr_pages++; | 309 | nr_pages++; |
| 229 | } | 310 | } |
| 230 | } while (ret > 0); | 311 | } while (ret > 0); |
| 312 | err2 = wait_on_bio_chain(&bio); | ||
| 313 | do_gettimeofday(&stop); | ||
| 314 | if (!error) | ||
| 315 | error = err2; | ||
| 231 | if (!error) | 316 | if (!error) |
| 232 | printk("\b\b\b\bdone\n"); | 317 | printk("\b\b\b\bdone\n"); |
| 318 | show_speed(&start, &stop, nr_to_write, "Wrote"); | ||
| 233 | return error; | 319 | return error; |
| 234 | } | 320 | } |
| 235 | 321 | ||
| @@ -245,8 +331,7 @@ static int enough_swap(unsigned int nr_pages) | |||
| 245 | unsigned int free_swap = count_swap_pages(root_swap, 1); | 331 | unsigned int free_swap = count_swap_pages(root_swap, 1); |
| 246 | 332 | ||
| 247 | pr_debug("swsusp: free swap pages: %u\n", free_swap); | 333 | pr_debug("swsusp: free swap pages: %u\n", free_swap); |
| 248 | return free_swap > (nr_pages + PAGES_FOR_IO + | 334 | return free_swap > nr_pages + PAGES_FOR_IO; |
| 249 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
| 250 | } | 335 | } |
| 251 | 336 | ||
| 252 | /** | 337 | /** |
| @@ -263,11 +348,11 @@ int swsusp_write(void) | |||
| 263 | struct swap_map_handle handle; | 348 | struct swap_map_handle handle; |
| 264 | struct snapshot_handle snapshot; | 349 | struct snapshot_handle snapshot; |
| 265 | struct swsusp_info *header; | 350 | struct swsusp_info *header; |
| 266 | unsigned long start; | ||
| 267 | int error; | 351 | int error; |
| 268 | 352 | ||
| 269 | if ((error = swsusp_swap_check())) { | 353 | if ((error = swsusp_swap_check())) { |
| 270 | printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); | 354 | printk(KERN_ERR "swsusp: Cannot find swap device, try " |
| 355 | "swapon -a.\n"); | ||
| 271 | return error; | 356 | return error; |
| 272 | } | 357 | } |
| 273 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | 358 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); |
| @@ -281,16 +366,17 @@ int swsusp_write(void) | |||
| 281 | } | 366 | } |
| 282 | error = get_swap_writer(&handle); | 367 | error = get_swap_writer(&handle); |
| 283 | if (!error) { | 368 | if (!error) { |
| 284 | start = handle.cur_swap; | 369 | unsigned long start = handle.cur_swap; |
| 285 | error = swap_write_page(&handle, header); | 370 | error = swap_write_page(&handle, header, NULL); |
| 286 | } | 371 | if (!error) |
| 287 | if (!error) | 372 | error = save_image(&handle, &snapshot, |
| 288 | error = save_image(&handle, &snapshot, header->pages - 1); | 373 | header->pages - 1); |
| 289 | if (!error) { | 374 | if (!error) { |
| 290 | flush_swap_writer(&handle); | 375 | flush_swap_writer(&handle); |
| 291 | printk("S"); | 376 | printk("S"); |
| 292 | error = mark_swapfiles(swp_entry(root_swap, start)); | 377 | error = mark_swapfiles(swp_entry(root_swap, start)); |
| 293 | printk("|\n"); | 378 | printk("|\n"); |
| 379 | } | ||
| 294 | } | 380 | } |
| 295 | if (error) | 381 | if (error) |
| 296 | free_all_swap_pages(root_swap, handle.bitmap); | 382 | free_all_swap_pages(root_swap, handle.bitmap); |
| @@ -298,25 +384,6 @@ int swsusp_write(void) | |||
| 298 | return error; | 384 | return error; |
| 299 | } | 385 | } |
| 300 | 386 | ||
| 301 | /* | ||
| 302 | * Using bio to read from swap. | ||
| 303 | * This code requires a bit more work than just using buffer heads | ||
| 304 | * but, it is the recommended way for 2.5/2.6. | ||
| 305 | * The following are to signal the beginning and end of I/O. Bios | ||
| 306 | * finish asynchronously, while we want them to happen synchronously. | ||
| 307 | * A simple atomic_t, and a wait loop take care of this problem. | ||
| 308 | */ | ||
| 309 | |||
| 310 | static atomic_t io_done = ATOMIC_INIT(0); | ||
| 311 | |||
| 312 | static int end_io(struct bio *bio, unsigned int num, int err) | ||
| 313 | { | ||
| 314 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | ||
| 315 | panic("I/O error reading memory image"); | ||
| 316 | atomic_set(&io_done, 0); | ||
| 317 | return 0; | ||
| 318 | } | ||
| 319 | |||
| 320 | static struct block_device *resume_bdev; | 387 | static struct block_device *resume_bdev; |
| 321 | 388 | ||
| 322 | /** | 389 | /** |
| @@ -324,15 +391,15 @@ static struct block_device *resume_bdev; | |||
| 324 | * @rw: READ or WRITE. | 391 | * @rw: READ or WRITE. |
| 325 | * @off physical offset of page. | 392 | * @off physical offset of page. |
| 326 | * @page: page we're reading or writing. | 393 | * @page: page we're reading or writing. |
| 394 | * @bio_chain: list of pending bios (for async reading) | ||

| 327 | * | 395 | * |
| 328 | * Straight from the textbook - allocate and initialize the bio. | 396 | * Straight from the textbook - allocate and initialize the bio. |
| 329 | * If we're writing, make sure the page is marked as dirty. | 397 | * If we're reading, make sure the page is marked as dirty. |
| 330 | * Then submit it and wait. | 398 | * Then submit it and, if @bio_chain == NULL, wait. |
| 331 | */ | 399 | */ |
| 332 | 400 | static int submit(int rw, pgoff_t page_off, struct page *page, | |
| 333 | static int submit(int rw, pgoff_t page_off, void *page) | 401 | struct bio **bio_chain) |
| 334 | { | 402 | { |
| 335 | int error = 0; | ||
| 336 | struct bio *bio; | 403 | struct bio *bio; |
| 337 | 404 | ||
| 338 | bio = bio_alloc(GFP_ATOMIC, 1); | 405 | bio = bio_alloc(GFP_ATOMIC, 1); |
| @@ -340,33 +407,40 @@ static int submit(int rw, pgoff_t page_off, void *page) | |||
| 340 | return -ENOMEM; | 407 | return -ENOMEM; |
| 341 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); | 408 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); |
| 342 | bio->bi_bdev = resume_bdev; | 409 | bio->bi_bdev = resume_bdev; |
| 343 | bio->bi_end_io = end_io; | 410 | bio->bi_end_io = end_swap_bio_read; |
| 344 | 411 | ||
| 345 | if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { | 412 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { |
| 346 | printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); | 413 | printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); |
| 347 | error = -EFAULT; | 414 | bio_put(bio); |
| 348 | goto Done; | 415 | return -EFAULT; |
| 349 | } | 416 | } |
| 350 | 417 | ||
| 351 | atomic_set(&io_done, 1); | 418 | lock_page(page); |
| 352 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | 419 | bio_get(bio); |
| 353 | while (atomic_read(&io_done)) | 420 | |
| 354 | yield(); | 421 | if (bio_chain == NULL) { |
| 355 | if (rw == READ) | 422 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); |
| 356 | bio_set_pages_dirty(bio); | 423 | wait_on_page_locked(page); |
| 357 | Done: | 424 | if (rw == READ) |
| 358 | bio_put(bio); | 425 | bio_set_pages_dirty(bio); |
| 359 | return error; | 426 | bio_put(bio); |
| 427 | } else { | ||
| 428 | get_page(page); | ||
| 429 | bio->bi_private = *bio_chain; | ||
| 430 | *bio_chain = bio; | ||
| 431 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
| 432 | } | ||
| 433 | return 0; | ||
| 360 | } | 434 | } |
| 361 | 435 | ||
| 362 | static int bio_read_page(pgoff_t page_off, void *page) | 436 | static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) |
| 363 | { | 437 | { |
| 364 | return submit(READ, page_off, page); | 438 | return submit(READ, page_off, virt_to_page(addr), bio_chain); |
| 365 | } | 439 | } |
| 366 | 440 | ||
| 367 | static int bio_write_page(pgoff_t page_off, void *page) | 441 | static int bio_write_page(pgoff_t page_off, void *addr) |
| 368 | { | 442 | { |
| 369 | return submit(WRITE, page_off, page); | 443 | return submit(WRITE, page_off, virt_to_page(addr), NULL); |
| 370 | } | 444 | } |
| 371 | 445 | ||
| 372 | /** | 446 | /** |
| @@ -391,7 +465,7 @@ static int get_swap_reader(struct swap_map_handle *handle, | |||
| 391 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | 465 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); |
| 392 | if (!handle->cur) | 466 | if (!handle->cur) |
| 393 | return -ENOMEM; | 467 | return -ENOMEM; |
| 394 | error = bio_read_page(swp_offset(start), handle->cur); | 468 | error = bio_read_page(swp_offset(start), handle->cur, NULL); |
| 395 | if (error) { | 469 | if (error) { |
| 396 | release_swap_reader(handle); | 470 | release_swap_reader(handle); |
| 397 | return error; | 471 | return error; |
| @@ -400,7 +474,8 @@ static int get_swap_reader(struct swap_map_handle *handle, | |||
| 400 | return 0; | 474 | return 0; |
| 401 | } | 475 | } |
| 402 | 476 | ||
| 403 | static int swap_read_page(struct swap_map_handle *handle, void *buf) | 477 | static int swap_read_page(struct swap_map_handle *handle, void *buf, |
| 478 | struct bio **bio_chain) | ||
| 404 | { | 479 | { |
| 405 | unsigned long offset; | 480 | unsigned long offset; |
| 406 | int error; | 481 | int error; |
| @@ -410,16 +485,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf) | |||
| 410 | offset = handle->cur->entries[handle->k]; | 485 | offset = handle->cur->entries[handle->k]; |
| 411 | if (!offset) | 486 | if (!offset) |
| 412 | return -EFAULT; | 487 | return -EFAULT; |
| 413 | error = bio_read_page(offset, buf); | 488 | error = bio_read_page(offset, buf, bio_chain); |
| 414 | if (error) | 489 | if (error) |
| 415 | return error; | 490 | return error; |
| 416 | if (++handle->k >= MAP_PAGE_ENTRIES) { | 491 | if (++handle->k >= MAP_PAGE_ENTRIES) { |
| 492 | error = wait_on_bio_chain(bio_chain); | ||
| 417 | handle->k = 0; | 493 | handle->k = 0; |
| 418 | offset = handle->cur->next_swap; | 494 | offset = handle->cur->next_swap; |
| 419 | if (!offset) | 495 | if (!offset) |
| 420 | release_swap_reader(handle); | 496 | release_swap_reader(handle); |
| 421 | else | 497 | else if (!error) |
| 422 | error = bio_read_page(offset, handle->cur); | 498 | error = bio_read_page(offset, handle->cur, NULL); |
| 423 | } | 499 | } |
| 424 | return error; | 500 | return error; |
| 425 | } | 501 | } |
| @@ -432,33 +508,49 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf) | |||
| 432 | 508 | ||
| 433 | static int load_image(struct swap_map_handle *handle, | 509 | static int load_image(struct swap_map_handle *handle, |
| 434 | struct snapshot_handle *snapshot, | 510 | struct snapshot_handle *snapshot, |
| 435 | unsigned int nr_pages) | 511 | unsigned int nr_to_read) |
| 436 | { | 512 | { |
| 437 | unsigned int m; | 513 | unsigned int m; |
| 438 | int ret; | ||
| 439 | int error = 0; | 514 | int error = 0; |
| 515 | struct timeval start; | ||
| 516 | struct timeval stop; | ||
| 517 | struct bio *bio; | ||
| 518 | int err2; | ||
| 519 | unsigned nr_pages; | ||
| 440 | 520 | ||
| 441 | printk("Loading image data pages (%u pages) ... ", nr_pages); | 521 | printk("Loading image data pages (%u pages) ... ", nr_to_read); |
| 442 | m = nr_pages / 100; | 522 | m = nr_to_read / 100; |
| 443 | if (!m) | 523 | if (!m) |
| 444 | m = 1; | 524 | m = 1; |
| 445 | nr_pages = 0; | 525 | nr_pages = 0; |
| 446 | do { | 526 | bio = NULL; |
| 447 | ret = snapshot_write_next(snapshot, PAGE_SIZE); | 527 | do_gettimeofday(&start); |
| 448 | if (ret > 0) { | 528 | for ( ; ; ) { |
| 449 | error = swap_read_page(handle, data_of(*snapshot)); | 529 | error = snapshot_write_next(snapshot, PAGE_SIZE); |
| 450 | if (error) | 530 | if (error <= 0) |
| 451 | break; | 531 | break; |
| 452 | if (!(nr_pages % m)) | 532 | error = swap_read_page(handle, data_of(*snapshot), &bio); |
| 453 | printk("\b\b\b\b%3d%%", nr_pages / m); | 533 | if (error) |
| 454 | nr_pages++; | 534 | break; |
| 455 | } | 535 | if (snapshot->sync_read) |
| 456 | } while (ret > 0); | 536 | error = wait_on_bio_chain(&bio); |
| 537 | if (error) | ||
| 538 | break; | ||
| 539 | if (!(nr_pages % m)) | ||
| 540 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
| 541 | nr_pages++; | ||
| 542 | } | ||
| 543 | err2 = wait_on_bio_chain(&bio); | ||
| 544 | do_gettimeofday(&stop); | ||
| 545 | if (!error) | ||
| 546 | error = err2; | ||
| 457 | if (!error) { | 547 | if (!error) { |
| 458 | printk("\b\b\b\bdone\n"); | 548 | printk("\b\b\b\bdone\n"); |
| 549 | snapshot_free_unused_memory(snapshot); | ||
| 459 | if (!snapshot_image_loaded(snapshot)) | 550 | if (!snapshot_image_loaded(snapshot)) |
| 460 | error = -ENODATA; | 551 | error = -ENODATA; |
| 461 | } | 552 | } |
| 553 | show_speed(&start, &stop, nr_to_read, "Read"); | ||
| 462 | return error; | 554 | return error; |
| 463 | } | 555 | } |
| 464 | 556 | ||
| @@ -481,7 +573,7 @@ int swsusp_read(void) | |||
| 481 | header = (struct swsusp_info *)data_of(snapshot); | 573 | header = (struct swsusp_info *)data_of(snapshot); |
| 482 | error = get_swap_reader(&handle, swsusp_header.image); | 574 | error = get_swap_reader(&handle, swsusp_header.image); |
| 483 | if (!error) | 575 | if (!error) |
| 484 | error = swap_read_page(&handle, header); | 576 | error = swap_read_page(&handle, header, NULL); |
| 485 | if (!error) | 577 | if (!error) |
| 486 | error = load_image(&handle, &snapshot, header->pages - 1); | 578 | error = load_image(&handle, &snapshot, header->pages - 1); |
| 487 | release_swap_reader(&handle); | 579 | release_swap_reader(&handle); |
| @@ -507,7 +599,7 @@ int swsusp_check(void) | |||
| 507 | if (!IS_ERR(resume_bdev)) { | 599 | if (!IS_ERR(resume_bdev)) { |
| 508 | set_blocksize(resume_bdev, PAGE_SIZE); | 600 | set_blocksize(resume_bdev, PAGE_SIZE); |
| 509 | memset(&swsusp_header, 0, sizeof(swsusp_header)); | 601 | memset(&swsusp_header, 0, sizeof(swsusp_header)); |
| 510 | if ((error = bio_read_page(0, &swsusp_header))) | 602 | if ((error = bio_read_page(0, &swsusp_header, NULL))) |
| 511 | return error; | 603 | return error; |
| 512 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | 604 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { |
| 513 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | 605 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); |
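
The swap I/O path above switches from fully synchronous page transfers to chained asynchronous bios: each submitted bio is linked to the previous one through bi_private, and wait_on_bio_chain() later walks the whole list, checking every page and dropping the extra references. A rough user-space model of that submit-then-drain pattern, with hypothetical names and threads standing in for the block layer, might be:

#include <pthread.h>
#include <stdlib.h>

struct req {
	struct req *next;	/* plays the role of bio->bi_private */
	pthread_t worker;
	int error;
};

static void *do_io(void *arg)
{
	struct req *r = arg;

	r->error = 0;		/* pretend the transfer succeeded */
	return NULL;
}

/* Submit one request and link it onto the caller's chain. */
static int submit(struct req **chain)
{
	struct req *r = calloc(1, sizeof(*r));

	if (!r)
		return -1;
	if (pthread_create(&r->worker, NULL, do_io, r)) {
		free(r);
		return -1;
	}
	r->next = *chain;	/* chain like bi_private in the patch */
	*chain = r;
	return 0;
}

/* Wait for every request on the chain, keeping the first error seen. */
static int wait_on_chain(struct req **chain)
{
	int err = 0;

	while (*chain) {
		struct req *r = *chain;

		*chain = r->next;
		pthread_join(r->worker, NULL);
		if (r->error)
			err = r->error;
		free(r);
	}
	return err;
}

int main(void)
{
	struct req *chain = NULL;
	int i;

	for (i = 0; i < 16; i++)
		if (submit(&chain))
			break;
	return wait_on_chain(&chain) ? 1 : 0;
}

The point is that the expensive wait happens once per map page (or once at the end of the image) instead of once per data page, which is also why save_image() and load_image() can now report a meaningful MB/s figure via show_speed().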
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f0ee4e7780d6..0b66659dc516 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
| @@ -62,6 +62,16 @@ unsigned long image_size = 500 * 1024 * 1024; | |||
| 62 | 62 | ||
| 63 | int in_suspend __nosavedata = 0; | 63 | int in_suspend __nosavedata = 0; |
| 64 | 64 | ||
| 65 | #ifdef CONFIG_HIGHMEM | ||
| 66 | unsigned int count_highmem_pages(void); | ||
| 67 | int save_highmem(void); | ||
| 68 | int restore_highmem(void); | ||
| 69 | #else | ||
| 70 | static inline int save_highmem(void) { return 0; } | ||
| 71 | static inline int restore_highmem(void) { return 0; } | ||
| 72 | static inline unsigned int count_highmem_pages(void) { return 0; } | ||
| 73 | #endif | ||
| 74 | |||
| 65 | /** | 75 | /** |
| 66 | * The following functions are used for tracing the allocated | 76 | * The following functions are used for tracing the allocated |
| 67 | * swap pages, so that they can be freed in case of an error. | 77 | * swap pages, so that they can be freed in case of an error. |
| @@ -182,15 +192,14 @@ int swsusp_shrink_memory(void) | |||
| 182 | 192 | ||
| 183 | printk("Shrinking memory... "); | 193 | printk("Shrinking memory... "); |
| 184 | do { | 194 | do { |
| 185 | size = 2 * count_special_pages(); | 195 | size = 2 * count_highmem_pages(); |
| 186 | size += size / 50 + count_data_pages(); | 196 | size += size / 50 + count_data_pages() + PAGES_FOR_IO; |
| 187 | size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + | ||
| 188 | PAGES_FOR_IO; | ||
| 189 | tmp = size; | 197 | tmp = size; |
| 190 | for_each_zone (zone) | 198 | for_each_zone (zone) |
| 191 | if (!is_highmem(zone) && populated_zone(zone)) { | 199 | if (!is_highmem(zone) && populated_zone(zone)) { |
| 192 | tmp -= zone->free_pages; | 200 | tmp -= zone->free_pages; |
| 193 | tmp += zone->lowmem_reserve[ZONE_NORMAL]; | 201 | tmp += zone->lowmem_reserve[ZONE_NORMAL]; |
| 202 | tmp += snapshot_additional_pages(zone); | ||
| 194 | } | 203 | } |
| 195 | if (tmp > 0) { | 204 | if (tmp > 0) { |
| 196 | tmp = __shrink_memory(tmp); | 205 | tmp = __shrink_memory(tmp); |
| @@ -226,7 +235,7 @@ int swsusp_suspend(void) | |||
| 226 | goto Enable_irqs; | 235 | goto Enable_irqs; |
| 227 | } | 236 | } |
| 228 | 237 | ||
| 229 | if ((error = save_special_mem())) { | 238 | if ((error = save_highmem())) { |
| 230 | printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); | 239 | printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); |
| 231 | goto Restore_highmem; | 240 | goto Restore_highmem; |
| 232 | } | 241 | } |
| @@ -237,7 +246,10 @@ int swsusp_suspend(void) | |||
| 237 | /* Restore control flow magically appears here */ | 246 | /* Restore control flow magically appears here */ |
| 238 | restore_processor_state(); | 247 | restore_processor_state(); |
| 239 | Restore_highmem: | 248 | Restore_highmem: |
| 240 | restore_special_mem(); | 249 | restore_highmem(); |
| 250 | /* NOTE: device_power_up() is just a resume() for devices | ||
| 251 | * that suspended with irqs off ... no overall powerup. | ||
| 252 | */ | ||
| 241 | device_power_up(); | 253 | device_power_up(); |
| 242 | Enable_irqs: | 254 | Enable_irqs: |
| 243 | local_irq_enable(); | 255 | local_irq_enable(); |
| @@ -247,8 +259,12 @@ Enable_irqs: | |||
| 247 | int swsusp_resume(void) | 259 | int swsusp_resume(void) |
| 248 | { | 260 | { |
| 249 | int error; | 261 | int error; |
| 262 | |||
| 250 | local_irq_disable(); | 263 | local_irq_disable(); |
| 251 | if (device_power_down(PMSG_FREEZE)) | 264 | /* NOTE: device_power_down() is just a suspend() with irqs off; |
| 265 | * it has no special "power things down" semantics | ||
| 266 | */ | ||
| 267 | if (device_power_down(PMSG_PRETHAW)) | ||
| 252 | printk(KERN_ERR "Some devices failed to power down, very bad\n"); | 268 | printk(KERN_ERR "Some devices failed to power down, very bad\n"); |
| 253 | /* We'll ignore saved state, but this gets preempt count (etc) right */ | 269 | /* We'll ignore saved state, but this gets preempt count (etc) right */ |
| 254 | save_processor_state(); | 270 | save_processor_state(); |
| @@ -263,7 +279,7 @@ int swsusp_resume(void) | |||
| 263 | */ | 279 | */ |
| 264 | swsusp_free(); | 280 | swsusp_free(); |
| 265 | restore_processor_state(); | 281 | restore_processor_state(); |
| 266 | restore_special_mem(); | 282 | restore_highmem(); |
| 267 | touch_softlockup_watchdog(); | 283 | touch_softlockup_watchdog(); |
| 268 | device_power_up(); | 284 | device_power_up(); |
| 269 | local_irq_enable(); | 285 | local_irq_enable(); |
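
swsusp_shrink_memory() above now sizes the image as roughly twice the highmem page count plus a 2% margin, the saveable data pages, and a reserve for the I/O path, then compares that with free low memory adjusted per zone by lowmem_reserve and snapshot_additional_pages(). A small stand-alone sketch of the arithmetic, using made-up numbers and a placeholder PAGES_FOR_IO value (not the kernel's), might be:

#include <stdio.h>

#define PAGES_FOR_IO 1024	/* hypothetical reserve for the I/O path */

int main(void)
{
	unsigned long highmem_pages = 10000;	/* example input */
	unsigned long data_pages    = 60000;	/* example input */
	unsigned long free_lowmem   = 50000;	/* example input */

	/* Two copies of every highmem page, 2% slack, the data pages
	 * themselves, and room for writing the image out. */
	unsigned long size = 2 * highmem_pages;

	size += size / 50 + data_pages + PAGES_FOR_IO;

	if ((long)(size - free_lowmem) > 0)
		printf("need to reclaim about %ld more pages\n",
		       (long)(size - free_lowmem));
	else
		printf("enough free memory already\n");
	return 0;
}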
diff --git a/kernel/power/user.c b/kernel/power/user.c index 3f1539fbe48a..72825c853cd7 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/swapops.h> | 19 | #include <linux/swapops.h> |
| 20 | #include <linux/pm.h> | 20 | #include <linux/pm.h> |
| 21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
| 22 | #include <linux/cpu.h> | ||
| 22 | 23 | ||
| 23 | #include <asm/uaccess.h> | 24 | #include <asm/uaccess.h> |
| 24 | 25 | ||
| @@ -139,12 +140,15 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 139 | if (data->frozen) | 140 | if (data->frozen) |
| 140 | break; | 141 | break; |
| 141 | down(&pm_sem); | 142 | down(&pm_sem); |
| 142 | disable_nonboot_cpus(); | 143 | error = disable_nonboot_cpus(); |
| 143 | if (freeze_processes()) { | 144 | if (!error) { |
| 144 | thaw_processes(); | 145 | error = freeze_processes(); |
| 145 | enable_nonboot_cpus(); | 146 | if (error) { |
| 146 | error = -EBUSY; | 147 | thaw_processes(); |
| 148 | error = -EBUSY; | ||
| 149 | } | ||
| 147 | } | 150 | } |
| 151 | enable_nonboot_cpus(); | ||
| 148 | up(&pm_sem); | 152 | up(&pm_sem); |
| 149 | if (!error) | 153 | if (!error) |
| 150 | data->frozen = 1; | 154 | data->frozen = 1; |
| @@ -189,9 +193,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 189 | error = -EPERM; | 193 | error = -EPERM; |
| 190 | break; | 194 | break; |
| 191 | } | 195 | } |
| 196 | snapshot_free_unused_memory(&data->handle); | ||
| 192 | down(&pm_sem); | 197 | down(&pm_sem); |
| 193 | pm_prepare_console(); | 198 | pm_prepare_console(); |
| 194 | error = device_suspend(PMSG_FREEZE); | 199 | error = device_suspend(PMSG_PRETHAW); |
| 195 | if (!error) { | 200 | if (!error) { |
| 196 | error = swsusp_resume(); | 201 | error = swsusp_resume(); |
| 197 | device_resume(); | 202 | device_resume(); |
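
The SNAPSHOT_FREEZE path above now checks the return value of disable_nonboot_cpus(), thaws processes again if freezing them fails, and re-enables the non-boot CPUs in every case before releasing pm_sem. That is the usual staged-setup-with-rollback shape; a compact user-space sketch of it, with hypothetical step names standing in for the kernel calls, could be:

#include <stdio.h>

static int step_a(void)  { return 0; }	/* e.g. disable_nonboot_cpus() */
static int step_b(void)  { return -1; }	/* e.g. freeze_processes(), failing here */
static void undo_b(void) { }		/* e.g. thaw_processes() */
static void undo_a(void) { }		/* e.g. enable_nonboot_cpus() */

static int do_freeze(void)
{
	int error = step_a();

	if (!error) {
		error = step_b();
		if (error) {
			undo_b();	/* roll back only the failed stage */
			error = -1;	/* -EBUSY in the patch */
		}
	}
	undo_a();	/* always undone, mirroring the new code above */
	return error;
}

int main(void)
{
	printf("do_freeze() = %d\n", do_freeze());
	return 0;
}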
diff --git a/kernel/printk.c b/kernel/printk.c index 19a955619294..771f5e861bcd 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -24,8 +24,8 @@ | |||
| 24 | #include <linux/console.h> | 24 | #include <linux/console.h> |
| 25 | #include <linux/init.h> | 25 | #include <linux/init.h> |
| 26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
| 27 | #include <linux/moduleparam.h> | ||
| 27 | #include <linux/interrupt.h> /* For in_interrupt() */ | 28 | #include <linux/interrupt.h> /* For in_interrupt() */ |
| 28 | #include <linux/config.h> | ||
| 29 | #include <linux/delay.h> | 29 | #include <linux/delay.h> |
| 30 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
| 31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
| @@ -52,7 +52,7 @@ int console_printk[4] = { | |||
| 52 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | 52 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ |
| 53 | }; | 53 | }; |
| 54 | 54 | ||
| 55 | EXPORT_SYMBOL(console_printk); | 55 | EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */ |
| 56 | 56 | ||
| 57 | /* | 57 | /* |
| 58 | * Low level drivers may need that to know if they can schedule in | 58 | * Low level drivers may need that to know if they can schedule in |
| @@ -327,7 +327,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end) | |||
| 327 | struct console *con; | 327 | struct console *con; |
| 328 | 328 | ||
| 329 | for (con = console_drivers; con; con = con->next) { | 329 | for (con = console_drivers; con; con = con->next) { |
| 330 | if ((con->flags & CON_ENABLED) && con->write) | 330 | if ((con->flags & CON_ENABLED) && con->write && |
| 331 | (cpu_online(smp_processor_id()) || | ||
| 332 | (con->flags & CON_ANYTIME))) | ||
| 331 | con->write(con, &LOG_BUF(start), end - start); | 333 | con->write(con, &LOG_BUF(start), end - start); |
| 332 | } | 334 | } |
| 333 | } | 335 | } |
| @@ -437,6 +439,7 @@ static int printk_time = 1; | |||
| 437 | #else | 439 | #else |
| 438 | static int printk_time = 0; | 440 | static int printk_time = 0; |
| 439 | #endif | 441 | #endif |
| 442 | module_param(printk_time, int, S_IRUGO | S_IWUSR); | ||
| 440 | 443 | ||
| 441 | static int __init printk_time_setup(char *str) | 444 | static int __init printk_time_setup(char *str) |
| 442 | { | 445 | { |
| @@ -453,6 +456,18 @@ __attribute__((weak)) unsigned long long printk_clock(void) | |||
| 453 | return sched_clock(); | 456 | return sched_clock(); |
| 454 | } | 457 | } |
| 455 | 458 | ||
| 459 | /* Check if we have any console registered that can be called early in boot. */ | ||
| 460 | static int have_callable_console(void) | ||
| 461 | { | ||
| 462 | struct console *con; | ||
| 463 | |||
| 464 | for (con = console_drivers; con; con = con->next) | ||
| 465 | if (con->flags & CON_ANYTIME) | ||
| 466 | return 1; | ||
| 467 | |||
| 468 | return 0; | ||
| 469 | } | ||
| 470 | |||
| 456 | /** | 471 | /** |
| 457 | * printk - print a kernel message | 472 | * printk - print a kernel message |
| 458 | * @fmt: format string | 473 | * @fmt: format string |
| @@ -503,7 +518,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 503 | zap_locks(); | 518 | zap_locks(); |
| 504 | 519 | ||
| 505 | /* This stops the holder of console_sem just where we want him */ | 520 | /* This stops the holder of console_sem just where we want him */ |
| 506 | spin_lock_irqsave(&logbuf_lock, flags); | 521 | local_irq_save(flags); |
| 522 | lockdep_off(); | ||
| 523 | spin_lock(&logbuf_lock); | ||
| 507 | printk_cpu = smp_processor_id(); | 524 | printk_cpu = smp_processor_id(); |
| 508 | 525 | ||
| 509 | /* Emit the output into the temporary buffer */ | 526 | /* Emit the output into the temporary buffer */ |
| @@ -566,27 +583,31 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 566 | log_level_unknown = 1; | 583 | log_level_unknown = 1; |
| 567 | } | 584 | } |
| 568 | 585 | ||
| 569 | if (!cpu_online(smp_processor_id())) { | 586 | if (!down_trylock(&console_sem)) { |
| 570 | /* | 587 | /* |
| 571 | * Some console drivers may assume that per-cpu resources have | 588 | * We own the drivers. We can drop the spinlock and |
| 572 | * been allocated. So don't allow them to be called by this | 589 | * let release_console_sem() print the text, maybe ... |
| 573 | * CPU until it is officially up. We shouldn't be calling into | ||
| 574 | * random console drivers on a CPU which doesn't exist yet.. | ||
| 575 | */ | 590 | */ |
| 576 | printk_cpu = UINT_MAX; | ||
| 577 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 578 | goto out; | ||
| 579 | } | ||
| 580 | if (!down_trylock(&console_sem)) { | ||
| 581 | console_locked = 1; | 591 | console_locked = 1; |
| 592 | printk_cpu = UINT_MAX; | ||
| 593 | spin_unlock(&logbuf_lock); | ||
| 594 | |||
| 582 | /* | 595 | /* |
| 583 | * We own the drivers. We can drop the spinlock and let | 596 | * Console drivers may assume that per-cpu resources have |
| 584 | * release_console_sem() print the text | 597 | * been allocated. So unless they're explicitly marked as |
| 598 | * being able to cope (CON_ANYTIME) don't call them until | ||
| 599 | * this CPU is officially up. | ||
| 585 | */ | 600 | */ |
| 586 | printk_cpu = UINT_MAX; | 601 | if (cpu_online(smp_processor_id()) || have_callable_console()) { |
| 587 | spin_unlock_irqrestore(&logbuf_lock, flags); | 602 | console_may_schedule = 0; |
| 588 | console_may_schedule = 0; | 603 | release_console_sem(); |
| 589 | release_console_sem(); | 604 | } else { |
| 605 | /* Release by hand to avoid flushing the buffer. */ | ||
| 606 | console_locked = 0; | ||
| 607 | up(&console_sem); | ||
| 608 | } | ||
| 609 | lockdep_on(); | ||
| 610 | local_irq_restore(flags); | ||
| 590 | } else { | 611 | } else { |
| 591 | /* | 612 | /* |
| 592 | * Someone else owns the drivers. We drop the spinlock, which | 613 | * Someone else owns the drivers. We drop the spinlock, which |
| @@ -594,9 +615,11 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 594 | * console drivers with the output which we just produced. | 615 | * console drivers with the output which we just produced. |
| 595 | */ | 616 | */ |
| 596 | printk_cpu = UINT_MAX; | 617 | printk_cpu = UINT_MAX; |
| 597 | spin_unlock_irqrestore(&logbuf_lock, flags); | 618 | spin_unlock(&logbuf_lock); |
| 619 | lockdep_on(); | ||
| 620 | local_irq_restore(flags); | ||
| 598 | } | 621 | } |
| 599 | out: | 622 | |
| 600 | preempt_enable(); | 623 | preempt_enable(); |
| 601 | return printed_len; | 624 | return printed_len; |
| 602 | } | 625 | } |
| @@ -698,6 +721,7 @@ int __init add_preferred_console(char *name, int idx, char *options) | |||
| 698 | return 0; | 721 | return 0; |
| 699 | } | 722 | } |
| 700 | 723 | ||
| 724 | #ifndef CONFIG_DISABLE_CONSOLE_SUSPEND | ||
| 701 | /** | 725 | /** |
| 702 | * suspend_console - suspend the console subsystem | 726 | * suspend_console - suspend the console subsystem |
| 703 | * | 727 | * |
| @@ -705,6 +729,7 @@ int __init add_preferred_console(char *name, int idx, char *options) | |||
| 705 | */ | 729 | */ |
| 706 | void suspend_console(void) | 730 | void suspend_console(void) |
| 707 | { | 731 | { |
| 732 | printk("Suspending console(s)\n"); | ||
| 708 | acquire_console_sem(); | 733 | acquire_console_sem(); |
| 709 | console_suspended = 1; | 734 | console_suspended = 1; |
| 710 | } | 735 | } |
| @@ -714,6 +739,7 @@ void resume_console(void) | |||
| 714 | console_suspended = 0; | 739 | console_suspended = 0; |
| 715 | release_console_sem(); | 740 | release_console_sem(); |
| 716 | } | 741 | } |
| 742 | #endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */ | ||
| 717 | 743 | ||
| 718 | /** | 744 | /** |
| 719 | * acquire_console_sem - lock the console system for exclusive use. | 745 | * acquire_console_sem - lock the console system for exclusive use. |
| @@ -750,7 +776,7 @@ int is_console_locked(void) | |||
| 750 | { | 776 | { |
| 751 | return console_locked; | 777 | return console_locked; |
| 752 | } | 778 | } |
| 753 | EXPORT_SYMBOL(is_console_locked); | 779 | EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */ |
| 754 | 780 | ||
| 755 | /** | 781 | /** |
| 756 | * release_console_sem - unlock the console system | 782 | * release_console_sem - unlock the console system |
| @@ -776,6 +802,9 @@ void release_console_sem(void) | |||
| 776 | up(&secondary_console_sem); | 802 | up(&secondary_console_sem); |
| 777 | return; | 803 | return; |
| 778 | } | 804 | } |
| 805 | |||
| 806 | console_may_schedule = 0; | ||
| 807 | |||
| 779 | for ( ; ; ) { | 808 | for ( ; ; ) { |
| 780 | spin_lock_irqsave(&logbuf_lock, flags); | 809 | spin_lock_irqsave(&logbuf_lock, flags); |
| 781 | wake_klogd |= log_start - log_end; | 810 | wake_klogd |= log_start - log_end; |
| @@ -789,11 +818,17 @@ void release_console_sem(void) | |||
| 789 | local_irq_restore(flags); | 818 | local_irq_restore(flags); |
| 790 | } | 819 | } |
| 791 | console_locked = 0; | 820 | console_locked = 0; |
| 792 | console_may_schedule = 0; | ||
| 793 | up(&console_sem); | 821 | up(&console_sem); |
| 794 | spin_unlock_irqrestore(&logbuf_lock, flags); | 822 | spin_unlock_irqrestore(&logbuf_lock, flags); |
| 795 | if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) | 823 | if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) { |
| 796 | wake_up_interruptible(&log_wait); | 824 | /* |
| 825 | * If we printk from within the lock dependency code, | ||
| 826 | * from within the scheduler code, then do not lock | ||
| 827 | * up due to self-recursion: | ||
| 828 | */ | ||
| 829 | if (!lockdep_internal()) | ||
| 830 | wake_up_interruptible(&log_wait); | ||
| 831 | } | ||
| 797 | } | 832 | } |
| 798 | EXPORT_SYMBOL(release_console_sem); | 833 | EXPORT_SYMBOL(release_console_sem); |
| 799 | 834 | ||
diff --git a/kernel/profile.c b/kernel/profile.c index 68afe121e507..fb660c7d35ba 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -13,7 +13,6 @@ | |||
| 13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 | 13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 |
| 14 | */ | 14 | */ |
| 15 | 15 | ||
| 16 | #include <linux/config.h> | ||
| 17 | #include <linux/module.h> | 16 | #include <linux/module.h> |
| 18 | #include <linux/profile.h> | 17 | #include <linux/profile.h> |
| 19 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
| @@ -299,7 +298,7 @@ out: | |||
| 299 | } | 298 | } |
| 300 | 299 | ||
| 301 | #ifdef CONFIG_HOTPLUG_CPU | 300 | #ifdef CONFIG_HOTPLUG_CPU |
| 302 | static int profile_cpu_callback(struct notifier_block *info, | 301 | static int __devinit profile_cpu_callback(struct notifier_block *info, |
| 303 | unsigned long action, void *__cpu) | 302 | unsigned long action, void *__cpu) |
| 304 | { | 303 | { |
| 305 | int node, cpu = (unsigned long)__cpu; | 304 | int node, cpu = (unsigned long)__cpu; |
| @@ -310,13 +309,17 @@ static int profile_cpu_callback(struct notifier_block *info, | |||
| 310 | node = cpu_to_node(cpu); | 309 | node = cpu_to_node(cpu); |
| 311 | per_cpu(cpu_profile_flip, cpu) = 0; | 310 | per_cpu(cpu_profile_flip, cpu) = 0; |
| 312 | if (!per_cpu(cpu_profile_hits, cpu)[1]) { | 311 | if (!per_cpu(cpu_profile_hits, cpu)[1]) { |
| 313 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | 312 | page = alloc_pages_node(node, |
| 313 | GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, | ||
| 314 | 0); | ||
| 314 | if (!page) | 315 | if (!page) |
| 315 | return NOTIFY_BAD; | 316 | return NOTIFY_BAD; |
| 316 | per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); | 317 | per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); |
| 317 | } | 318 | } |
| 318 | if (!per_cpu(cpu_profile_hits, cpu)[0]) { | 319 | if (!per_cpu(cpu_profile_hits, cpu)[0]) { |
| 319 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | 320 | page = alloc_pages_node(node, |
| 321 | GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, | ||
| 322 | 0); | ||
| 320 | if (!page) | 323 | if (!page) |
| 321 | goto out_free; | 324 | goto out_free; |
| 322 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); | 325 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); |
| @@ -492,12 +495,16 @@ static int __init create_hash_tables(void) | |||
| 492 | int node = cpu_to_node(cpu); | 495 | int node = cpu_to_node(cpu); |
| 493 | struct page *page; | 496 | struct page *page; |
| 494 | 497 | ||
| 495 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | 498 | page = alloc_pages_node(node, |
| 499 | GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, | ||
| 500 | 0); | ||
| 496 | if (!page) | 501 | if (!page) |
| 497 | goto out_cleanup; | 502 | goto out_cleanup; |
| 498 | per_cpu(cpu_profile_hits, cpu)[1] | 503 | per_cpu(cpu_profile_hits, cpu)[1] |
| 499 | = (struct profile_hit *)page_address(page); | 504 | = (struct profile_hit *)page_address(page); |
| 500 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | 505 | page = alloc_pages_node(node, |
| 506 | GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, | ||
| 507 | 0); | ||
| 501 | if (!page) | 508 | if (!page) |
| 502 | goto out_cleanup; | 509 | goto out_cleanup; |
| 503 | per_cpu(cpu_profile_hits, cpu)[0] | 510 | per_cpu(cpu_profile_hits, cpu)[0] |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 921c22ad16e4..4d50e06fd745 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -28,7 +28,7 @@ | |||
| 28 | * | 28 | * |
| 29 | * Must be called with the tasklist lock write-held. | 29 | * Must be called with the tasklist lock write-held. |
| 30 | */ | 30 | */ |
| 31 | void __ptrace_link(task_t *child, task_t *new_parent) | 31 | void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) |
| 32 | { | 32 | { |
| 33 | BUG_ON(!list_empty(&child->ptrace_list)); | 33 | BUG_ON(!list_empty(&child->ptrace_list)); |
| 34 | if (child->parent == new_parent) | 34 | if (child->parent == new_parent) |
| @@ -46,7 +46,7 @@ void __ptrace_link(task_t *child, task_t *new_parent) | |||
| 46 | * TASK_TRACED, resume it now. | 46 | * TASK_TRACED, resume it now. |
| 47 | * Requires that irqs be disabled. | 47 | * Requires that irqs be disabled. |
| 48 | */ | 48 | */ |
| 49 | void ptrace_untrace(task_t *child) | 49 | void ptrace_untrace(struct task_struct *child) |
| 50 | { | 50 | { |
| 51 | spin_lock(&child->sighand->siglock); | 51 | spin_lock(&child->sighand->siglock); |
| 52 | if (child->state == TASK_TRACED) { | 52 | if (child->state == TASK_TRACED) { |
| @@ -65,7 +65,7 @@ void ptrace_untrace(task_t *child) | |||
| 65 | * | 65 | * |
| 66 | * Must be called with the tasklist lock write-held. | 66 | * Must be called with the tasklist lock write-held. |
| 67 | */ | 67 | */ |
| 68 | void __ptrace_unlink(task_t *child) | 68 | void __ptrace_unlink(struct task_struct *child) |
| 69 | { | 69 | { |
| 70 | BUG_ON(!child->ptrace); | 70 | BUG_ON(!child->ptrace); |
| 71 | 71 | ||
| @@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
| 120 | 120 | ||
| 121 | static int may_attach(struct task_struct *task) | 121 | static int may_attach(struct task_struct *task) |
| 122 | { | 122 | { |
| 123 | if (!task->mm) | 123 | /* May we inspect the given task? |
| 124 | return -EPERM; | 124 | * This check is used both for attaching with ptrace |
| 125 | * and for allowing access to sensitive information in /proc. | ||
| 126 | * | ||
| 127 | * ptrace_attach denies several cases that /proc allows | ||
| 128 | * because setting up the necessary parent/child relationship | ||
| 129 | * or halting the specified task is impossible. | ||
| 130 | */ | ||
| 131 | int dumpable = 0; | ||
| 132 | /* Don't let security modules deny introspection */ | ||
| 133 | if (task == current) | ||
| 134 | return 0; | ||
| 125 | if (((current->uid != task->euid) || | 135 | if (((current->uid != task->euid) || |
| 126 | (current->uid != task->suid) || | 136 | (current->uid != task->suid) || |
| 127 | (current->uid != task->uid) || | 137 | (current->uid != task->uid) || |
| @@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task) | |||
| 130 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) | 140 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) |
| 131 | return -EPERM; | 141 | return -EPERM; |
| 132 | smp_rmb(); | 142 | smp_rmb(); |
| 133 | if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) | 143 | if (task->mm) |
| 144 | dumpable = task->mm->dumpable; | ||
| 145 | if (!dumpable && !capable(CAP_SYS_PTRACE)) | ||
| 134 | return -EPERM; | 146 | return -EPERM; |
| 135 | 147 | ||
| 136 | return security_ptrace(current, task); | 148 | return security_ptrace(current, task); |
| @@ -176,6 +188,8 @@ repeat: | |||
| 176 | goto repeat; | 188 | goto repeat; |
| 177 | } | 189 | } |
| 178 | 190 | ||
| 191 | if (!task->mm) | ||
| 192 | goto bad; | ||
| 179 | /* the same process cannot be attached many times */ | 193 | /* the same process cannot be attached many times */ |
| 180 | if (task->ptrace & PT_PTRACED) | 194 | if (task->ptrace & PT_PTRACED) |
| 181 | goto bad; | 195 | goto bad; |
| @@ -200,7 +214,7 @@ out: | |||
| 200 | return retval; | 214 | return retval; |
| 201 | } | 215 | } |
| 202 | 216 | ||
| 203 | void __ptrace_detach(struct task_struct *child, unsigned int data) | 217 | static inline void __ptrace_detach(struct task_struct *child, unsigned int data) |
| 204 | { | 218 | { |
| 205 | child->exit_code = data; | 219 | child->exit_code = data; |
| 206 | /* .. re-parent .. */ | 220 | /* .. re-parent .. */ |
| @@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
| 219 | ptrace_disable(child); | 233 | ptrace_disable(child); |
| 220 | 234 | ||
| 221 | write_lock_irq(&tasklist_lock); | 235 | write_lock_irq(&tasklist_lock); |
| 236 | /* protect against de_thread()->release_task() */ | ||
| 222 | if (child->ptrace) | 237 | if (child->ptrace) |
| 223 | __ptrace_detach(child, data); | 238 | __ptrace_detach(child, data); |
| 224 | write_unlock_irq(&tasklist_lock); | 239 | write_unlock_irq(&tasklist_lock); |
| @@ -226,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
| 226 | return 0; | 241 | return 0; |
| 227 | } | 242 | } |
| 228 | 243 | ||
| 229 | /* | ||
| 230 | * Access another process' address space. | ||
| 231 | * Source/target buffer must be kernel space, | ||
| 232 | * Do not walk the page table directly, use get_user_pages | ||
| 233 | */ | ||
| 234 | |||
| 235 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | ||
| 236 | { | ||
| 237 | struct mm_struct *mm; | ||
| 238 | struct vm_area_struct *vma; | ||
| 239 | struct page *page; | ||
| 240 | void *old_buf = buf; | ||
| 241 | |||
| 242 | mm = get_task_mm(tsk); | ||
| 243 | if (!mm) | ||
| 244 | return 0; | ||
| 245 | |||
| 246 | down_read(&mm->mmap_sem); | ||
| 247 | /* ignore errors, just check how much was sucessfully transfered */ | ||
| 248 | while (len) { | ||
| 249 | int bytes, ret, offset; | ||
| 250 | void *maddr; | ||
| 251 | |||
| 252 | ret = get_user_pages(tsk, mm, addr, 1, | ||
| 253 | write, 1, &page, &vma); | ||
| 254 | if (ret <= 0) | ||
| 255 | break; | ||
| 256 | |||
| 257 | bytes = len; | ||
| 258 | offset = addr & (PAGE_SIZE-1); | ||
| 259 | if (bytes > PAGE_SIZE-offset) | ||
| 260 | bytes = PAGE_SIZE-offset; | ||
| 261 | |||
| 262 | maddr = kmap(page); | ||
| 263 | if (write) { | ||
| 264 | copy_to_user_page(vma, page, addr, | ||
| 265 | maddr + offset, buf, bytes); | ||
| 266 | set_page_dirty_lock(page); | ||
| 267 | } else { | ||
| 268 | copy_from_user_page(vma, page, addr, | ||
| 269 | buf, maddr + offset, bytes); | ||
| 270 | } | ||
| 271 | kunmap(page); | ||
| 272 | page_cache_release(page); | ||
| 273 | len -= bytes; | ||
| 274 | buf += bytes; | ||
| 275 | addr += bytes; | ||
| 276 | } | ||
| 277 | up_read(&mm->mmap_sem); | ||
| 278 | mmput(mm); | ||
| 279 | |||
| 280 | return buf - old_buf; | ||
| 281 | } | ||
| 282 | |||
| 283 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) | 244 | int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) |
| 284 | { | 245 | { |
| 285 | int copied = 0; | 246 | int copied = 0; |
| @@ -479,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) | |||
| 479 | child = find_task_by_pid(pid); | 440 | child = find_task_by_pid(pid); |
| 480 | if (child) | 441 | if (child) |
| 481 | get_task_struct(child); | 442 | get_task_struct(child); |
| 443 | |||
| 482 | read_unlock(&tasklist_lock); | 444 | read_unlock(&tasklist_lock); |
| 483 | if (!child) | 445 | if (!child) |
| 484 | return ERR_PTR(-ESRCH); | 446 | return ERR_PTR(-ESRCH); |
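
The may_attach() change above treats a task without an mm (typically a kernel thread) as simply non-dumpable rather than refusing outright, and always allows a task to inspect itself; actually attaching still requires an mm, which the attach path now checks separately. A small sketch of that permission shape, with hypothetical user-space stand-ins for the kernel structures, could be:

#include <stdio.h>

struct task {
	int is_self;
	int has_mm;
	int dumpable;
};

static int may_inspect(const struct task *t, int has_cap_sys_ptrace)
{
	int dumpable = 0;

	if (t->is_self)
		return 0;		/* always allow introspection */
	if (t->has_mm)
		dumpable = t->dumpable;
	if (!dumpable && !has_cap_sys_ptrace)
		return -1;		/* -EPERM in the kernel */
	return 0;
}

int main(void)
{
	struct task kthread = { 0, 0, 0 };

	printf("inspect kernel thread without CAP_SYS_PTRACE: %d\n",
	       may_inspect(&kthread, 0));
	return 0;
}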
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 20e9710fc21c..523e46483b99 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -53,13 +53,13 @@ | |||
| 53 | static struct rcu_ctrlblk rcu_ctrlblk = { | 53 | static struct rcu_ctrlblk rcu_ctrlblk = { |
| 54 | .cur = -300, | 54 | .cur = -300, |
| 55 | .completed = -300, | 55 | .completed = -300, |
| 56 | .lock = SPIN_LOCK_UNLOCKED, | 56 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), |
| 57 | .cpumask = CPU_MASK_NONE, | 57 | .cpumask = CPU_MASK_NONE, |
| 58 | }; | 58 | }; |
| 59 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | 59 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { |
| 60 | .cur = -300, | 60 | .cur = -300, |
| 61 | .completed = -300, | 61 | .completed = -300, |
| 62 | .lock = SPIN_LOCK_UNLOCKED, | 62 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), |
| 63 | .cpumask = CPU_MASK_NONE, | 63 | .cpumask = CPU_MASK_NONE, |
| 64 | }; | 64 | }; |
| 65 | 65 | ||
| @@ -182,6 +182,15 @@ long rcu_batches_completed(void) | |||
| 182 | return rcu_ctrlblk.completed; | 182 | return rcu_ctrlblk.completed; |
| 183 | } | 183 | } |
| 184 | 184 | ||
| 185 | /* | ||
| 186 | * Return the number of RCU batches processed thus far. Useful | ||
| 187 | * for debug and statistics. | ||
| 188 | */ | ||
| 189 | long rcu_batches_completed_bh(void) | ||
| 190 | { | ||
| 191 | return rcu_bh_ctrlblk.completed; | ||
| 192 | } | ||
| 193 | |||
| 185 | static void rcu_barrier_callback(struct rcu_head *notused) | 194 | static void rcu_barrier_callback(struct rcu_head *notused) |
| 186 | { | 195 | { |
| 187 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 196 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
| @@ -232,12 +241,16 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
| 232 | next = rdp->donelist = list->next; | 241 | next = rdp->donelist = list->next; |
| 233 | list->func(list); | 242 | list->func(list); |
| 234 | list = next; | 243 | list = next; |
| 235 | rdp->qlen--; | ||
| 236 | if (++count >= rdp->blimit) | 244 | if (++count >= rdp->blimit) |
| 237 | break; | 245 | break; |
| 238 | } | 246 | } |
| 247 | |||
| 248 | local_irq_disable(); | ||
| 249 | rdp->qlen -= count; | ||
| 250 | local_irq_enable(); | ||
| 239 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | 251 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) |
| 240 | rdp->blimit = blimit; | 252 | rdp->blimit = blimit; |
| 253 | |||
| 241 | if (!rdp->donelist) | 254 | if (!rdp->donelist) |
| 242 | rdp->donetail = &rdp->donelist; | 255 | rdp->donetail = &rdp->donelist; |
| 243 | else | 256 | else |
| @@ -539,7 +552,7 @@ static void __devinit rcu_online_cpu(int cpu) | |||
| 539 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | 552 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); |
| 540 | } | 553 | } |
| 541 | 554 | ||
| 542 | static int rcu_cpu_notify(struct notifier_block *self, | 555 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, |
| 543 | unsigned long action, void *hcpu) | 556 | unsigned long action, void *hcpu) |
| 544 | { | 557 | { |
| 545 | long cpu = (long)hcpu; | 558 | long cpu = (long)hcpu; |
| @@ -556,7 +569,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 556 | return NOTIFY_OK; | 569 | return NOTIFY_OK; |
| 557 | } | 570 | } |
| 558 | 571 | ||
| 559 | static struct notifier_block rcu_nb = { | 572 | static struct notifier_block __cpuinitdata rcu_nb = { |
| 560 | .notifier_call = rcu_cpu_notify, | 573 | .notifier_call = rcu_cpu_notify, |
| 561 | }; | 574 | }; |
| 562 | 575 | ||
| @@ -619,6 +632,7 @@ module_param(qlowmark, int, 0); | |||
| 619 | module_param(rsinterval, int, 0); | 632 | module_param(rsinterval, int, 0); |
| 620 | #endif | 633 | #endif |
| 621 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 634 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
| 635 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
| 622 | EXPORT_SYMBOL_GPL(call_rcu); | 636 | EXPORT_SYMBOL_GPL(call_rcu); |
| 623 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 637 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
| 624 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 638 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
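
rcu_batches_completed_bh() gives the bottom-half flavor the same debug/statistics counter that rcu_batches_completed() already provides, and it is what the rcutorture ops vector below plugs into ->completed. A hedged sketch of the snapshot-and-compare pattern such a counter enables; the helper is illustrative only:

#include <linux/rcupdate.h>

/* Illustrative: how many rcu_bh batches completed while work() ran? */
static long example_bh_batches_elapsed(void (*work)(void))
{
	long snap = rcu_batches_completed_bh();		/* snapshot before */

	work();
	return rcu_batches_completed_bh() - snap;	/* progress since  */
}
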
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 8154e7589d12..4f2c4272d59c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Read-Copy Update /proc-based torture test facility | 2 | * Read-Copy Update module-based torture test facility |
| 3 | * | 3 | * |
| 4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
| 5 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
| @@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ | |||
| 53 | static int verbose; /* Print more debug info. */ | 53 | static int verbose; /* Print more debug info. */ |
| 54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | 54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ |
| 55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ | 55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ |
| 56 | static char *torture_type = "rcu"; /* What to torture. */ | ||
| 56 | 57 | ||
| 57 | module_param(nreaders, int, 0); | 58 | module_param(nreaders, int, 0); |
| 58 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 59 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
| @@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0); | |||
| 64 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | 65 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); |
| 65 | module_param(shuffle_interval, int, 0); | 66 | module_param(shuffle_interval, int, 0); |
| 66 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | 67 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); |
| 67 | #define TORTURE_FLAG "rcutorture: " | 68 | module_param(torture_type, charp, 0); |
| 69 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); | ||
| 70 | |||
| 71 | #define TORTURE_FLAG "-torture:" | ||
| 68 | #define PRINTK_STRING(s) \ | 72 | #define PRINTK_STRING(s) \ |
| 69 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 73 | do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
| 70 | #define VERBOSE_PRINTK_STRING(s) \ | 74 | #define VERBOSE_PRINTK_STRING(s) \ |
| 71 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 75 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
| 72 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 76 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
| 73 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | 77 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) |
| 74 | 78 | ||
| 75 | static char printk_buf[4096]; | 79 | static char printk_buf[4096]; |
| 76 | 80 | ||
| @@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p) | |||
| 139 | spin_unlock_bh(&rcu_torture_lock); | 143 | spin_unlock_bh(&rcu_torture_lock); |
| 140 | } | 144 | } |
| 141 | 145 | ||
| 142 | static void | ||
| 143 | rcu_torture_cb(struct rcu_head *p) | ||
| 144 | { | ||
| 145 | int i; | ||
| 146 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
| 147 | |||
| 148 | if (fullstop) { | ||
| 149 | /* Test is ending, just drop callbacks on the floor. */ | ||
| 150 | /* The next initialization will pick up the pieces. */ | ||
| 151 | return; | ||
| 152 | } | ||
| 153 | i = rp->rtort_pipe_count; | ||
| 154 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 155 | i = RCU_TORTURE_PIPE_LEN; | ||
| 156 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 157 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
| 158 | rp->rtort_mbtest = 0; | ||
| 159 | rcu_torture_free(rp); | ||
| 160 | } else | ||
| 161 | call_rcu(p, rcu_torture_cb); | ||
| 162 | } | ||
| 163 | |||
| 164 | struct rcu_random_state { | 146 | struct rcu_random_state { |
| 165 | unsigned long rrs_state; | 147 | unsigned long rrs_state; |
| 166 | unsigned long rrs_count; | 148 | unsigned long rrs_count; |
| @@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp) | |||
| 191 | } | 173 | } |
| 192 | 174 | ||
| 193 | /* | 175 | /* |
| 176 | * Operations vector for selecting different types of tests. | ||
| 177 | */ | ||
| 178 | |||
| 179 | struct rcu_torture_ops { | ||
| 180 | void (*init)(void); | ||
| 181 | void (*cleanup)(void); | ||
| 182 | int (*readlock)(void); | ||
| 183 | void (*readunlock)(int idx); | ||
| 184 | int (*completed)(void); | ||
| 185 | void (*deferredfree)(struct rcu_torture *p); | ||
| 186 | int (*stats)(char *page); | ||
| 187 | char *name; | ||
| 188 | }; | ||
| 189 | static struct rcu_torture_ops *cur_ops = NULL; | ||
| 190 | |||
| 191 | /* | ||
| 192 | * Definitions for rcu torture testing. | ||
| 193 | */ | ||
| 194 | |||
| 195 | static int rcu_torture_read_lock(void) __acquires(RCU) | ||
| 196 | { | ||
| 197 | rcu_read_lock(); | ||
| 198 | return 0; | ||
| 199 | } | ||
| 200 | |||
| 201 | static void rcu_torture_read_unlock(int idx) __releases(RCU) | ||
| 202 | { | ||
| 203 | rcu_read_unlock(); | ||
| 204 | } | ||
| 205 | |||
| 206 | static int rcu_torture_completed(void) | ||
| 207 | { | ||
| 208 | return rcu_batches_completed(); | ||
| 209 | } | ||
| 210 | |||
| 211 | static void | ||
| 212 | rcu_torture_cb(struct rcu_head *p) | ||
| 213 | { | ||
| 214 | int i; | ||
| 215 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
| 216 | |||
| 217 | if (fullstop) { | ||
| 218 | /* Test is ending, just drop callbacks on the floor. */ | ||
| 219 | /* The next initialization will pick up the pieces. */ | ||
| 220 | return; | ||
| 221 | } | ||
| 222 | i = rp->rtort_pipe_count; | ||
| 223 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 224 | i = RCU_TORTURE_PIPE_LEN; | ||
| 225 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 226 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
| 227 | rp->rtort_mbtest = 0; | ||
| 228 | rcu_torture_free(rp); | ||
| 229 | } else | ||
| 230 | cur_ops->deferredfree(rp); | ||
| 231 | } | ||
| 232 | |||
| 233 | static void rcu_torture_deferred_free(struct rcu_torture *p) | ||
| 234 | { | ||
| 235 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | ||
| 236 | } | ||
| 237 | |||
| 238 | static struct rcu_torture_ops rcu_ops = { | ||
| 239 | .init = NULL, | ||
| 240 | .cleanup = NULL, | ||
| 241 | .readlock = rcu_torture_read_lock, | ||
| 242 | .readunlock = rcu_torture_read_unlock, | ||
| 243 | .completed = rcu_torture_completed, | ||
| 244 | .deferredfree = rcu_torture_deferred_free, | ||
| 245 | .stats = NULL, | ||
| 246 | .name = "rcu" | ||
| 247 | }; | ||
| 248 | |||
| 249 | /* | ||
| 250 | * Definitions for rcu_bh torture testing. | ||
| 251 | */ | ||
| 252 | |||
| 253 | static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) | ||
| 254 | { | ||
| 255 | rcu_read_lock_bh(); | ||
| 256 | return 0; | ||
| 257 | } | ||
| 258 | |||
| 259 | static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) | ||
| 260 | { | ||
| 261 | rcu_read_unlock_bh(); | ||
| 262 | } | ||
| 263 | |||
| 264 | static int rcu_bh_torture_completed(void) | ||
| 265 | { | ||
| 266 | return rcu_batches_completed_bh(); | ||
| 267 | } | ||
| 268 | |||
| 269 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | ||
| 270 | { | ||
| 271 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | ||
| 272 | } | ||
| 273 | |||
| 274 | static struct rcu_torture_ops rcu_bh_ops = { | ||
| 275 | .init = NULL, | ||
| 276 | .cleanup = NULL, | ||
| 277 | .readlock = rcu_bh_torture_read_lock, | ||
| 278 | .readunlock = rcu_bh_torture_read_unlock, | ||
| 279 | .completed = rcu_bh_torture_completed, | ||
| 280 | .deferredfree = rcu_bh_torture_deferred_free, | ||
| 281 | .stats = NULL, | ||
| 282 | .name = "rcu_bh" | ||
| 283 | }; | ||
| 284 | |||
| 285 | static struct rcu_torture_ops *torture_ops[] = | ||
| 286 | { &rcu_ops, &rcu_bh_ops, NULL }; | ||
| 287 | |||
| 288 | /* | ||
| 194 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 289 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
| 195 | * for that pointed to by rcu_torture_current, freeing the old structure | 290 | * for that pointed to by rcu_torture_current, freeing the old structure |
| 196 | * after a series of grace periods (the "pipeline"). | 291 | * after a series of grace periods (the "pipeline"). |
| @@ -209,8 +304,6 @@ rcu_torture_writer(void *arg) | |||
| 209 | 304 | ||
| 210 | do { | 305 | do { |
| 211 | schedule_timeout_uninterruptible(1); | 306 | schedule_timeout_uninterruptible(1); |
| 212 | if (rcu_batches_completed() == oldbatch) | ||
| 213 | continue; | ||
| 214 | if ((rp = rcu_torture_alloc()) == NULL) | 307 | if ((rp = rcu_torture_alloc()) == NULL) |
| 215 | continue; | 308 | continue; |
| 216 | rp->rtort_pipe_count = 0; | 309 | rp->rtort_pipe_count = 0; |
| @@ -225,10 +318,10 @@ rcu_torture_writer(void *arg) | |||
| 225 | i = RCU_TORTURE_PIPE_LEN; | 318 | i = RCU_TORTURE_PIPE_LEN; |
| 226 | atomic_inc(&rcu_torture_wcount[i]); | 319 | atomic_inc(&rcu_torture_wcount[i]); |
| 227 | old_rp->rtort_pipe_count++; | 320 | old_rp->rtort_pipe_count++; |
| 228 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | 321 | cur_ops->deferredfree(old_rp); |
| 229 | } | 322 | } |
| 230 | rcu_torture_current_version++; | 323 | rcu_torture_current_version++; |
| 231 | oldbatch = rcu_batches_completed(); | 324 | oldbatch = cur_ops->completed(); |
| 232 | } while (!kthread_should_stop() && !fullstop); | 325 | } while (!kthread_should_stop() && !fullstop); |
| 233 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 326 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
| 234 | while (!kthread_should_stop()) | 327 | while (!kthread_should_stop()) |
| @@ -246,6 +339,7 @@ static int | |||
| 246 | rcu_torture_reader(void *arg) | 339 | rcu_torture_reader(void *arg) |
| 247 | { | 340 | { |
| 248 | int completed; | 341 | int completed; |
| 342 | int idx; | ||
| 249 | DEFINE_RCU_RANDOM(rand); | 343 | DEFINE_RCU_RANDOM(rand); |
| 250 | struct rcu_torture *p; | 344 | struct rcu_torture *p; |
| 251 | int pipe_count; | 345 | int pipe_count; |
| @@ -254,12 +348,12 @@ rcu_torture_reader(void *arg) | |||
| 254 | set_user_nice(current, 19); | 348 | set_user_nice(current, 19); |
| 255 | 349 | ||
| 256 | do { | 350 | do { |
| 257 | rcu_read_lock(); | 351 | idx = cur_ops->readlock(); |
| 258 | completed = rcu_batches_completed(); | 352 | completed = cur_ops->completed(); |
| 259 | p = rcu_dereference(rcu_torture_current); | 353 | p = rcu_dereference(rcu_torture_current); |
| 260 | if (p == NULL) { | 354 | if (p == NULL) { |
| 261 | /* Wait for rcu_torture_writer to get underway */ | 355 | /* Wait for rcu_torture_writer to get underway */ |
| 262 | rcu_read_unlock(); | 356 | cur_ops->readunlock(idx); |
| 263 | schedule_timeout_interruptible(HZ); | 357 | schedule_timeout_interruptible(HZ); |
| 264 | continue; | 358 | continue; |
| 265 | } | 359 | } |
| @@ -273,14 +367,14 @@ rcu_torture_reader(void *arg) | |||
| 273 | pipe_count = RCU_TORTURE_PIPE_LEN; | 367 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 274 | } | 368 | } |
| 275 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | 369 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; |
| 276 | completed = rcu_batches_completed() - completed; | 370 | completed = cur_ops->completed() - completed; |
| 277 | if (completed > RCU_TORTURE_PIPE_LEN) { | 371 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 278 | /* Should not happen, but... */ | 372 | /* Should not happen, but... */ |
| 279 | completed = RCU_TORTURE_PIPE_LEN; | 373 | completed = RCU_TORTURE_PIPE_LEN; |
| 280 | } | 374 | } |
| 281 | ++__get_cpu_var(rcu_torture_batch)[completed]; | 375 | ++__get_cpu_var(rcu_torture_batch)[completed]; |
| 282 | preempt_enable(); | 376 | preempt_enable(); |
| 283 | rcu_read_unlock(); | 377 | cur_ops->readunlock(idx); |
| 284 | schedule(); | 378 | schedule(); |
| 285 | } while (!kthread_should_stop() && !fullstop); | 379 | } while (!kthread_should_stop() && !fullstop); |
| 286 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | 380 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); |
| @@ -311,7 +405,7 @@ rcu_torture_printk(char *page) | |||
| 311 | if (pipesummary[i] != 0) | 405 | if (pipesummary[i] != 0) |
| 312 | break; | 406 | break; |
| 313 | } | 407 | } |
| 314 | cnt += sprintf(&page[cnt], "rcutorture: "); | 408 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
| 315 | cnt += sprintf(&page[cnt], | 409 | cnt += sprintf(&page[cnt], |
| 316 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 410 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
| 317 | "rtmbe: %d", | 411 | "rtmbe: %d", |
| @@ -324,7 +418,7 @@ rcu_torture_printk(char *page) | |||
| 324 | atomic_read(&n_rcu_torture_mberror)); | 418 | atomic_read(&n_rcu_torture_mberror)); |
| 325 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 419 | if (atomic_read(&n_rcu_torture_mberror) != 0) |
| 326 | cnt += sprintf(&page[cnt], " !!!"); | 420 | cnt += sprintf(&page[cnt], " !!!"); |
| 327 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 421 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
| 328 | if (i > 1) { | 422 | if (i > 1) { |
| 329 | cnt += sprintf(&page[cnt], "!!! "); | 423 | cnt += sprintf(&page[cnt], "!!! "); |
| 330 | atomic_inc(&n_rcu_torture_error); | 424 | atomic_inc(&n_rcu_torture_error); |
| @@ -332,17 +426,19 @@ rcu_torture_printk(char *page) | |||
| 332 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | 426 | cnt += sprintf(&page[cnt], "Reader Pipe: "); |
| 333 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 427 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
| 334 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | 428 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); |
| 335 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 429 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
| 336 | cnt += sprintf(&page[cnt], "Reader Batch: "); | 430 | cnt += sprintf(&page[cnt], "Reader Batch: "); |
| 337 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | 431 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
| 338 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | 432 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); |
| 339 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 433 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
| 340 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | 434 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); |
| 341 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 435 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
| 342 | cnt += sprintf(&page[cnt], " %d", | 436 | cnt += sprintf(&page[cnt], " %d", |
| 343 | atomic_read(&rcu_torture_wcount[i])); | 437 | atomic_read(&rcu_torture_wcount[i])); |
| 344 | } | 438 | } |
| 345 | cnt += sprintf(&page[cnt], "\n"); | 439 | cnt += sprintf(&page[cnt], "\n"); |
| 440 | if (cur_ops->stats != NULL) | ||
| 441 | cnt += cur_ops->stats(&page[cnt]); | ||
| 346 | return cnt; | 442 | return cnt; |
| 347 | } | 443 | } |
| 348 | 444 | ||
| @@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg) | |||
| 444 | static inline void | 540 | static inline void |
| 445 | rcu_torture_print_module_parms(char *tag) | 541 | rcu_torture_print_module_parms(char *tag) |
| 446 | { | 542 | { |
| 447 | printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " | 543 | printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " |
| 448 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 544 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
| 449 | "shuffle_interval = %d\n", | 545 | "shuffle_interval = %d\n", |
| 450 | tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, | 546 | torture_type, tag, nrealreaders, stat_interval, verbose, |
| 451 | shuffle_interval); | 547 | test_no_idle_hz, shuffle_interval); |
| 452 | } | 548 | } |
| 453 | 549 | ||
| 454 | static void | 550 | static void |
| @@ -493,6 +589,9 @@ rcu_torture_cleanup(void) | |||
| 493 | rcu_barrier(); | 589 | rcu_barrier(); |
| 494 | 590 | ||
| 495 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 591 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
| 592 | |||
| 593 | if (cur_ops->cleanup != NULL) | ||
| 594 | cur_ops->cleanup(); | ||
| 496 | if (atomic_read(&n_rcu_torture_error)) | 595 | if (atomic_read(&n_rcu_torture_error)) |
| 497 | rcu_torture_print_module_parms("End of test: FAILURE"); | 596 | rcu_torture_print_module_parms("End of test: FAILURE"); |
| 498 | else | 597 | else |
| @@ -508,6 +607,20 @@ rcu_torture_init(void) | |||
| 508 | 607 | ||
| 509 | /* Process args and tell the world that the torturer is on the job. */ | 608 | /* Process args and tell the world that the torturer is on the job. */ |
| 510 | 609 | ||
| 610 | for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { | ||
| 611 | cur_ops = torture_ops[i]; | ||
| 612 | if (strcmp(torture_type, cur_ops->name) == 0) { | ||
| 613 | break; | ||
| 614 | } | ||
| 615 | } | ||
| 616 | if (cur_ops == NULL) { | ||
| 617 | printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", | ||
| 618 | torture_type); | ||
| 619 | return (-EINVAL); | ||
| 620 | } | ||
| 621 | if (cur_ops->init != NULL) | ||
| 622 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | ||
| 623 | |||
| 511 | if (nreaders >= 0) | 624 | if (nreaders >= 0) |
| 512 | nrealreaders = nreaders; | 625 | nrealreaders = nreaders; |
| 513 | else | 626 | else |
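
The rcu_torture_ops vector turns the module into a pluggable framework: at init time torture_type is matched by strcmp() against each entry's ->name, so supporting another flavor only requires filling in the same fields and appending the entry to torture_ops[]. A hedged sketch of such an additional entry, reusing the rcu callbacks defined above; the entry itself is illustrative and not part of this patch:

/* Hypothetical third flavor, selected with torture_type="example". */
static struct rcu_torture_ops example_ops = {
	.init		= NULL,				/* optional one-time setup */
	.cleanup	= NULL,				/* optional teardown       */
	.readlock	= rcu_torture_read_lock,	/* reuse rcu's read side   */
	.readunlock	= rcu_torture_read_unlock,
	.completed	= rcu_torture_completed,	/* grace-period counter    */
	.deferredfree	= rcu_torture_deferred_free,	/* deferred reclamation    */
	.stats		= NULL,				/* optional extra stats    */
	.name		= "example"			/* matched against torture_type */
};

Listing &example_ops in torture_ops[] before the terminating NULL is all the registration the selection loop needs.
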
diff --git a/kernel/relay.c b/kernel/relay.c index 33345e73485c..1d63ecddfa70 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) | |||
| 95 | * @buf: the buffer struct | 95 | * @buf: the buffer struct |
| 96 | * @size: total size of the buffer | 96 | * @size: total size of the buffer |
| 97 | * | 97 | * |
| 98 | * Returns a pointer to the resulting buffer, NULL if unsuccessful. The | 98 | * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The |
| 99 | * passed in size will get page aligned, if it isn't already. | 99 | * passed in size will get page aligned, if it isn't already. |
| 100 | */ | 100 | */ |
| 101 | static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) | 101 | static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) |
| @@ -132,10 +132,9 @@ depopulate: | |||
| 132 | 132 | ||
| 133 | /** | 133 | /** |
| 134 | * relay_create_buf - allocate and initialize a channel buffer | 134 | * relay_create_buf - allocate and initialize a channel buffer |
| 135 | * @alloc_size: size of the buffer to allocate | 135 | * @chan: the relay channel |
| 136 | * @n_subbufs: number of sub-buffers in the channel | ||
| 137 | * | 136 | * |
| 138 | * Returns channel buffer if successful, NULL otherwise | 137 | * Returns channel buffer if successful, %NULL otherwise. |
| 139 | */ | 138 | */ |
| 140 | struct rchan_buf *relay_create_buf(struct rchan *chan) | 139 | struct rchan_buf *relay_create_buf(struct rchan *chan) |
| 141 | { | 140 | { |
| @@ -163,6 +162,7 @@ free_buf: | |||
| 163 | 162 | ||
| 164 | /** | 163 | /** |
| 165 | * relay_destroy_channel - free the channel struct | 164 | * relay_destroy_channel - free the channel struct |
| 165 | * @kref: target kernel reference that contains the relay channel | ||
| 166 | * | 166 | * |
| 167 | * Should only be called from kref_put(). | 167 | * Should only be called from kref_put(). |
| 168 | */ | 168 | */ |
| @@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf) | |||
| 194 | 194 | ||
| 195 | /** | 195 | /** |
| 196 | * relay_remove_buf - remove a channel buffer | 196 | * relay_remove_buf - remove a channel buffer |
| 197 | * @kref: target kernel reference that contains the relay buffer | ||
| 197 | * | 198 | * |
| 198 | * Removes the file from the fileystem, which also frees the | 199 | * Removes the file from the fileystem, which also frees the |
| 199 | * rchan_buf_struct and the channel buffer. Should only be called from | 200 | * rchan_buf_struct and the channel buffer. Should only be called from |
| @@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan) | |||
| 374 | } | 375 | } |
| 375 | EXPORT_SYMBOL_GPL(relay_reset); | 376 | EXPORT_SYMBOL_GPL(relay_reset); |
| 376 | 377 | ||
| 377 | /** | 378 | /* |
| 378 | * relay_open_buf - create a new relay channel buffer | 379 | * relay_open_buf - create a new relay channel buffer |
| 379 | * | 380 | * |
| 380 | * Internal - used by relay_open(). | 381 | * Internal - used by relay_open(). |
| @@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan, | |||
| 448 | /** | 449 | /** |
| 449 | * relay_open - create a new relay channel | 450 | * relay_open - create a new relay channel |
| 450 | * @base_filename: base name of files to create | 451 | * @base_filename: base name of files to create |
| 451 | * @parent: dentry of parent directory, NULL for root directory | 452 | * @parent: dentry of parent directory, %NULL for root directory |
| 452 | * @subbuf_size: size of sub-buffers | 453 | * @subbuf_size: size of sub-buffers |
| 453 | * @n_subbufs: number of sub-buffers | 454 | * @n_subbufs: number of sub-buffers |
| 454 | * @cb: client callback functions | 455 | * @cb: client callback functions |
| 455 | * | 456 | * |
| 456 | * Returns channel pointer if successful, NULL otherwise. | 457 | * Returns channel pointer if successful, %NULL otherwise. |
| 457 | * | 458 | * |
| 458 | * Creates a channel buffer for each cpu using the sizes and | 459 | * Creates a channel buffer for each cpu using the sizes and |
| 459 | * attributes specified. The created channel buffer files | 460 | * attributes specified. The created channel buffer files |
| @@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf); | |||
| 585 | * subbufs_consumed should be the number of sub-buffers newly consumed, | 586 | * subbufs_consumed should be the number of sub-buffers newly consumed, |
| 586 | * not the total consumed. | 587 | * not the total consumed. |
| 587 | * | 588 | * |
| 588 | * NOTE: kernel clients don't need to call this function if the channel | 589 | * NOTE: Kernel clients don't need to call this function if the channel |
| 589 | * mode is 'overwrite'. | 590 | * mode is 'overwrite'. |
| 590 | */ | 591 | */ |
| 591 | void relay_subbufs_consumed(struct rchan *chan, | 592 | void relay_subbufs_consumed(struct rchan *chan, |
| @@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close); | |||
| 641 | * relay_flush - close the channel | 642 | * relay_flush - close the channel |
| 642 | * @chan: the channel | 643 | * @chan: the channel |
| 643 | * | 644 | * |
| 644 | * Flushes all channel buffers i.e. forces buffer switch. | 645 | * Flushes all channel buffers, i.e. forces buffer switch. |
| 645 | */ | 646 | */ |
| 646 | void relay_flush(struct rchan *chan) | 647 | void relay_flush(struct rchan *chan) |
| 647 | { | 648 | { |
| @@ -669,7 +670,7 @@ EXPORT_SYMBOL_GPL(relay_flush); | |||
| 669 | */ | 670 | */ |
| 670 | static int relay_file_open(struct inode *inode, struct file *filp) | 671 | static int relay_file_open(struct inode *inode, struct file *filp) |
| 671 | { | 672 | { |
| 672 | struct rchan_buf *buf = inode->u.generic_ip; | 673 | struct rchan_buf *buf = inode->i_private; |
| 673 | kref_get(&buf->kref); | 674 | kref_get(&buf->kref); |
| 674 | filp->private_data = buf; | 675 | filp->private_data = buf; |
| 675 | 676 | ||
| @@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp) | |||
| 729 | return 0; | 730 | return 0; |
| 730 | } | 731 | } |
| 731 | 732 | ||
| 732 | /** | 733 | /* |
| 733 | * relay_file_read_consume - update the consumed count for the buffer | 734 | * relay_file_read_consume - update the consumed count for the buffer |
| 734 | */ | 735 | */ |
| 735 | static void relay_file_read_consume(struct rchan_buf *buf, | 736 | static void relay_file_read_consume(struct rchan_buf *buf, |
| @@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf, | |||
| 756 | } | 757 | } |
| 757 | } | 758 | } |
| 758 | 759 | ||
| 759 | /** | 760 | /* |
| 760 | * relay_file_read_avail - boolean, are there unconsumed bytes available? | 761 | * relay_file_read_avail - boolean, are there unconsumed bytes available? |
| 761 | */ | 762 | */ |
| 762 | static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) | 763 | static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) |
| @@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) | |||
| 793 | 794 | ||
| 794 | /** | 795 | /** |
| 795 | * relay_file_read_subbuf_avail - return bytes available in sub-buffer | 796 | * relay_file_read_subbuf_avail - return bytes available in sub-buffer |
| 797 | * @read_pos: file read position | ||
| 798 | * @buf: relay channel buffer | ||
| 796 | */ | 799 | */ |
| 797 | static size_t relay_file_read_subbuf_avail(size_t read_pos, | 800 | static size_t relay_file_read_subbuf_avail(size_t read_pos, |
| 798 | struct rchan_buf *buf) | 801 | struct rchan_buf *buf) |
| @@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, | |||
| 818 | 821 | ||
| 819 | /** | 822 | /** |
| 820 | * relay_file_read_start_pos - find the first available byte to read | 823 | * relay_file_read_start_pos - find the first available byte to read |
| 824 | * @read_pos: file read position | ||
| 825 | * @buf: relay channel buffer | ||
| 821 | * | 826 | * |
| 822 | * If the read_pos is in the middle of padding, return the | 827 | * If the read_pos is in the middle of padding, return the |
| 823 | * position of the first actually available byte, otherwise | 828 | * position of the first actually available byte, otherwise |
| @@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos, | |||
| 844 | 849 | ||
| 845 | /** | 850 | /** |
| 846 | * relay_file_read_end_pos - return the new read position | 851 | * relay_file_read_end_pos - return the new read position |
| 852 | * @read_pos: file read position | ||
| 853 | * @buf: relay channel buffer | ||
| 854 | * @count: number of bytes to be read | ||
| 847 | */ | 855 | */ |
| 848 | static size_t relay_file_read_end_pos(struct rchan_buf *buf, | 856 | static size_t relay_file_read_end_pos(struct rchan_buf *buf, |
| 849 | size_t read_pos, | 857 | size_t read_pos, |
| @@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf, | |||
| 865 | return end_pos; | 873 | return end_pos; |
| 866 | } | 874 | } |
| 867 | 875 | ||
| 868 | /** | 876 | /* |
| 869 | * subbuf_read_actor - read up to one subbuf's worth of data | 877 | * subbuf_read_actor - read up to one subbuf's worth of data |
| 870 | */ | 878 | */ |
| 871 | static int subbuf_read_actor(size_t read_start, | 879 | static int subbuf_read_actor(size_t read_start, |
| @@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start, | |||
| 890 | return ret; | 898 | return ret; |
| 891 | } | 899 | } |
| 892 | 900 | ||
| 893 | /** | 901 | /* |
| 894 | * subbuf_send_actor - send up to one subbuf's worth of data | 902 | * subbuf_send_actor - send up to one subbuf's worth of data |
| 895 | */ | 903 | */ |
| 896 | static int subbuf_send_actor(size_t read_start, | 904 | static int subbuf_send_actor(size_t read_start, |
| @@ -933,7 +941,7 @@ typedef int (*subbuf_actor_t) (size_t read_start, | |||
| 933 | read_descriptor_t *desc, | 941 | read_descriptor_t *desc, |
| 934 | read_actor_t actor); | 942 | read_actor_t actor); |
| 935 | 943 | ||
| 936 | /** | 944 | /* |
| 937 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries | 945 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries |
| 938 | */ | 946 | */ |
| 939 | static inline ssize_t relay_file_read_subbufs(struct file *filp, | 947 | static inline ssize_t relay_file_read_subbufs(struct file *filp, |
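
The kernel-doc above spells out relay_open()'s contract: a base filename, an optional parent dentry (%NULL for the root directory), per-cpu buffers made of n_subbufs sub-buffers of subbuf_size bytes each, and a callback table; it returns the channel pointer or %NULL. A hedged sketch of that calling convention; the sizes and callback table are placeholders:

#include <linux/relay.h>

static struct rchan_callbacks example_cb;	/* client callbacks, assumed defined elsewhere */

/* Illustrative: 8 sub-buffers of 16 KiB each, created in the root directory. */
static struct rchan *example_open_channel(void)
{
	return relay_open("example", NULL, 16 * 1024, 8, &example_cb);
}
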
diff --git a/kernel/resource.c b/kernel/resource.c index e3080fcc66a3..9db38a1a7520 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -7,7 +7,6 @@ | |||
| 7 | * Arbitrary resource management. | 7 | * Arbitrary resource management. |
| 8 | */ | 8 | */ |
| 9 | 9 | ||
| 10 | #include <linux/config.h> | ||
| 11 | #include <linux/module.h> | 10 | #include <linux/module.h> |
| 12 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
| 13 | #include <linux/errno.h> | 12 | #include <linux/errno.h> |
| @@ -23,20 +22,18 @@ | |||
| 23 | 22 | ||
| 24 | struct resource ioport_resource = { | 23 | struct resource ioport_resource = { |
| 25 | .name = "PCI IO", | 24 | .name = "PCI IO", |
| 26 | .start = 0x0000, | 25 | .start = 0, |
| 27 | .end = IO_SPACE_LIMIT, | 26 | .end = IO_SPACE_LIMIT, |
| 28 | .flags = IORESOURCE_IO, | 27 | .flags = IORESOURCE_IO, |
| 29 | }; | 28 | }; |
| 30 | |||
| 31 | EXPORT_SYMBOL(ioport_resource); | 29 | EXPORT_SYMBOL(ioport_resource); |
| 32 | 30 | ||
| 33 | struct resource iomem_resource = { | 31 | struct resource iomem_resource = { |
| 34 | .name = "PCI mem", | 32 | .name = "PCI mem", |
| 35 | .start = 0UL, | 33 | .start = 0, |
| 36 | .end = ~0UL, | 34 | .end = -1, |
| 37 | .flags = IORESOURCE_MEM, | 35 | .flags = IORESOURCE_MEM, |
| 38 | }; | 36 | }; |
| 39 | |||
| 40 | EXPORT_SYMBOL(iomem_resource); | 37 | EXPORT_SYMBOL(iomem_resource); |
| 41 | 38 | ||
| 42 | static DEFINE_RWLOCK(resource_lock); | 39 | static DEFINE_RWLOCK(resource_lock); |
| @@ -83,10 +80,10 @@ static int r_show(struct seq_file *m, void *v) | |||
| 83 | for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) | 80 | for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) |
| 84 | if (p->parent == root) | 81 | if (p->parent == root) |
| 85 | break; | 82 | break; |
| 86 | seq_printf(m, "%*s%0*lx-%0*lx : %s\n", | 83 | seq_printf(m, "%*s%0*llx-%0*llx : %s\n", |
| 87 | depth * 2, "", | 84 | depth * 2, "", |
| 88 | width, r->start, | 85 | width, (unsigned long long) r->start, |
| 89 | width, r->end, | 86 | width, (unsigned long long) r->end, |
| 90 | r->name ? r->name : "<BAD>"); | 87 | r->name ? r->name : "<BAD>"); |
| 91 | return 0; | 88 | return 0; |
| 92 | } | 89 | } |
| @@ -151,8 +148,8 @@ __initcall(ioresources_init); | |||
| 151 | /* Return the conflict entry if you can't request it */ | 148 | /* Return the conflict entry if you can't request it */ |
| 152 | static struct resource * __request_resource(struct resource *root, struct resource *new) | 149 | static struct resource * __request_resource(struct resource *root, struct resource *new) |
| 153 | { | 150 | { |
| 154 | unsigned long start = new->start; | 151 | resource_size_t start = new->start; |
| 155 | unsigned long end = new->end; | 152 | resource_size_t end = new->end; |
| 156 | struct resource *tmp, **p; | 153 | struct resource *tmp, **p; |
| 157 | 154 | ||
| 158 | if (end < start) | 155 | if (end < start) |
| @@ -232,15 +229,55 @@ int release_resource(struct resource *old) | |||
| 232 | 229 | ||
| 233 | EXPORT_SYMBOL(release_resource); | 230 | EXPORT_SYMBOL(release_resource); |
| 234 | 231 | ||
| 232 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 233 | /* | ||
| 234 | * Finds the lowest memory resource that exists within [res->start, res->end). | ||
| 235 | * The caller must specify res->start, res->end and res->flags. | ||
| 236 | * If found, returns 0 and overwrites res; if not found, returns -1. | ||
| 237 | */ | ||
| 238 | int find_next_system_ram(struct resource *res) | ||
| 239 | { | ||
| 240 | resource_size_t start, end; | ||
| 241 | struct resource *p; | ||
| 242 | |||
| 243 | BUG_ON(!res); | ||
| 244 | |||
| 245 | start = res->start; | ||
| 246 | end = res->end; | ||
| 247 | BUG_ON(start >= end); | ||
| 248 | |||
| 249 | read_lock(&resource_lock); | ||
| 250 | for (p = iomem_resource.child; p ; p = p->sibling) { | ||
| 251 | /* system ram is just marked as IORESOURCE_MEM */ | ||
| 252 | if (p->flags != res->flags) | ||
| 253 | continue; | ||
| 254 | if (p->start > end) { | ||
| 255 | p = NULL; | ||
| 256 | break; | ||
| 257 | } | ||
| 258 | if ((p->end >= start) && (p->start < end)) | ||
| 259 | break; | ||
| 260 | } | ||
| 261 | read_unlock(&resource_lock); | ||
| 262 | if (!p) | ||
| 263 | return -1; | ||
| 264 | /* copy data */ | ||
| 265 | if (res->start < p->start) | ||
| 266 | res->start = p->start; | ||
| 267 | if (res->end > p->end) | ||
| 268 | res->end = p->end; | ||
| 269 | return 0; | ||
| 270 | } | ||
| 271 | #endif | ||
| 272 | |||
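
find_next_system_ram() clamps and overwrites the passed-in resource with the first System RAM range intersecting [res->start, res->end], so a memory-hotplug caller can walk physical memory one range at a time. A hedged sketch of that walk; the walker and its callback are illustrative:

#include <linux/ioport.h>

/* Illustrative walker: visit every System RAM chunk between start and end (start < end). */
static void example_walk_system_ram(resource_size_t start, resource_size_t end,
				    void (*visit)(resource_size_t, resource_size_t))
{
	struct resource res;

	res.start = start;
	res.end = end;
	res.flags = IORESOURCE_MEM;	/* System RAM is plain IORESOURCE_MEM here */

	while (res.start < end && find_next_system_ram(&res) >= 0) {
		visit(res.start, res.end);	/* [res.start, res.end] is RAM */
		res.start = res.end + 1;	/* continue after this chunk   */
		res.end = end;
	}
}
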
| 235 | /* | 273 | /* |
| 236 | * Find empty slot in the resource tree given range and alignment. | 274 | * Find empty slot in the resource tree given range and alignment. |
| 237 | */ | 275 | */ |
| 238 | static int find_resource(struct resource *root, struct resource *new, | 276 | static int find_resource(struct resource *root, struct resource *new, |
| 239 | unsigned long size, | 277 | resource_size_t size, resource_size_t min, |
| 240 | unsigned long min, unsigned long max, | 278 | resource_size_t max, resource_size_t align, |
| 241 | unsigned long align, | ||
| 242 | void (*alignf)(void *, struct resource *, | 279 | void (*alignf)(void *, struct resource *, |
| 243 | unsigned long, unsigned long), | 280 | resource_size_t, resource_size_t), |
| 244 | void *alignf_data) | 281 | void *alignf_data) |
| 245 | { | 282 | { |
| 246 | struct resource *this = root->child; | 283 | struct resource *this = root->child; |
| @@ -282,11 +319,10 @@ static int find_resource(struct resource *root, struct resource *new, | |||
| 282 | * Allocate empty slot in the resource tree given range and alignment. | 319 | * Allocate empty slot in the resource tree given range and alignment. |
| 283 | */ | 320 | */ |
| 284 | int allocate_resource(struct resource *root, struct resource *new, | 321 | int allocate_resource(struct resource *root, struct resource *new, |
| 285 | unsigned long size, | 322 | resource_size_t size, resource_size_t min, |
| 286 | unsigned long min, unsigned long max, | 323 | resource_size_t max, resource_size_t align, |
| 287 | unsigned long align, | ||
| 288 | void (*alignf)(void *, struct resource *, | 324 | void (*alignf)(void *, struct resource *, |
| 289 | unsigned long, unsigned long), | 325 | resource_size_t, resource_size_t), |
| 290 | void *alignf_data) | 326 | void *alignf_data) |
| 291 | { | 327 | { |
| 292 | int err; | 328 | int err; |
| @@ -308,12 +344,11 @@ EXPORT_SYMBOL(allocate_resource); | |||
| 308 | * | 344 | * |
| 309 | * Returns 0 on success, -EBUSY if the resource can't be inserted. | 345 | * Returns 0 on success, -EBUSY if the resource can't be inserted. |
| 310 | * | 346 | * |
| 311 | * This function is equivalent of request_resource when no conflict | 347 | * This function is equivalent to request_resource when no conflict |
| 312 | * happens. If a conflict happens, and the conflicting resources | 348 | * happens. If a conflict happens, and the conflicting resources |
| 313 | * entirely fit within the range of the new resource, then the new | 349 | * entirely fit within the range of the new resource, then the new |
| 314 | * resource is inserted and the conflicting resources become childs of | 350 | * resource is inserted and the conflicting resources become children of |
| 315 | * the new resource. Otherwise the new resource becomes the child of | 351 | * the new resource. |
| 316 | * the conflicting resource | ||
| 317 | */ | 352 | */ |
| 318 | int insert_resource(struct resource *parent, struct resource *new) | 353 | int insert_resource(struct resource *parent, struct resource *new) |
| 319 | { | 354 | { |
| @@ -321,20 +356,21 @@ int insert_resource(struct resource *parent, struct resource *new) | |||
| 321 | struct resource *first, *next; | 356 | struct resource *first, *next; |
| 322 | 357 | ||
| 323 | write_lock(&resource_lock); | 358 | write_lock(&resource_lock); |
| 324 | begin: | ||
| 325 | result = 0; | ||
| 326 | first = __request_resource(parent, new); | ||
| 327 | if (!first) | ||
| 328 | goto out; | ||
| 329 | 359 | ||
| 330 | result = -EBUSY; | 360 | for (;; parent = first) { |
| 331 | if (first == parent) | 361 | result = 0; |
| 332 | goto out; | 362 | first = __request_resource(parent, new); |
| 363 | if (!first) | ||
| 364 | goto out; | ||
| 365 | |||
| 366 | result = -EBUSY; | ||
| 367 | if (first == parent) | ||
| 368 | goto out; | ||
| 333 | 369 | ||
| 334 | /* Resource fully contained by the clashing resource? Recurse into it */ | 370 | if ((first->start > new->start) || (first->end < new->end)) |
| 335 | if (first->start <= new->start && first->end >= new->end) { | 371 | break; |
| 336 | parent = first; | 372 | if ((first->start == new->start) && (first->end == new->end)) |
| 337 | goto begin; | 373 | break; |
| 338 | } | 374 | } |
| 339 | 375 | ||
| 340 | for (next = first; ; next = next->sibling) { | 376 | for (next = first; ; next = next->sibling) { |
| @@ -371,17 +407,15 @@ int insert_resource(struct resource *parent, struct resource *new) | |||
| 371 | return result; | 407 | return result; |
| 372 | } | 408 | } |
| 373 | 409 | ||
| 374 | EXPORT_SYMBOL(insert_resource); | ||
| 375 | |||
| 376 | /* | 410 | /* |
| 377 | * Given an existing resource, change its start and size to match the | 411 | * Given an existing resource, change its start and size to match the |
| 378 | * arguments. Returns -EBUSY if it can't fit. Existing children of | 412 | * arguments. Returns -EBUSY if it can't fit. Existing children of |
| 379 | * the resource are assumed to be immutable. | 413 | * the resource are assumed to be immutable. |
| 380 | */ | 414 | */ |
| 381 | int adjust_resource(struct resource *res, unsigned long start, unsigned long size) | 415 | int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) |
| 382 | { | 416 | { |
| 383 | struct resource *tmp, *parent = res->parent; | 417 | struct resource *tmp, *parent = res->parent; |
| 384 | unsigned long end = start + size - 1; | 418 | resource_size_t end = start + size - 1; |
| 385 | int result = -EBUSY; | 419 | int result = -EBUSY; |
| 386 | 420 | ||
| 387 | write_lock(&resource_lock); | 421 | write_lock(&resource_lock); |
| @@ -428,7 +462,9 @@ EXPORT_SYMBOL(adjust_resource); | |||
| 428 | * | 462 | * |
| 429 | * Release-region releases a matching busy region. | 463 | * Release-region releases a matching busy region. |
| 430 | */ | 464 | */ |
| 431 | struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) | 465 | struct resource * __request_region(struct resource *parent, |
| 466 | resource_size_t start, resource_size_t n, | ||
| 467 | const char *name) | ||
| 432 | { | 468 | { |
| 433 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); | 469 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); |
| 434 | 470 | ||
| @@ -464,7 +500,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start, | |||
| 464 | 500 | ||
| 465 | EXPORT_SYMBOL(__request_region); | 501 | EXPORT_SYMBOL(__request_region); |
| 466 | 502 | ||
| 467 | int __check_region(struct resource *parent, unsigned long start, unsigned long n) | 503 | int __check_region(struct resource *parent, resource_size_t start, |
| 504 | resource_size_t n) | ||
| 468 | { | 505 | { |
| 469 | struct resource * res; | 506 | struct resource * res; |
| 470 | 507 | ||
| @@ -479,10 +516,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n | |||
| 479 | 516 | ||
| 480 | EXPORT_SYMBOL(__check_region); | 517 | EXPORT_SYMBOL(__check_region); |
| 481 | 518 | ||
| 482 | void __release_region(struct resource *parent, unsigned long start, unsigned long n) | 519 | void __release_region(struct resource *parent, resource_size_t start, |
| 520 | resource_size_t n) | ||
| 483 | { | 521 | { |
| 484 | struct resource **p; | 522 | struct resource **p; |
| 485 | unsigned long end; | 523 | resource_size_t end; |
| 486 | 524 | ||
| 487 | p = &parent->child; | 525 | p = &parent->child; |
| 488 | end = start + n - 1; | 526 | end = start + n - 1; |
| @@ -511,7 +549,9 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon | |||
| 511 | 549 | ||
| 512 | write_unlock(&resource_lock); | 550 | write_unlock(&resource_lock); |
| 513 | 551 | ||
| 514 | printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); | 552 | printk(KERN_WARNING "Trying to free nonexistent resource " |
| 553 | "<%016llx-%016llx>\n", (unsigned long long)start, | ||
| 554 | (unsigned long long)end); | ||
| 515 | } | 555 | } |
| 516 | 556 | ||
| 517 | EXPORT_SYMBOL(__release_region); | 557 | EXPORT_SYMBOL(__release_region); |
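
With the switch to resource_size_t, region claims can describe ranges wider than unsigned long on configurations where physical resources exceed 32 bits. A hedged sketch of the claim/release pairing against the ioport_resource root shown above; the port base and region name are illustrative:

#include <linux/errno.h>
#include <linux/ioport.h>

/* Illustrative driver fragment: claim eight I/O ports, then release them. */
static int example_claim_ports(void)
{
	resource_size_t base = 0x300;	/* assumed, device-specific */

	if (!__request_region(&ioport_resource, base, 8, "example"))
		return -EBUSY;		/* somebody else owns the range */

	/* ... program the hardware ... */

	__release_region(&ioport_resource, base, 8);
	return 0;
}
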
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c new file mode 100644 index 000000000000..0c1faa950af7 --- /dev/null +++ b/kernel/rtmutex-debug.c | |||
| @@ -0,0 +1,242 @@ | |||
| 1 | /* | ||
| 2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
| 3 | * | ||
| 4 | * started by Ingo Molnar and Thomas Gleixner: | ||
| 5 | * | ||
| 6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 7 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
| 8 | * | ||
| 9 | * This code is based on the rt.c implementation in the preempt-rt tree. | ||
| 10 | * Portions of said code are | ||
| 11 | * | ||
| 12 | * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey | ||
| 13 | * Copyright (C) 2006 Esben Nielsen | ||
| 14 | * Copyright (C) 2006 Kihon Technologies Inc., | ||
| 15 | * Steven Rostedt <rostedt@goodmis.org> | ||
| 16 | * | ||
| 17 | * See rt.c in preempt-rt for proper credits and further information | ||
| 18 | */ | ||
| 19 | #include <linux/config.h> | ||
| 20 | #include <linux/sched.h> | ||
| 21 | #include <linux/delay.h> | ||
| 22 | #include <linux/module.h> | ||
| 23 | #include <linux/spinlock.h> | ||
| 24 | #include <linux/kallsyms.h> | ||
| 25 | #include <linux/syscalls.h> | ||
| 26 | #include <linux/interrupt.h> | ||
| 27 | #include <linux/plist.h> | ||
| 28 | #include <linux/fs.h> | ||
| 29 | #include <linux/debug_locks.h> | ||
| 30 | |||
| 31 | #include "rtmutex_common.h" | ||
| 32 | |||
| 33 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
| 34 | # include "rtmutex-debug.h" | ||
| 35 | #else | ||
| 36 | # include "rtmutex.h" | ||
| 37 | #endif | ||
| 38 | |||
| 39 | # define TRACE_WARN_ON(x) WARN_ON(x) | ||
| 40 | # define TRACE_BUG_ON(x) BUG_ON(x) | ||
| 41 | |||
| 42 | # define TRACE_OFF() \ | ||
| 43 | do { \ | ||
| 44 | if (rt_trace_on) { \ | ||
| 45 | rt_trace_on = 0; \ | ||
| 46 | console_verbose(); \ | ||
| 47 | if (spin_is_locked(¤t->pi_lock)) \ | ||
| 48 | spin_unlock(¤t->pi_lock); \ | ||
| 49 | } \ | ||
| 50 | } while (0) | ||
| 51 | |||
| 52 | # define TRACE_OFF_NOLOCK() \ | ||
| 53 | do { \ | ||
| 54 | if (rt_trace_on) { \ | ||
| 55 | rt_trace_on = 0; \ | ||
| 56 | console_verbose(); \ | ||
| 57 | } \ | ||
| 58 | } while (0) | ||
| 59 | |||
| 60 | # define TRACE_BUG_LOCKED() \ | ||
| 61 | do { \ | ||
| 62 | TRACE_OFF(); \ | ||
| 63 | BUG(); \ | ||
| 64 | } while (0) | ||
| 65 | |||
| 66 | # define TRACE_WARN_ON_LOCKED(c) \ | ||
| 67 | do { \ | ||
| 68 | if (unlikely(c)) { \ | ||
| 69 | TRACE_OFF(); \ | ||
| 70 | WARN_ON(1); \ | ||
| 71 | } \ | ||
| 72 | } while (0) | ||
| 73 | |||
| 74 | # define TRACE_BUG_ON_LOCKED(c) \ | ||
| 75 | do { \ | ||
| 76 | if (unlikely(c)) \ | ||
| 77 | TRACE_BUG_LOCKED(); \ | ||
| 78 | } while (0) | ||
| 79 | |||
| 80 | #ifdef CONFIG_SMP | ||
| 81 | # define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) | ||
| 82 | #else | ||
| 83 | # define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) | ||
| 84 | #endif | ||
| 85 | |||
| 86 | /* | ||
| 87 | * deadlock detection flag. We turn it off when we detect | ||
| 88 | * the first problem because we don't want to recurse back | ||
| 89 | * into the tracing code when doing error printk or | ||
| 90 | * executing a BUG(): | ||
| 91 | */ | ||
| 92 | int rt_trace_on = 1; | ||
| 93 | |||
| 94 | void deadlock_trace_off(void) | ||
| 95 | { | ||
| 96 | rt_trace_on = 0; | ||
| 97 | } | ||
| 98 | |||
| 99 | static void printk_task(struct task_struct *p) | ||
| 100 | { | ||
| 101 | if (p) | ||
| 102 | printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
| 103 | else | ||
| 104 | printk("<none>"); | ||
| 105 | } | ||
| 106 | |||
| 107 | static void printk_lock(struct rt_mutex *lock, int print_owner) | ||
| 108 | { | ||
| 109 | if (lock->name) | ||
| 110 | printk(" [%p] {%s}\n", | ||
| 111 | lock, lock->name); | ||
| 112 | else | ||
| 113 | printk(" [%p] {%s:%d}\n", | ||
| 114 | lock, lock->file, lock->line); | ||
| 115 | |||
| 116 | if (print_owner && rt_mutex_owner(lock)) { | ||
| 117 | printk(".. ->owner: %p\n", lock->owner); | ||
| 118 | printk(".. held by: "); | ||
| 119 | printk_task(rt_mutex_owner(lock)); | ||
| 120 | printk("\n"); | ||
| 121 | } | ||
| 122 | } | ||
| 123 | |||
| 124 | void rt_mutex_debug_task_free(struct task_struct *task) | ||
| 125 | { | ||
| 126 | WARN_ON(!plist_head_empty(&task->pi_waiters)); | ||
| 127 | WARN_ON(task->pi_blocked_on); | ||
| 128 | } | ||
| 129 | |||
| 130 | /* | ||
| 131 | * We fill out the fields in the waiter to store the information about | ||
| 132 | * the deadlock. We print when we return. act_waiter can be NULL in | ||
| 133 | * case of a remove waiter operation. | ||
| 134 | */ | ||
| 135 | void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, | ||
| 136 | struct rt_mutex *lock) | ||
| 137 | { | ||
| 138 | struct task_struct *task; | ||
| 139 | |||
| 140 | if (!rt_trace_on || detect || !act_waiter) | ||
| 141 | return; | ||
| 142 | |||
| 143 | task = rt_mutex_owner(act_waiter->lock); | ||
| 144 | if (task && task != current) { | ||
| 145 | act_waiter->deadlock_task_pid = task->pid; | ||
| 146 | act_waiter->deadlock_lock = lock; | ||
| 147 | } | ||
| 148 | } | ||
| 149 | |||
| 150 | void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | ||
| 151 | { | ||
| 152 | struct task_struct *task; | ||
| 153 | |||
| 154 | if (!waiter->deadlock_lock || !rt_trace_on) | ||
| 155 | return; | ||
| 156 | |||
| 157 | task = find_task_by_pid(waiter->deadlock_task_pid); | ||
| 158 | if (!task) | ||
| 159 | return; | ||
| 160 | |||
| 161 | TRACE_OFF_NOLOCK(); | ||
| 162 | |||
| 163 | printk("\n============================================\n"); | ||
| 164 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | ||
| 165 | printk( "--------------------------------------------\n"); | ||
| 166 | printk("%s/%d is deadlocking current task %s/%d\n\n", | ||
| 167 | task->comm, task->pid, current->comm, current->pid); | ||
| 168 | |||
| 169 | printk("\n1) %s/%d is trying to acquire this lock:\n", | ||
| 170 | current->comm, current->pid); | ||
| 171 | printk_lock(waiter->lock, 1); | ||
| 172 | |||
| 173 | printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); | ||
| 174 | printk_lock(waiter->deadlock_lock, 1); | ||
| 175 | |||
| 176 | debug_show_held_locks(current); | ||
| 177 | debug_show_held_locks(task); | ||
| 178 | |||
| 179 | printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); | ||
| 180 | show_stack(task, NULL); | ||
| 181 | printk("\n%s/%d's [current] stackdump:\n\n", | ||
| 182 | current->comm, current->pid); | ||
| 183 | dump_stack(); | ||
| 184 | debug_show_all_locks(); | ||
| 185 | |||
| 186 | printk("[ turning off deadlock detection." | ||
| 187 | "Please report this trace. ]\n\n"); | ||
| 188 | local_irq_disable(); | ||
| 189 | } | ||
| 190 | |||
| 191 | void debug_rt_mutex_lock(struct rt_mutex *lock) | ||
| 192 | { | ||
| 193 | } | ||
| 194 | |||
| 195 | void debug_rt_mutex_unlock(struct rt_mutex *lock) | ||
| 196 | { | ||
| 197 | TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); | ||
| 198 | } | ||
| 199 | |||
| 200 | void | ||
| 201 | debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) | ||
| 202 | { | ||
| 203 | } | ||
| 204 | |||
| 205 | void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) | ||
| 206 | { | ||
| 207 | TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); | ||
| 208 | } | ||
| 209 | |||
| 210 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | ||
| 211 | { | ||
| 212 | memset(waiter, 0x11, sizeof(*waiter)); | ||
| 213 | plist_node_init(&waiter->list_entry, MAX_PRIO); | ||
| 214 | plist_node_init(&waiter->pi_list_entry, MAX_PRIO); | ||
| 215 | } | ||
| 216 | |||
| 217 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | ||
| 218 | { | ||
| 219 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); | ||
| 220 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
| 221 | TRACE_WARN_ON(waiter->task); | ||
| 222 | memset(waiter, 0x22, sizeof(*waiter)); | ||
| 223 | } | ||
| 224 | |||
| 225 | void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) | ||
| 226 | { | ||
| 227 | /* | ||
| 228 | * Make sure we are not reinitializing a held lock: | ||
| 229 | */ | ||
| 230 | debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | ||
| 231 | lock->name = name; | ||
| 232 | } | ||
| 233 | |||
| 234 | void | ||
| 235 | rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) | ||
| 236 | { | ||
| 237 | } | ||
| 238 | |||
| 239 | void rt_mutex_deadlock_account_unlock(struct task_struct *task) | ||
| 240 | { | ||
| 241 | } | ||
| 242 | |||
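
debug_rt_mutex_init() checks that a held lock is not being reinitialized and records the name later shown by printk_lock(); the empty hooks (debug_rt_mutex_lock() and friends) keep the call sites in rtmutex.c identical whether or not debugging is compiled in. A hedged sketch of how an initializer might call into it; this is not the actual kernel/rtmutex.c code:

/*
 * Hypothetical initializer: clear the owner, then let the debug code
 * validate the lock and record its name.
 */
static void example_rt_mutex_setup(struct rt_mutex *lock, const char *name)
{
	lock->owner = NULL;			/* nobody holds the lock yet   */
	debug_rt_mutex_init(lock, name);	/* validate and name the lock  */
}
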
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h new file mode 100644 index 000000000000..14193d596d78 --- /dev/null +++ b/kernel/rtmutex-debug.h | |||
| @@ -0,0 +1,33 @@ | |||
| 1 | /* | ||
| 2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
| 3 | * | ||
| 4 | * started by Ingo Molnar and Thomas Gleixner: | ||
| 5 | * | ||
| 6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
| 8 | * | ||
| 9 | * This file contains macros used solely by rtmutex.c. Debug version. | ||
| 10 | */ | ||
| 11 | |||
| 12 | extern void | ||
| 13 | rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); | ||
| 14 | extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); | ||
| 15 | extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); | ||
| 16 | extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); | ||
| 17 | extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); | ||
| 18 | extern void debug_rt_mutex_lock(struct rt_mutex *lock); | ||
| 19 | extern void debug_rt_mutex_unlock(struct rt_mutex *lock); | ||
| 20 | extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, | ||
| 21 | struct task_struct *powner); | ||
| 22 | extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); | ||
| 23 | extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, | ||
| 24 | struct rt_mutex *lock); | ||
| 25 | extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); | ||
| 26 | # define debug_rt_mutex_reset_waiter(w) \ | ||
| 27 | do { (w)->deadlock_lock = NULL; } while (0) | ||
| 28 | |||
| 29 | static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, | ||
| 30 | int detect) | ||
| 31 | { | ||
| 32 | return (waiter != NULL); | ||
| 33 | } | ||
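
This header is only pulled in when CONFIG_DEBUG_RT_MUTEXES is set; the #else branch in rtmutex-debug.c falls back to rtmutex.h, which would be expected to provide no-op counterparts so the production fast path pays nothing. A hedged sketch of what such stubs typically look like; this is illustrative, not the actual header:

/* Hypothetical non-debug counterparts: the hooks compile to nothing. */
#define debug_rt_mutex_init_waiter(w)		do { } while (0)
#define debug_rt_mutex_free_waiter(w)		do { } while (0)
#define debug_rt_mutex_init(l, n)		do { } while (0)
#define debug_rt_mutex_lock(l)			do { } while (0)
#define debug_rt_mutex_unlock(l)		do { } while (0)
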
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c new file mode 100644 index 000000000000..948bd8f643e2 --- /dev/null +++ b/kernel/rtmutex-tester.c | |||
| @@ -0,0 +1,441 @@ | |||
| 1 | /* | ||
| 2 | * RT-Mutex-tester: scriptable tester for rt mutexes | ||
| 3 | * | ||
| 4 | * started by Thomas Gleixner: | ||
| 5 | * | ||
| 6 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
| 7 | * | ||
| 8 | */ | ||
| 9 | #include <linux/config.h> | ||
| 10 | #include <linux/kthread.h> | ||
| 11 | #include <linux/module.h> | ||
| 12 | #include <linux/sched.h> | ||
| 13 | #include <linux/smp_lock.h> | ||
| 14 | #include <linux/spinlock.h> | ||
| 15 | #include <linux/sysdev.h> | ||
| 16 | #include <linux/timer.h> | ||
| 17 | |||
| 18 | #include "rtmutex.h" | ||
| 19 | |||
| 20 | #define MAX_RT_TEST_THREADS 8 | ||
| 21 | #define MAX_RT_TEST_MUTEXES 8 | ||
| 22 | |||
| 23 | static spinlock_t rttest_lock; | ||
| 24 | static atomic_t rttest_event; | ||
| 25 | |||
| 26 | struct test_thread_data { | ||
| 27 | int opcode; | ||
| 28 | int opdata; | ||
| 29 | int mutexes[MAX_RT_TEST_MUTEXES]; | ||
| 30 | int bkl; | ||
| 31 | int event; | ||
| 32 | struct sys_device sysdev; | ||
| 33 | }; | ||
| 34 | |||
| 35 | static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; | ||
| 36 | static struct task_struct *threads[MAX_RT_TEST_THREADS]; | ||
| 37 | static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; | ||
| 38 | |||
| 39 | enum test_opcodes { | ||
| 40 | RTTEST_NOP = 0, | ||
| 41 | RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ | ||
| 42 | RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ | ||
| 43 | RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ | ||
| 44 | RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ | ||
| 45 | RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ | ||
| 46 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ | ||
| 47 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ | ||
| 48 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ | ||
| 49 | RTTEST_LOCKBKL, /* 9 Lock BKL */ | ||
| 50 | RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ | ||
| 51 | RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ | ||
| 52 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ | ||
| 53 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ | ||
| 54 | }; | ||
| 55 | |||
| 56 | static int handle_op(struct test_thread_data *td, int lockwakeup) | ||
| 57 | { | ||
| 58 | int i, id, ret = -EINVAL; | ||
| 59 | |||
| 60 | switch(td->opcode) { | ||
| 61 | |||
| 62 | case RTTEST_NOP: | ||
| 63 | return 0; | ||
| 64 | |||
| 65 | case RTTEST_LOCKCONT: | ||
| 66 | td->mutexes[td->opdata] = 1; | ||
| 67 | td->event = atomic_add_return(1, &rttest_event); | ||
| 68 | return 0; | ||
| 69 | |||
| 70 | case RTTEST_RESET: | ||
| 71 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { | ||
| 72 | if (td->mutexes[i] == 4) { | ||
| 73 | rt_mutex_unlock(&mutexes[i]); | ||
| 74 | td->mutexes[i] = 0; | ||
| 75 | } | ||
| 76 | } | ||
| 77 | |||
| 78 | if (!lockwakeup && td->bkl == 4) { | ||
| 79 | unlock_kernel(); | ||
| 80 | td->bkl = 0; | ||
| 81 | } | ||
| 82 | return 0; | ||
| 83 | |||
| 84 | case RTTEST_RESETEVENT: | ||
| 85 | atomic_set(&rttest_event, 0); | ||
| 86 | return 0; | ||
| 87 | |||
| 88 | default: | ||
| 89 | if (lockwakeup) | ||
| 90 | return ret; | ||
| 91 | } | ||
| 92 | |||
| 93 | switch(td->opcode) { | ||
| 94 | |||
| 95 | case RTTEST_LOCK: | ||
| 96 | case RTTEST_LOCKNOWAIT: | ||
| 97 | id = td->opdata; | ||
| 98 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
| 99 | return ret; | ||
| 100 | |||
| 101 | td->mutexes[id] = 1; | ||
| 102 | td->event = atomic_add_return(1, &rttest_event); | ||
| 103 | rt_mutex_lock(&mutexes[id]); | ||
| 104 | td->event = atomic_add_return(1, &rttest_event); | ||
| 105 | td->mutexes[id] = 4; | ||
| 106 | return 0; | ||
| 107 | |||
| 108 | case RTTEST_LOCKINT: | ||
| 109 | case RTTEST_LOCKINTNOWAIT: | ||
| 110 | id = td->opdata; | ||
| 111 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
| 112 | return ret; | ||
| 113 | |||
| 114 | td->mutexes[id] = 1; | ||
| 115 | td->event = atomic_add_return(1, &rttest_event); | ||
| 116 | ret = rt_mutex_lock_interruptible(&mutexes[id], 0); | ||
| 117 | td->event = atomic_add_return(1, &rttest_event); | ||
| 118 | td->mutexes[id] = ret ? 0 : 4; | ||
| 119 | return ret ? -EINTR : 0; | ||
| 120 | |||
| 121 | case RTTEST_UNLOCK: | ||
| 122 | id = td->opdata; | ||
| 123 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) | ||
| 124 | return ret; | ||
| 125 | |||
| 126 | td->event = atomic_add_return(1, &rttest_event); | ||
| 127 | rt_mutex_unlock(&mutexes[id]); | ||
| 128 | td->event = atomic_add_return(1, &rttest_event); | ||
| 129 | td->mutexes[id] = 0; | ||
| 130 | return 0; | ||
| 131 | |||
| 132 | case RTTEST_LOCKBKL: | ||
| 133 | if (td->bkl) | ||
| 134 | return 0; | ||
| 135 | td->bkl = 1; | ||
| 136 | lock_kernel(); | ||
| 137 | td->bkl = 4; | ||
| 138 | return 0; | ||
| 139 | |||
| 140 | case RTTEST_UNLOCKBKL: | ||
| 141 | if (td->bkl != 4) | ||
| 142 | break; | ||
| 143 | unlock_kernel(); | ||
| 144 | td->bkl = 0; | ||
| 145 | return 0; | ||
| 146 | |||
| 147 | default: | ||
| 148 | break; | ||
| 149 | } | ||
| 150 | return ret; | ||
| 151 | } | ||
| 152 | |||
| 153 | /* | ||
| 154 | * schedule() replacement for the rtmutex slow path. Only called for threads with | ||
| 155 | * PF_MUTEX_TESTER set. | ||
| 156 | * | ||
| 157 | * This allows us to have fine-grained control over the event flow. | ||
| 158 | * | ||
| 159 | */ | ||
| 160 | void schedule_rt_mutex_test(struct rt_mutex *mutex) | ||
| 161 | { | ||
| 162 | int tid, op, dat; | ||
| 163 | struct test_thread_data *td; | ||
| 164 | |||
| 165 | /* We have to look up the task */ | ||
| 166 | for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { | ||
| 167 | if (threads[tid] == current) | ||
| 168 | break; | ||
| 169 | } | ||
| 170 | |||
| 171 | BUG_ON(tid == MAX_RT_TEST_THREADS); | ||
| 172 | |||
| 173 | td = &thread_data[tid]; | ||
| 174 | |||
| 175 | op = td->opcode; | ||
| 176 | dat = td->opdata; | ||
| 177 | |||
| 178 | switch (op) { | ||
| 179 | case RTTEST_LOCK: | ||
| 180 | case RTTEST_LOCKINT: | ||
| 181 | case RTTEST_LOCKNOWAIT: | ||
| 182 | case RTTEST_LOCKINTNOWAIT: | ||
| 183 | if (mutex != &mutexes[dat]) | ||
| 184 | break; | ||
| 185 | |||
| 186 | if (td->mutexes[dat] != 1) | ||
| 187 | break; | ||
| 188 | |||
| 189 | td->mutexes[dat] = 2; | ||
| 190 | td->event = atomic_add_return(1, &rttest_event); | ||
| 191 | break; | ||
| 192 | |||
| 193 | case RTTEST_LOCKBKL: | ||
| 194 | default: | ||
| 195 | break; | ||
| 196 | } | ||
| 197 | |||
| 198 | schedule(); | ||
| 199 | |||
| 200 | |||
| 201 | switch (op) { | ||
| 202 | case RTTEST_LOCK: | ||
| 203 | case RTTEST_LOCKINT: | ||
| 204 | if (mutex != &mutexes[dat]) | ||
| 205 | return; | ||
| 206 | |||
| 207 | if (td->mutexes[dat] != 2) | ||
| 208 | return; | ||
| 209 | |||
| 210 | td->mutexes[dat] = 3; | ||
| 211 | td->event = atomic_add_return(1, &rttest_event); | ||
| 212 | break; | ||
| 213 | |||
| 214 | case RTTEST_LOCKNOWAIT: | ||
| 215 | case RTTEST_LOCKINTNOWAIT: | ||
| 216 | if (mutex != &mutexes[dat]) | ||
| 217 | return; | ||
| 218 | |||
| 219 | if (td->mutexes[dat] != 2) | ||
| 220 | return; | ||
| 221 | |||
| 222 | td->mutexes[dat] = 1; | ||
| 223 | td->event = atomic_add_return(1, &rttest_event); | ||
| 224 | return; | ||
| 225 | |||
| 226 | case RTTEST_LOCKBKL: | ||
| 227 | return; | ||
| 228 | default: | ||
| 229 | return; | ||
| 230 | } | ||
| 231 | |||
| 232 | td->opcode = 0; | ||
| 233 | |||
| 234 | for (;;) { | ||
| 235 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 236 | |||
| 237 | if (td->opcode > 0) { | ||
| 238 | int ret; | ||
| 239 | |||
| 240 | set_current_state(TASK_RUNNING); | ||
| 241 | ret = handle_op(td, 1); | ||
| 242 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 243 | if (td->opcode == RTTEST_LOCKCONT) | ||
| 244 | break; | ||
| 245 | td->opcode = ret; | ||
| 246 | } | ||
| 247 | |||
| 248 | /* Wait for the next command to be executed */ | ||
| 249 | schedule(); | ||
| 250 | } | ||
| 251 | |||
| 252 | /* Restore previous command and data */ | ||
| 253 | td->opcode = op; | ||
| 254 | td->opdata = dat; | ||
| 255 | } | ||
| 256 | |||
| 257 | static int test_func(void *data) | ||
| 258 | { | ||
| 259 | struct test_thread_data *td = data; | ||
| 260 | int ret; | ||
| 261 | |||
| 262 | current->flags |= PF_MUTEX_TESTER; | ||
| 263 | allow_signal(SIGHUP); | ||
| 264 | |||
| 265 | for(;;) { | ||
| 266 | |||
| 267 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 268 | |||
| 269 | if (td->opcode > 0) { | ||
| 270 | set_current_state(TASK_RUNNING); | ||
| 271 | ret = handle_op(td, 0); | ||
| 272 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 273 | td->opcode = ret; | ||
| 274 | } | ||
| 275 | |||
| 276 | /* Wait for the next command to be executed */ | ||
| 277 | schedule(); | ||
| 278 | try_to_freeze(); | ||
| 279 | |||
| 280 | if (signal_pending(current)) | ||
| 281 | flush_signals(current); | ||
| 282 | |||
| 283 | if(kthread_should_stop()) | ||
| 284 | break; | ||
| 285 | } | ||
| 286 | return 0; | ||
| 287 | } | ||
| 288 | |||
| 289 | /** | ||
| 290 | * sysfs_test_command - interface for test commands | ||
| 291 | * @dev: thread reference | ||
| 292 | * @buf: command for the current step | ||
| 293 | * @count: length of buffer | ||
| 294 | * | ||
| 295 | * command syntax: | ||
| 296 | * | ||
| 297 | * opcode:data | ||
| 298 | */ | ||
| 299 | static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, | ||
| 300 | size_t count) | ||
| 301 | { | ||
| 302 | struct sched_param schedpar; | ||
| 303 | struct test_thread_data *td; | ||
| 304 | char cmdbuf[32]; | ||
| 305 | int op, dat, tid, ret; | ||
| 306 | |||
| 307 | td = container_of(dev, struct test_thread_data, sysdev); | ||
| 308 | tid = td->sysdev.id; | ||
| 309 | |||
| 310 | /* strings from a sysfs write are not 0-terminated! */ | ||
| 311 | if (count >= sizeof(cmdbuf)) | ||
| 312 | return -EINVAL; | ||
| 313 | |||
| 314 | /* strip off the trailing \n: */ | ||
| 315 | if (buf[count-1] == '\n') | ||
| 316 | count--; | ||
| 317 | if (count < 1) | ||
| 318 | return -EINVAL; | ||
| 319 | |||
| 320 | memcpy(cmdbuf, buf, count); | ||
| 321 | cmdbuf[count] = 0; | ||
| 322 | |||
| 323 | if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) | ||
| 324 | return -EINVAL; | ||
| 325 | |||
| 326 | switch (op) { | ||
| 327 | case RTTEST_SCHEDOT: | ||
| 328 | schedpar.sched_priority = 0; | ||
| 329 | ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); | ||
| 330 | if (ret) | ||
| 331 | return ret; | ||
| 332 | set_user_nice(current, 0); | ||
| 333 | break; | ||
| 334 | |||
| 335 | case RTTEST_SCHEDRT: | ||
| 336 | schedpar.sched_priority = dat; | ||
| 337 | ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); | ||
| 338 | if (ret) | ||
| 339 | return ret; | ||
| 340 | break; | ||
| 341 | |||
| 342 | case RTTEST_SIGNAL: | ||
| 343 | send_sig(SIGHUP, threads[tid], 0); | ||
| 344 | break; | ||
| 345 | |||
| 346 | default: | ||
| 347 | if (td->opcode > 0) | ||
| 348 | return -EBUSY; | ||
| 349 | td->opdata = dat; | ||
| 350 | td->opcode = op; | ||
| 351 | wake_up_process(threads[tid]); | ||
| 352 | } | ||
| 353 | |||
| 354 | return count; | ||
| 355 | } | ||
| 356 | |||
| 357 | /** | ||
| 358 | * sysfs_test_status - sysfs interface for rt tester | ||
| 359 | * @dev: thread to query | ||
| 360 | * @buf: char buffer to be filled with thread status info | ||
| 361 | */ | ||
| 362 | static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) | ||
| 363 | { | ||
| 364 | struct test_thread_data *td; | ||
| 365 | struct task_struct *tsk; | ||
| 366 | char *curr = buf; | ||
| 367 | int i; | ||
| 368 | |||
| 369 | td = container_of(dev, struct test_thread_data, sysdev); | ||
| 370 | tsk = threads[td->sysdev.id]; | ||
| 371 | |||
| 372 | spin_lock(&rttest_lock); | ||
| 373 | |||
| 374 | curr += sprintf(curr, | ||
| 375 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", | ||
| 376 | td->opcode, td->event, tsk->state, | ||
| 377 | (MAX_RT_PRIO - 1) - tsk->prio, | ||
| 378 | (MAX_RT_PRIO - 1) - tsk->normal_prio, | ||
| 379 | tsk->pi_blocked_on, td->bkl); | ||
| 380 | |||
| 381 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) | ||
| 382 | curr += sprintf(curr, "%d", td->mutexes[i]); | ||
| 383 | |||
| 384 | spin_unlock(&rttest_lock); | ||
| 385 | |||
| 386 | curr += sprintf(curr, ", T: %p, R: %p\n", tsk, | ||
| 387 | mutexes[td->sysdev.id].owner); | ||
| 388 | |||
| 389 | return curr - buf; | ||
| 390 | } | ||
| 391 | |||
| 392 | static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); | ||
| 393 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); | ||
| 394 | |||
| 395 | static struct sysdev_class rttest_sysclass = { | ||
| 396 | set_kset_name("rttest"), | ||
| 397 | }; | ||
| 398 | |||
| 399 | static int init_test_thread(int id) | ||
| 400 | { | ||
| 401 | thread_data[id].sysdev.cls = &rttest_sysclass; | ||
| 402 | thread_data[id].sysdev.id = id; | ||
| 403 | |||
| 404 | threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); | ||
| 405 | if (IS_ERR(threads[id])) | ||
| 406 | return PTR_ERR(threads[id]); | ||
| 407 | |||
| 408 | return sysdev_register(&thread_data[id].sysdev); | ||
| 409 | } | ||
| 410 | |||
| 411 | static int init_rttest(void) | ||
| 412 | { | ||
| 413 | int ret, i; | ||
| 414 | |||
| 415 | spin_lock_init(&rttest_lock); | ||
| 416 | |||
| 417 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) | ||
| 418 | rt_mutex_init(&mutexes[i]); | ||
| 419 | |||
| 420 | ret = sysdev_class_register(&rttest_sysclass); | ||
| 421 | if (ret) | ||
| 422 | return ret; | ||
| 423 | |||
| 424 | for (i = 0; i < MAX_RT_TEST_THREADS; i++) { | ||
| 425 | ret = init_test_thread(i); | ||
| 426 | if (ret) | ||
| 427 | break; | ||
| 428 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); | ||
| 429 | if (ret) | ||
| 430 | break; | ||
| 431 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); | ||
| 432 | if (ret) | ||
| 433 | break; | ||
| 434 | } | ||
| 435 | |||
| 436 | printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); | ||
| 437 | |||
| 438 | return ret; | ||
| 439 | } | ||
| 440 | |||
| 441 | device_initcall(init_rttest); | ||
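
The tester is driven entirely through the sysfs files created above. A minimal user-space sketch follows; the opcode numbers and the opcode:data syntax are taken from the enum and sysfs_test_command() in this file, while the /sys/devices/system/rttest/rttest<N>/ path and the rttest_cmd() helper are assumptions made for illustration only and should be verified on the running kernel.

    /* Hypothetical user-space driver for the rt-mutex tester (illustration only). */
    #include <stdio.h>

    /* Write "opcode:data" to the command file of tester thread 'tid'. */
    static int rttest_cmd(int tid, int op, int data)
    {
            char path[128];
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/devices/system/rttest/rttest%d/command", tid); /* path assumed */
            f = fopen(path, "w");
            if (!f)
                    return -1;
            fprintf(f, "%d:%d\n", op, data);
            fclose(f);
            return 0;
    }

    int main(void)
    {
            rttest_cmd(0, 2, 80);   /* RTTEST_SCHEDRT: thread 0 -> SCHED_FIFO, prio 80 */
            rttest_cmd(0, 3, 0);    /* RTTEST_LOCK:    thread 0 takes mutex 0 */
            rttest_cmd(1, 3, 0);    /* RTTEST_LOCK:    thread 1 blocks on mutex 0 */
            rttest_cmd(0, 8, 0);    /* RTTEST_UNLOCK:  thread 0 releases, thread 1 becomes pending owner */
            rttest_cmd(1, 99, 0);   /* RTTEST_RESET:   drop anything thread 1 still holds */
            rttest_cmd(0, 99, 0);
            return 0;
    }

Reading the matching status file returns the line produced by sysfs_test_status(), where each per-mutex digit runs from 0 (free) over 1-3 (lock requested or blocked in the tester hook) to 4 (held).
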
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 000000000000..4ab17da46fd8 --- /dev/null +++ b/kernel/rtmutex.c | |||
| @@ -0,0 +1,990 @@ | |||
| 1 | /* | ||
| 2 | * RT-Mutexes: simple blocking mutual exclusion locks with PI support | ||
| 3 | * | ||
| 4 | * started by Ingo Molnar and Thomas Gleixner. | ||
| 5 | * | ||
| 6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 7 | * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
| 8 | * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt | ||
| 9 | * Copyright (C) 2006 Esben Nielsen | ||
| 10 | * | ||
| 11 | * See Documentation/rt-mutex-design.txt for details. | ||
| 12 | */ | ||
| 13 | #include <linux/spinlock.h> | ||
| 14 | #include <linux/module.h> | ||
| 15 | #include <linux/sched.h> | ||
| 16 | #include <linux/timer.h> | ||
| 17 | |||
| 18 | #include "rtmutex_common.h" | ||
| 19 | |||
| 20 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
| 21 | # include "rtmutex-debug.h" | ||
| 22 | #else | ||
| 23 | # include "rtmutex.h" | ||
| 24 | #endif | ||
| 25 | |||
| 26 | /* | ||
| 27 | * lock->owner state tracking: | ||
| 28 | * | ||
| 29 | * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 | ||
| 30 | * are used to keep track of the "owner is pending" and "lock has | ||
| 31 | * waiters" state. | ||
| 32 | * | ||
| 33 | * owner bit1 bit0 | ||
| 34 | * NULL 0 0 lock is free (fast acquire possible) | ||
| 35 | * NULL 0 1 invalid state | ||
| 36 | * NULL 1 0 Transitional State* | ||
| 37 | * NULL 1 1 invalid state | ||
| 38 | * taskpointer 0 0 lock is held (fast release possible) | ||
| 39 | * taskpointer 0 1 task is pending owner | ||
| 40 | * taskpointer 1 0 lock is held and has waiters | ||
| 41 | * taskpointer 1 1 task is pending owner and lock has more waiters | ||
| 42 | * | ||
| 43 | * Pending ownership is assigned to the top (highest priority) | ||
| 44 | * waiter of the lock, when the lock is released. The thread is woken | ||
| 45 | * up and can now take the lock. Until the lock is taken (bit 0 | ||
| 46 | * cleared) a competing higher priority thread can steal the lock | ||
| 47 | * which puts the woken up thread back on the waiters list. | ||
| 48 | * | ||
| 49 | * The fast atomic compare exchange based acquire and release is only | ||
| 50 | * possible when bit 0 and 1 of lock->owner are 0. | ||
| 51 | * | ||
| 52 | * (*) There's a small time where the owner can be NULL and the | ||
| 53 | * "lock has waiters" bit is set. This can happen when grabbing the lock. | ||
| 54 | * To prevent a cmpxchg of the owner releasing the lock, we need to set this | ||
| 55 | * bit before looking at the lock, hence the reason this is a transitional | ||
| 56 | * state. | ||
| 57 | */ | ||
| 58 | |||
| 59 | static void | ||
| 60 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, | ||
| 61 | unsigned long mask) | ||
| 62 | { | ||
| 63 | unsigned long val = (unsigned long)owner | mask; | ||
| 64 | |||
| 65 | if (rt_mutex_has_waiters(lock)) | ||
| 66 | val |= RT_MUTEX_HAS_WAITERS; | ||
| 67 | |||
| 68 | lock->owner = (struct task_struct *)val; | ||
| 69 | } | ||
| 70 | |||
| 71 | static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) | ||
| 72 | { | ||
| 73 | lock->owner = (struct task_struct *) | ||
| 74 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
| 75 | } | ||
| 76 | |||
| 77 | static void fixup_rt_mutex_waiters(struct rt_mutex *lock) | ||
| 78 | { | ||
| 79 | if (!rt_mutex_has_waiters(lock)) | ||
| 80 | clear_rt_mutex_waiters(lock); | ||
| 81 | } | ||
| 82 | |||
| 83 | /* | ||
| 84 | * We can speed up the acquire/release, if the architecture | ||
| 85 | * supports cmpxchg and if there's no debugging state to be set up | ||
| 86 | */ | ||
| 87 | #if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) | ||
| 88 | # define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) | ||
| 89 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | ||
| 90 | { | ||
| 91 | unsigned long owner, *p = (unsigned long *) &lock->owner; | ||
| 92 | |||
| 93 | do { | ||
| 94 | owner = *p; | ||
| 95 | } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); | ||
| 96 | } | ||
| 97 | #else | ||
| 98 | # define rt_mutex_cmpxchg(l,c,n) (0) | ||
| 99 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | ||
| 100 | { | ||
| 101 | lock->owner = (struct task_struct *) | ||
| 102 | ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); | ||
| 103 | } | ||
| 104 | #endif | ||
| 105 | |||
| 106 | /* | ||
| 107 | * Calculate task priority from the waiter list priority | ||
| 108 | * | ||
| 109 | * Return task->normal_prio when the waiter list is empty or when | ||
| 110 | * the waiter is not allowed to do priority boosting | ||
| 111 | */ | ||
| 112 | int rt_mutex_getprio(struct task_struct *task) | ||
| 113 | { | ||
| 114 | if (likely(!task_has_pi_waiters(task))) | ||
| 115 | return task->normal_prio; | ||
| 116 | |||
| 117 | return min(task_top_pi_waiter(task)->pi_list_entry.prio, | ||
| 118 | task->normal_prio); | ||
| 119 | } | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Adjust the priority of a task, after its pi_waiters got modified. | ||
| 123 | * | ||
| 124 | * This can be both boosting and unboosting. task->pi_lock must be held. | ||
| 125 | */ | ||
| 126 | static void __rt_mutex_adjust_prio(struct task_struct *task) | ||
| 127 | { | ||
| 128 | int prio = rt_mutex_getprio(task); | ||
| 129 | |||
| 130 | if (task->prio != prio) | ||
| 131 | rt_mutex_setprio(task, prio); | ||
| 132 | } | ||
| 133 | |||
| 134 | /* | ||
| 135 | * Adjust task priority (undo boosting). Called from the exit path of | ||
| 136 | * rt_mutex_slowunlock() and rt_mutex_slowlock(). | ||
| 137 | * | ||
| 138 | * (Note: We do this outside of the protection of lock->wait_lock to | ||
| 139 | * allow the lock to be taken while or before we readjust the priority | ||
| 140 | * of task. We do not use the spin_xx_mutex() variants here as we are | ||
| 141 | * outside of the debug path.) | ||
| 142 | */ | ||
| 143 | static void rt_mutex_adjust_prio(struct task_struct *task) | ||
| 144 | { | ||
| 145 | unsigned long flags; | ||
| 146 | |||
| 147 | spin_lock_irqsave(&task->pi_lock, flags); | ||
| 148 | __rt_mutex_adjust_prio(task); | ||
| 149 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
| 150 | } | ||
| 151 | |||
| 152 | /* | ||
| 153 | * Max number of times we'll walk the boosting chain: | ||
| 154 | */ | ||
| 155 | int max_lock_depth = 1024; | ||
| 156 | |||
| 157 | /* | ||
| 158 | * Adjust the priority chain. Also used for deadlock detection. | ||
| 159 | * Decreases task's usage by one - may thus free the task. | ||
| 160 | * Returns 0 or -EDEADLK. | ||
| 161 | */ | ||
| 162 | static int rt_mutex_adjust_prio_chain(struct task_struct *task, | ||
| 163 | int deadlock_detect, | ||
| 164 | struct rt_mutex *orig_lock, | ||
| 165 | struct rt_mutex_waiter *orig_waiter, | ||
| 166 | struct task_struct *top_task) | ||
| 167 | { | ||
| 168 | struct rt_mutex *lock; | ||
| 169 | struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; | ||
| 170 | int detect_deadlock, ret = 0, depth = 0; | ||
| 171 | unsigned long flags; | ||
| 172 | |||
| 173 | detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, | ||
| 174 | deadlock_detect); | ||
| 175 | |||
| 176 | /* | ||
| 177 | * The (de)boosting is a step by step approach with a lot of | ||
| 178 | * pitfalls. We want this to be preemptible and we want to hold a | ||
| 179 | * maximum of two locks per step. So we have to check | ||
| 180 | * carefully whether things change under us. | ||
| 181 | */ | ||
| 182 | again: | ||
| 183 | if (++depth > max_lock_depth) { | ||
| 184 | static int prev_max; | ||
| 185 | |||
| 186 | /* | ||
| 187 | * Print this only once. If the admin changes the limit, | ||
| 188 | * print a new message when reaching the limit again. | ||
| 189 | */ | ||
| 190 | if (prev_max != max_lock_depth) { | ||
| 191 | prev_max = max_lock_depth; | ||
| 192 | printk(KERN_WARNING "Maximum lock depth %d reached " | ||
| 193 | "task: %s (%d)\n", max_lock_depth, | ||
| 194 | top_task->comm, top_task->pid); | ||
| 195 | } | ||
| 196 | put_task_struct(task); | ||
| 197 | |||
| 198 | return deadlock_detect ? -EDEADLK : 0; | ||
| 199 | } | ||
| 200 | retry: | ||
| 201 | /* | ||
| 202 | * The task can not go away, as we did a get_task_struct() before! | ||
| 203 | */ | ||
| 204 | spin_lock_irqsave(&task->pi_lock, flags); | ||
| 205 | |||
| 206 | waiter = task->pi_blocked_on; | ||
| 207 | /* | ||
| 208 | * Check whether the end of the boosting chain has been | ||
| 209 | * reached or the state of the chain has changed while we | ||
| 210 | * dropped the locks. | ||
| 211 | */ | ||
| 212 | if (!waiter || !waiter->task) | ||
| 213 | goto out_unlock_pi; | ||
| 214 | |||
| 215 | if (top_waiter && (!task_has_pi_waiters(task) || | ||
| 216 | top_waiter != task_top_pi_waiter(task))) | ||
| 217 | goto out_unlock_pi; | ||
| 218 | |||
| 219 | /* | ||
| 220 | * When deadlock detection is off, we check whether further | ||
| 221 | * priority adjustment is necessary. | ||
| 222 | */ | ||
| 223 | if (!detect_deadlock && waiter->list_entry.prio == task->prio) | ||
| 224 | goto out_unlock_pi; | ||
| 225 | |||
| 226 | lock = waiter->lock; | ||
| 227 | if (!spin_trylock(&lock->wait_lock)) { | ||
| 228 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
| 229 | cpu_relax(); | ||
| 230 | goto retry; | ||
| 231 | } | ||
| 232 | |||
| 233 | /* Deadlock detection */ | ||
| 234 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { | ||
| 235 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); | ||
| 236 | spin_unlock(&lock->wait_lock); | ||
| 237 | ret = deadlock_detect ? -EDEADLK : 0; | ||
| 238 | goto out_unlock_pi; | ||
| 239 | } | ||
| 240 | |||
| 241 | top_waiter = rt_mutex_top_waiter(lock); | ||
| 242 | |||
| 243 | /* Requeue the waiter */ | ||
| 244 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
| 245 | waiter->list_entry.prio = task->prio; | ||
| 246 | plist_add(&waiter->list_entry, &lock->wait_list); | ||
| 247 | |||
| 248 | /* Release the task */ | ||
| 249 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
| 250 | put_task_struct(task); | ||
| 251 | |||
| 252 | /* Grab the next task */ | ||
| 253 | task = rt_mutex_owner(lock); | ||
| 254 | get_task_struct(task); | ||
| 255 | spin_lock_irqsave(&task->pi_lock, flags); | ||
| 256 | |||
| 257 | if (waiter == rt_mutex_top_waiter(lock)) { | ||
| 258 | /* Boost the owner */ | ||
| 259 | plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); | ||
| 260 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | ||
| 261 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
| 262 | __rt_mutex_adjust_prio(task); | ||
| 263 | |||
| 264 | } else if (top_waiter == waiter) { | ||
| 265 | /* Deboost the owner */ | ||
| 266 | plist_del(&waiter->pi_list_entry, &task->pi_waiters); | ||
| 267 | waiter = rt_mutex_top_waiter(lock); | ||
| 268 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | ||
| 269 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
| 270 | __rt_mutex_adjust_prio(task); | ||
| 271 | } | ||
| 272 | |||
| 273 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
| 274 | |||
| 275 | top_waiter = rt_mutex_top_waiter(lock); | ||
| 276 | spin_unlock(&lock->wait_lock); | ||
| 277 | |||
| 278 | if (!detect_deadlock && waiter != top_waiter) | ||
| 279 | goto out_put_task; | ||
| 280 | |||
| 281 | goto again; | ||
| 282 | |||
| 283 | out_unlock_pi: | ||
| 284 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
| 285 | out_put_task: | ||
| 286 | put_task_struct(task); | ||
| 287 | |||
| 288 | return ret; | ||
| 289 | } | ||
| 290 | |||
| 291 | /* | ||
| 292 | * Optimization: check if we can steal the lock from the | ||
| 293 | * assigned pending owner [which might not have taken the | ||
| 294 | * lock yet]: | ||
| 295 | */ | ||
| 296 | static inline int try_to_steal_lock(struct rt_mutex *lock) | ||
| 297 | { | ||
| 298 | struct task_struct *pendowner = rt_mutex_owner(lock); | ||
| 299 | struct rt_mutex_waiter *next; | ||
| 300 | unsigned long flags; | ||
| 301 | |||
| 302 | if (!rt_mutex_owner_pending(lock)) | ||
| 303 | return 0; | ||
| 304 | |||
| 305 | if (pendowner == current) | ||
| 306 | return 1; | ||
| 307 | |||
| 308 | spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
| 309 | if (current->prio >= pendowner->prio) { | ||
| 310 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
| 311 | return 0; | ||
| 312 | } | ||
| 313 | |||
| 314 | /* | ||
| 315 | * Check if a waiter is enqueued on the pending owner's | ||
| 316 | * pi_waiters list. Remove it and readjust the pending owner's | ||
| 317 | * priority. | ||
| 318 | */ | ||
| 319 | if (likely(!rt_mutex_has_waiters(lock))) { | ||
| 320 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
| 321 | return 1; | ||
| 322 | } | ||
| 323 | |||
| 324 | /* No chain handling, pending owner is not blocked on anything: */ | ||
| 325 | next = rt_mutex_top_waiter(lock); | ||
| 326 | plist_del(&next->pi_list_entry, &pendowner->pi_waiters); | ||
| 327 | __rt_mutex_adjust_prio(pendowner); | ||
| 328 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
| 329 | |||
| 330 | /* | ||
| 331 | * We are going to steal the lock and a waiter was | ||
| 332 | * enqueued on the pending owner's pi_waiters queue. So | ||
| 333 | * we have to enqueue this waiter into | ||
| 334 | * current->pi_waiters list. This covers the case | ||
| 335 | * where current is boosted because it holds another | ||
| 336 | * lock and gets unboosted because the booster is | ||
| 337 | * interrupted, so we would otherwise delay a waiter with | ||
| 338 | * higher priority than current->normal_prio. | ||
| 339 | * | ||
| 340 | * Note: in the rare case of a SCHED_OTHER task changing | ||
| 341 | * its priority and thus stealing the lock, next->task | ||
| 342 | * might be current: | ||
| 343 | */ | ||
| 344 | if (likely(next->task != current)) { | ||
| 345 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
| 346 | plist_add(&next->pi_list_entry, ¤t->pi_waiters); | ||
| 347 | __rt_mutex_adjust_prio(current); | ||
| 348 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
| 349 | } | ||
| 350 | return 1; | ||
| 351 | } | ||
| 352 | |||
| 353 | /* | ||
| 354 | * Try to take an rt-mutex | ||
| 355 | * | ||
| 356 | * This fails | ||
| 357 | * - when the lock has a real owner | ||
| 358 | * - when a different pending owner exists and has higher priority than current | ||
| 359 | * | ||
| 360 | * Must be called with lock->wait_lock held. | ||
| 361 | */ | ||
| 362 | static int try_to_take_rt_mutex(struct rt_mutex *lock) | ||
| 363 | { | ||
| 364 | /* | ||
| 365 | * We have to be careful here if the atomic speedups are | ||
| 366 | * enabled: when | ||
| 367 | * - no other waiter is on the lock, and | ||
| 368 | * - the lock has been released since we did the cmpxchg, | ||
| 369 | * the lock can be released or taken while we are doing the | ||
| 370 | * checks and marking the lock with RT_MUTEX_HAS_WAITERS. | ||
| 371 | * | ||
| 372 | * The atomic acquire/release aware variant of | ||
| 373 | * mark_rt_mutex_waiters uses a cmpxchg loop. After setting | ||
| 374 | * the WAITERS bit, the atomic release / acquire can not | ||
| 375 | * happen anymore and lock->wait_lock protects us from the | ||
| 376 | * non-atomic case. | ||
| 377 | * | ||
| 378 | * Note, that this might set lock->owner = | ||
| 379 | * RT_MUTEX_HAS_WAITERS in the case the lock is not contended | ||
| 380 | * any more. This is fixed up when we take the ownership. | ||
| 381 | * This is the transitional state explained at the top of this file. | ||
| 382 | */ | ||
| 383 | mark_rt_mutex_waiters(lock); | ||
| 384 | |||
| 385 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) | ||
| 386 | return 0; | ||
| 387 | |||
| 388 | /* We got the lock. */ | ||
| 389 | debug_rt_mutex_lock(lock); | ||
| 390 | |||
| 391 | rt_mutex_set_owner(lock, current, 0); | ||
| 392 | |||
| 393 | rt_mutex_deadlock_account_lock(lock, current); | ||
| 394 | |||
| 395 | return 1; | ||
| 396 | } | ||
| 397 | |||
| 398 | /* | ||
| 399 | * Task blocks on lock. | ||
| 400 | * | ||
| 401 | * Prepare waiter and propagate pi chain | ||
| 402 | * | ||
| 403 | * This must be called with lock->wait_lock held. | ||
| 404 | */ | ||
| 405 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | ||
| 406 | struct rt_mutex_waiter *waiter, | ||
| 407 | int detect_deadlock) | ||
| 408 | { | ||
| 409 | struct task_struct *owner = rt_mutex_owner(lock); | ||
| 410 | struct rt_mutex_waiter *top_waiter = waiter; | ||
| 411 | unsigned long flags; | ||
| 412 | int chain_walk = 0, res; | ||
| 413 | |||
| 414 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
| 415 | __rt_mutex_adjust_prio(current); | ||
| 416 | waiter->task = current; | ||
| 417 | waiter->lock = lock; | ||
| 418 | plist_node_init(&waiter->list_entry, current->prio); | ||
| 419 | plist_node_init(&waiter->pi_list_entry, current->prio); | ||
| 420 | |||
| 421 | /* Get the top priority waiter on the lock */ | ||
| 422 | if (rt_mutex_has_waiters(lock)) | ||
| 423 | top_waiter = rt_mutex_top_waiter(lock); | ||
| 424 | plist_add(&waiter->list_entry, &lock->wait_list); | ||
| 425 | |||
| 426 | current->pi_blocked_on = waiter; | ||
| 427 | |||
| 428 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
| 429 | |||
| 430 | if (waiter == rt_mutex_top_waiter(lock)) { | ||
| 431 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
| 432 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | ||
| 433 | plist_add(&waiter->pi_list_entry, &owner->pi_waiters); | ||
| 434 | |||
| 435 | __rt_mutex_adjust_prio(owner); | ||
| 436 | if (owner->pi_blocked_on) | ||
| 437 | chain_walk = 1; | ||
| 438 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
| 439 | } | ||
| 440 | else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) | ||
| 441 | chain_walk = 1; | ||
| 442 | |||
| 443 | if (!chain_walk) | ||
| 444 | return 0; | ||
| 445 | |||
| 446 | /* | ||
| 447 | * The owner can't disappear while holding a lock, | ||
| 448 | * so the owner struct is protected by wait_lock. | ||
| 449 | * Gets dropped in rt_mutex_adjust_prio_chain()! | ||
| 450 | */ | ||
| 451 | get_task_struct(owner); | ||
| 452 | |||
| 453 | spin_unlock(&lock->wait_lock); | ||
| 454 | |||
| 455 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, | ||
| 456 | current); | ||
| 457 | |||
| 458 | spin_lock(&lock->wait_lock); | ||
| 459 | |||
| 460 | return res; | ||
| 461 | } | ||
| 462 | |||
| 463 | /* | ||
| 464 | * Wake up the next waiter on the lock. | ||
| 465 | * | ||
| 466 | * Remove the top waiter from the current task's waiter list and from | ||
| 467 | * the lock's waiter list. Set it as pending owner. Then wake it up. | ||
| 468 | * | ||
| 469 | * Called with lock->wait_lock held. | ||
| 470 | */ | ||
| 471 | static void wakeup_next_waiter(struct rt_mutex *lock) | ||
| 472 | { | ||
| 473 | struct rt_mutex_waiter *waiter; | ||
| 474 | struct task_struct *pendowner; | ||
| 475 | unsigned long flags; | ||
| 476 | |||
| 477 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
| 478 | |||
| 479 | waiter = rt_mutex_top_waiter(lock); | ||
| 480 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
| 481 | |||
| 482 | /* | ||
| 483 | * Remove it from current->pi_waiters. We do not adjust a | ||
| 484 | * possible priority boost right now. We execute wakeup in the | ||
| 485 | * boosted mode and go back to normal after releasing | ||
| 486 | * lock->wait_lock. | ||
| 487 | */ | ||
| 488 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); | ||
| 489 | pendowner = waiter->task; | ||
| 490 | waiter->task = NULL; | ||
| 491 | |||
| 492 | rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); | ||
| 493 | |||
| 494 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
| 495 | |||
| 496 | /* | ||
| 497 | * Clear the pi_blocked_on variable and enqueue a possible | ||
| 498 | * waiter into the pi_waiters list of the pending owner. This | ||
| 499 | * prevents a situation where, if the pending owner gets unboosted, a | ||
| 500 | * waiter with higher priority than pending-owner->normal_prio | ||
| 501 | * stays blocked on the unboosted (pending) owner. | ||
| 502 | */ | ||
| 503 | spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
| 504 | |||
| 505 | WARN_ON(!pendowner->pi_blocked_on); | ||
| 506 | WARN_ON(pendowner->pi_blocked_on != waiter); | ||
| 507 | WARN_ON(pendowner->pi_blocked_on->lock != lock); | ||
| 508 | |||
| 509 | pendowner->pi_blocked_on = NULL; | ||
| 510 | |||
| 511 | if (rt_mutex_has_waiters(lock)) { | ||
| 512 | struct rt_mutex_waiter *next; | ||
| 513 | |||
| 514 | next = rt_mutex_top_waiter(lock); | ||
| 515 | plist_add(&next->pi_list_entry, &pendowner->pi_waiters); | ||
| 516 | } | ||
| 517 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
| 518 | |||
| 519 | wake_up_process(pendowner); | ||
| 520 | } | ||
| 521 | |||
| 522 | /* | ||
| 523 | * Remove a waiter from a lock | ||
| 524 | * | ||
| 525 | * Must be called with lock->wait_lock held | ||
| 526 | */ | ||
| 527 | static void remove_waiter(struct rt_mutex *lock, | ||
| 528 | struct rt_mutex_waiter *waiter) | ||
| 529 | { | ||
| 530 | int first = (waiter == rt_mutex_top_waiter(lock)); | ||
| 531 | struct task_struct *owner = rt_mutex_owner(lock); | ||
| 532 | unsigned long flags; | ||
| 533 | int chain_walk = 0; | ||
| 534 | |||
| 535 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
| 536 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
| 537 | waiter->task = NULL; | ||
| 538 | current->pi_blocked_on = NULL; | ||
| 539 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
| 540 | |||
| 541 | if (first && owner != current) { | ||
| 542 | |||
| 543 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
| 544 | |||
| 545 | plist_del(&waiter->pi_list_entry, &owner->pi_waiters); | ||
| 546 | |||
| 547 | if (rt_mutex_has_waiters(lock)) { | ||
| 548 | struct rt_mutex_waiter *next; | ||
| 549 | |||
| 550 | next = rt_mutex_top_waiter(lock); | ||
| 551 | plist_add(&next->pi_list_entry, &owner->pi_waiters); | ||
| 552 | } | ||
| 553 | __rt_mutex_adjust_prio(owner); | ||
| 554 | |||
| 555 | if (owner->pi_blocked_on) | ||
| 556 | chain_walk = 1; | ||
| 557 | |||
| 558 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
| 559 | } | ||
| 560 | |||
| 561 | WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
| 562 | |||
| 563 | if (!chain_walk) | ||
| 564 | return; | ||
| 565 | |||
| 566 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
| 567 | get_task_struct(owner); | ||
| 568 | |||
| 569 | spin_unlock(&lock->wait_lock); | ||
| 570 | |||
| 571 | rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); | ||
| 572 | |||
| 573 | spin_lock(&lock->wait_lock); | ||
| 574 | } | ||
| 575 | |||
| 576 | /* | ||
| 577 | * Recheck the pi chain, in case we got a priority setting | ||
| 578 | * | ||
| 579 | * Called from sched_setscheduler | ||
| 580 | */ | ||
| 581 | void rt_mutex_adjust_pi(struct task_struct *task) | ||
| 582 | { | ||
| 583 | struct rt_mutex_waiter *waiter; | ||
| 584 | unsigned long flags; | ||
| 585 | |||
| 586 | spin_lock_irqsave(&task->pi_lock, flags); | ||
| 587 | |||
| 588 | waiter = task->pi_blocked_on; | ||
| 589 | if (!waiter || waiter->list_entry.prio == task->prio) { | ||
| 590 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
| 591 | return; | ||
| 592 | } | ||
| 593 | |||
| 594 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
| 595 | |||
| 596 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
| 597 | get_task_struct(task); | ||
| 598 | rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); | ||
| 599 | } | ||
| 600 | |||
| 601 | /* | ||
| 602 | * Slow path lock function: | ||
| 603 | */ | ||
| 604 | static int __sched | ||
| 605 | rt_mutex_slowlock(struct rt_mutex *lock, int state, | ||
| 606 | struct hrtimer_sleeper *timeout, | ||
| 607 | int detect_deadlock) | ||
| 608 | { | ||
| 609 | struct rt_mutex_waiter waiter; | ||
| 610 | int ret = 0; | ||
| 611 | |||
| 612 | debug_rt_mutex_init_waiter(&waiter); | ||
| 613 | waiter.task = NULL; | ||
| 614 | |||
| 615 | spin_lock(&lock->wait_lock); | ||
| 616 | |||
| 617 | /* Try to acquire the lock again: */ | ||
| 618 | if (try_to_take_rt_mutex(lock)) { | ||
| 619 | spin_unlock(&lock->wait_lock); | ||
| 620 | return 0; | ||
| 621 | } | ||
| 622 | |||
| 623 | set_current_state(state); | ||
| 624 | |||
| 625 | /* Set up the timer when timeout != NULL */ | ||
| 626 | if (unlikely(timeout)) | ||
| 627 | hrtimer_start(&timeout->timer, timeout->timer.expires, | ||
| 628 | HRTIMER_ABS); | ||
| 629 | |||
| 630 | for (;;) { | ||
| 631 | /* Try to acquire the lock: */ | ||
| 632 | if (try_to_take_rt_mutex(lock)) | ||
| 633 | break; | ||
| 634 | |||
| 635 | /* | ||
| 636 | * TASK_INTERRUPTIBLE checks for signals and | ||
| 637 | * timeout. Ignored otherwise. | ||
| 638 | */ | ||
| 639 | if (unlikely(state == TASK_INTERRUPTIBLE)) { | ||
| 640 | /* Signal pending? */ | ||
| 641 | if (signal_pending(current)) | ||
| 642 | ret = -EINTR; | ||
| 643 | if (timeout && !timeout->task) | ||
| 644 | ret = -ETIMEDOUT; | ||
| 645 | if (ret) | ||
| 646 | break; | ||
| 647 | } | ||
| 648 | |||
| 649 | /* | ||
| 650 | * waiter.task is NULL the first time we come here and | ||
| 651 | * when we have been woken up by the previous owner | ||
| 652 | * but the lock got stolen by a higher prio task. | ||
| 653 | */ | ||
| 654 | if (!waiter.task) { | ||
| 655 | ret = task_blocks_on_rt_mutex(lock, &waiter, | ||
| 656 | detect_deadlock); | ||
| 657 | /* | ||
| 658 | * If we got woken up by the owner, then start the loop | ||
| 659 | * all over without going into schedule() to try | ||
| 660 | * to take the lock now: | ||
| 661 | */ | ||
| 662 | if (unlikely(!waiter.task)) | ||
| 663 | continue; | ||
| 664 | |||
| 665 | if (unlikely(ret)) | ||
| 666 | break; | ||
| 667 | } | ||
| 668 | |||
| 669 | spin_unlock(&lock->wait_lock); | ||
| 670 | |||
| 671 | debug_rt_mutex_print_deadlock(&waiter); | ||
| 672 | |||
| 673 | if (waiter.task) | ||
| 674 | schedule_rt_mutex(lock); | ||
| 675 | |||
| 676 | spin_lock(&lock->wait_lock); | ||
| 677 | set_current_state(state); | ||
| 678 | } | ||
| 679 | |||
| 680 | set_current_state(TASK_RUNNING); | ||
| 681 | |||
| 682 | if (unlikely(waiter.task)) | ||
| 683 | remove_waiter(lock, &waiter); | ||
| 684 | |||
| 685 | /* | ||
| 686 | * try_to_take_rt_mutex() sets the waiter bit | ||
| 687 | * unconditionally. We might have to fix that up. | ||
| 688 | */ | ||
| 689 | fixup_rt_mutex_waiters(lock); | ||
| 690 | |||
| 691 | spin_unlock(&lock->wait_lock); | ||
| 692 | |||
| 693 | /* Remove pending timer: */ | ||
| 694 | if (unlikely(timeout)) | ||
| 695 | hrtimer_cancel(&timeout->timer); | ||
| 696 | |||
| 697 | /* | ||
| 698 | * Readjust priority, when we did not get the lock. We might | ||
| 699 | * have been the pending owner and boosted. Since we did not | ||
| 700 | * take the lock, the PI boost has to go. | ||
| 701 | */ | ||
| 702 | if (unlikely(ret)) | ||
| 703 | rt_mutex_adjust_prio(current); | ||
| 704 | |||
| 705 | debug_rt_mutex_free_waiter(&waiter); | ||
| 706 | |||
| 707 | return ret; | ||
| 708 | } | ||
| 709 | |||
| 710 | /* | ||
| 711 | * Slow path try-lock function: | ||
| 712 | */ | ||
| 713 | static inline int | ||
| 714 | rt_mutex_slowtrylock(struct rt_mutex *lock) | ||
| 715 | { | ||
| 716 | int ret = 0; | ||
| 717 | |||
| 718 | spin_lock(&lock->wait_lock); | ||
| 719 | |||
| 720 | if (likely(rt_mutex_owner(lock) != current)) { | ||
| 721 | |||
| 722 | ret = try_to_take_rt_mutex(lock); | ||
| 723 | /* | ||
| 724 | * try_to_take_rt_mutex() sets the lock waiters | ||
| 725 | * bit unconditionally. Clean this up. | ||
| 726 | */ | ||
| 727 | fixup_rt_mutex_waiters(lock); | ||
| 728 | } | ||
| 729 | |||
| 730 | spin_unlock(&lock->wait_lock); | ||
| 731 | |||
| 732 | return ret; | ||
| 733 | } | ||
| 734 | |||
| 735 | /* | ||
| 736 | * Slow path to release a rt-mutex: | ||
| 737 | */ | ||
| 738 | static void __sched | ||
| 739 | rt_mutex_slowunlock(struct rt_mutex *lock) | ||
| 740 | { | ||
| 741 | spin_lock(&lock->wait_lock); | ||
| 742 | |||
| 743 | debug_rt_mutex_unlock(lock); | ||
| 744 | |||
| 745 | rt_mutex_deadlock_account_unlock(current); | ||
| 746 | |||
| 747 | if (!rt_mutex_has_waiters(lock)) { | ||
| 748 | lock->owner = NULL; | ||
| 749 | spin_unlock(&lock->wait_lock); | ||
| 750 | return; | ||
| 751 | } | ||
| 752 | |||
| 753 | wakeup_next_waiter(lock); | ||
| 754 | |||
| 755 | spin_unlock(&lock->wait_lock); | ||
| 756 | |||
| 757 | /* Undo pi boosting if necessary: */ | ||
| 758 | rt_mutex_adjust_prio(current); | ||
| 759 | } | ||
| 760 | |||
| 761 | /* | ||
| 762 | * Debug-aware fast / slow path variants of lock, trylock and unlock. | ||
| 763 | * | ||
| 764 | * The atomic acquire/release ops are compiled away when either the | ||
| 765 | * architecture does not support cmpxchg or debugging is enabled. | ||
| 766 | */ | ||
| 767 | static inline int | ||
| 768 | rt_mutex_fastlock(struct rt_mutex *lock, int state, | ||
| 769 | int detect_deadlock, | ||
| 770 | int (*slowfn)(struct rt_mutex *lock, int state, | ||
| 771 | struct hrtimer_sleeper *timeout, | ||
| 772 | int detect_deadlock)) | ||
| 773 | { | ||
| 774 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
| 775 | rt_mutex_deadlock_account_lock(lock, current); | ||
| 776 | return 0; | ||
| 777 | } else | ||
| 778 | return slowfn(lock, state, NULL, detect_deadlock); | ||
| 779 | } | ||
| 780 | |||
| 781 | static inline int | ||
| 782 | rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, | ||
| 783 | struct hrtimer_sleeper *timeout, int detect_deadlock, | ||
| 784 | int (*slowfn)(struct rt_mutex *lock, int state, | ||
| 785 | struct hrtimer_sleeper *timeout, | ||
| 786 | int detect_deadlock)) | ||
| 787 | { | ||
| 788 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
| 789 | rt_mutex_deadlock_account_lock(lock, current); | ||
| 790 | return 0; | ||
| 791 | } else | ||
| 792 | return slowfn(lock, state, timeout, detect_deadlock); | ||
| 793 | } | ||
| 794 | |||
| 795 | static inline int | ||
| 796 | rt_mutex_fasttrylock(struct rt_mutex *lock, | ||
| 797 | int (*slowfn)(struct rt_mutex *lock)) | ||
| 798 | { | ||
| 799 | if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
| 800 | rt_mutex_deadlock_account_lock(lock, current); | ||
| 801 | return 1; | ||
| 802 | } | ||
| 803 | return slowfn(lock); | ||
| 804 | } | ||
| 805 | |||
| 806 | static inline void | ||
| 807 | rt_mutex_fastunlock(struct rt_mutex *lock, | ||
| 808 | void (*slowfn)(struct rt_mutex *lock)) | ||
| 809 | { | ||
| 810 | if (likely(rt_mutex_cmpxchg(lock, current, NULL))) | ||
| 811 | rt_mutex_deadlock_account_unlock(current); | ||
| 812 | else | ||
| 813 | slowfn(lock); | ||
| 814 | } | ||
| 815 | |||
| 816 | /** | ||
| 817 | * rt_mutex_lock - lock a rt_mutex | ||
| 818 | * | ||
| 819 | * @lock: the rt_mutex to be locked | ||
| 820 | */ | ||
| 821 | void __sched rt_mutex_lock(struct rt_mutex *lock) | ||
| 822 | { | ||
| 823 | might_sleep(); | ||
| 824 | |||
| 825 | rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); | ||
| 826 | } | ||
| 827 | EXPORT_SYMBOL_GPL(rt_mutex_lock); | ||
| 828 | |||
| 829 | /** | ||
| 830 | * rt_mutex_lock_interruptible - lock a rt_mutex interruptible | ||
| 831 | * | ||
| 832 | * @lock: the rt_mutex to be locked | ||
| 833 | * @detect_deadlock: deadlock detection on/off | ||
| 834 | * | ||
| 835 | * Returns: | ||
| 836 | * 0 on success | ||
| 837 | * -EINTR when interrupted by a signal | ||
| 838 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
| 839 | */ | ||
| 840 | int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, | ||
| 841 | int detect_deadlock) | ||
| 842 | { | ||
| 843 | might_sleep(); | ||
| 844 | |||
| 845 | return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, | ||
| 846 | detect_deadlock, rt_mutex_slowlock); | ||
| 847 | } | ||
| 848 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); | ||
| 849 | |||
| 850 | /** | ||
| 851 | * rt_mutex_timed_lock - lock a rt_mutex interruptible; | ||
| 852 | * the timeout structure is provided | ||
| 853 | * by the caller | ||
| 854 | * | ||
| 855 | * @lock: the rt_mutex to be locked | ||
| 856 | * @timeout: timeout structure or NULL (no timeout) | ||
| 857 | * @detect_deadlock: deadlock detection on/off | ||
| 858 | * | ||
| 859 | * Returns: | ||
| 860 | * 0 on success | ||
| 861 | * -EINTR when interrupted by a signal | ||
| 862 | * -ETIMEDOUT when the timeout expired | ||
| 863 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
| 864 | */ | ||
| 865 | int | ||
| 866 | rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, | ||
| 867 | int detect_deadlock) | ||
| 868 | { | ||
| 869 | might_sleep(); | ||
| 870 | |||
| 871 | return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | ||
| 872 | detect_deadlock, rt_mutex_slowlock); | ||
| 873 | } | ||
| 874 | EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | ||
| 875 | |||
| 876 | /** | ||
| 877 | * rt_mutex_trylock - try to lock a rt_mutex | ||
| 878 | * | ||
| 879 | * @lock: the rt_mutex to be locked | ||
| 880 | * | ||
| 881 | * Returns 1 on success and 0 on contention | ||
| 882 | */ | ||
| 883 | int __sched rt_mutex_trylock(struct rt_mutex *lock) | ||
| 884 | { | ||
| 885 | return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); | ||
| 886 | } | ||
| 887 | EXPORT_SYMBOL_GPL(rt_mutex_trylock); | ||
| 888 | |||
| 889 | /** | ||
| 890 | * rt_mutex_unlock - unlock a rt_mutex | ||
| 891 | * | ||
| 892 | * @lock: the rt_mutex to be unlocked | ||
| 893 | */ | ||
| 894 | void __sched rt_mutex_unlock(struct rt_mutex *lock) | ||
| 895 | { | ||
| 896 | rt_mutex_fastunlock(lock, rt_mutex_slowunlock); | ||
| 897 | } | ||
| 898 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); | ||
| 899 | |||
| 900 | /** | ||
| 901 | * rt_mutex_destroy - mark a mutex unusable | ||
| 902 | * @lock: the mutex to be destroyed | ||
| 903 | * | ||
| 904 | * This function marks the mutex uninitialized, and any subsequent | ||
| 905 | * use of the mutex is forbidden. The mutex must not be locked when | ||
| 906 | * this function is called. | ||
| 907 | */ | ||
| 908 | void rt_mutex_destroy(struct rt_mutex *lock) | ||
| 909 | { | ||
| 910 | WARN_ON(rt_mutex_is_locked(lock)); | ||
| 911 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
| 912 | lock->magic = NULL; | ||
| 913 | #endif | ||
| 914 | } | ||
| 915 | |||
| 916 | EXPORT_SYMBOL_GPL(rt_mutex_destroy); | ||
| 917 | |||
| 918 | /** | ||
| 919 | * __rt_mutex_init - initialize the rt lock | ||
| 920 | * | ||
| 921 | * @lock: the rt lock to be initialized | ||
| 922 | * | ||
| 923 | * Initialize the rt lock to unlocked state. | ||
| 924 | * | ||
| 925 | * Initializing a locked rt lock is not allowed. | ||
| 926 | */ | ||
| 927 | void __rt_mutex_init(struct rt_mutex *lock, const char *name) | ||
| 928 | { | ||
| 929 | lock->owner = NULL; | ||
| 930 | spin_lock_init(&lock->wait_lock); | ||
| 931 | plist_head_init(&lock->wait_list, &lock->wait_lock); | ||
| 932 | |||
| 933 | debug_rt_mutex_init(lock, name); | ||
| 934 | } | ||
| 935 | EXPORT_SYMBOL_GPL(__rt_mutex_init); | ||
| 936 | |||
| 937 | /** | ||
| 938 | * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a | ||
| 939 | * proxy owner | ||
| 940 | * | ||
| 941 | * @lock: the rt_mutex to be locked | ||
| 942 | * @proxy_owner: the task to set as owner | ||
| 943 | * | ||
| 944 | * No locking. Caller has to do serializing itself | ||
| 945 | * Special API call for PI-futex support | ||
| 946 | */ | ||
| 947 | void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | ||
| 948 | struct task_struct *proxy_owner) | ||
| 949 | { | ||
| 950 | __rt_mutex_init(lock, NULL); | ||
| 951 | debug_rt_mutex_proxy_lock(lock, proxy_owner); | ||
| 952 | rt_mutex_set_owner(lock, proxy_owner, 0); | ||
| 953 | rt_mutex_deadlock_account_lock(lock, proxy_owner); | ||
| 954 | } | ||
| 955 | |||
| 956 | /** | ||
| 957 | * rt_mutex_proxy_unlock - release a lock on behalf of owner | ||
| 958 | * | ||
| 959 | * @lock: the rt_mutex to be unlocked | ||
| 960 | * | ||
| 961 | * No locking. Caller has to do serializing itself | ||
| 962 | * Special API call for PI-futex support | ||
| 963 | */ | ||
| 964 | void rt_mutex_proxy_unlock(struct rt_mutex *lock, | ||
| 965 | struct task_struct *proxy_owner) | ||
| 966 | { | ||
| 967 | debug_rt_mutex_proxy_unlock(lock); | ||
| 968 | rt_mutex_set_owner(lock, NULL, 0); | ||
| 969 | rt_mutex_deadlock_account_unlock(proxy_owner); | ||
| 970 | } | ||
| 971 | |||
| 972 | /** | ||
| 973 | * rt_mutex_next_owner - return the next owner of the lock | ||
| 974 | * | ||
| 975 | * @lock: the rt lock to query | ||
| 976 | * | ||
| 977 | * Returns the next owner of the lock or NULL | ||
| 978 | * | ||
| 979 | * Caller has to serialize against other accessors to the lock | ||
| 980 | * itself. | ||
| 981 | * | ||
| 982 | * Special API call for PI-futex support | ||
| 983 | */ | ||
| 984 | struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) | ||
| 985 | { | ||
| 986 | if (!rt_mutex_has_waiters(lock)) | ||
| 987 | return NULL; | ||
| 988 | |||
| 989 | return rt_mutex_top_waiter(lock)->task; | ||
| 990 | } | ||
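
For readers coming to this API fresh, a minimal in-kernel usage sketch of the exported entry points follows. rt_mutex_init() and the struct rt_mutex definition are assumed to come from <linux/rtmutex.h>, which is not part of this hunk; example_init_and_use() is purely illustrative.

    /* Usage sketch only -- not part of the patch. */
    #include <linux/rtmutex.h>

    static struct rt_mutex example_lock;

    static int example_init_and_use(void)
    {
            int ret;

            rt_mutex_init(&example_lock);           /* unlocked, owner == NULL: fast path possible */

            rt_mutex_lock(&example_lock);           /* may sleep; boosts the owner while we wait */
            /* ... critical section ... */
            rt_mutex_unlock(&example_lock);         /* undoes any PI boost we picked up */

            if (rt_mutex_trylock(&example_lock)) {  /* 1 on success, 0 on contention */
                    rt_mutex_unlock(&example_lock);
                    return 0;
            }

            ret = rt_mutex_lock_interruptible(&example_lock, 0);    /* 0, or -EINTR on a signal */
            if (!ret)
                    rt_mutex_unlock(&example_lock);
            return ret;
    }
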
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h new file mode 100644 index 000000000000..a1a1dd06421d --- /dev/null +++ b/kernel/rtmutex.h | |||
| @@ -0,0 +1,26 @@ | |||
| 1 | /* | ||
| 2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
| 3 | * | ||
| 4 | * started by Ingo Molnar and Thomas Gleixner: | ||
| 5 | * | ||
| 6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
| 8 | * | ||
| 9 | * This file contains macros used solely by rtmutex.c. | ||
| 10 | * Non-debug version. | ||
| 11 | */ | ||
| 12 | |||
| 13 | #define rt_mutex_deadlock_check(l) (0) | ||
| 14 | #define rt_mutex_deadlock_account_lock(m, t) do { } while (0) | ||
| 15 | #define rt_mutex_deadlock_account_unlock(l) do { } while (0) | ||
| 16 | #define debug_rt_mutex_init_waiter(w) do { } while (0) | ||
| 17 | #define debug_rt_mutex_free_waiter(w) do { } while (0) | ||
| 18 | #define debug_rt_mutex_lock(l) do { } while (0) | ||
| 19 | #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) | ||
| 20 | #define debug_rt_mutex_proxy_unlock(l) do { } while (0) | ||
| 21 | #define debug_rt_mutex_unlock(l) do { } while (0) | ||
| 22 | #define debug_rt_mutex_init(m, n) do { } while (0) | ||
| 23 | #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) | ||
| 24 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) | ||
| 25 | #define debug_rt_mutex_detect_deadlock(w,d) (d) | ||
| 26 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) | ||
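
The one macro here that changes behaviour rather than merely stubbing out debug bookkeeping is debug_rt_mutex_detect_deadlock(): the non-debug version simply forwards the caller's flag, while the CONFIG_DEBUG_RT_MUTEXES inline shown earlier returns (waiter != NULL) and so forces the full chain walk whenever a waiter exists. A reduced sketch of the two policies, with invented function names:

    /* Illustration only: the two policies behind debug_rt_mutex_detect_deadlock(). */
    struct rt_mutex_waiter;

    static inline int detect_deadlock_nondebug(struct rt_mutex_waiter *waiter, int detect)
    {
            return detect;          /* production build: honour the caller's request only */
    }

    static inline int detect_deadlock_debug(struct rt_mutex_waiter *waiter, int detect)
    {
            return waiter != NULL;  /* debug build: always walk the whole chain once blocked */
    }
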
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h new file mode 100644 index 000000000000..9c75856e791e --- /dev/null +++ b/kernel/rtmutex_common.h | |||
| @@ -0,0 +1,123 @@ | |||
| 1 | /* | ||
| 2 | * RT Mutexes: blocking mutual exclusion locks with PI support | ||
| 3 | * | ||
| 4 | * started by Ingo Molnar and Thomas Gleixner: | ||
| 5 | * | ||
| 6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
| 8 | * | ||
| 9 | * This file contains the private data structure and API definitions. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #ifndef __KERNEL_RTMUTEX_COMMON_H | ||
| 13 | #define __KERNEL_RTMUTEX_COMMON_H | ||
| 14 | |||
| 15 | #include <linux/rtmutex.h> | ||
| 16 | |||
| 17 | /* | ||
| 18 | * The rtmutex in-kernel tester is independent of rtmutex debugging. We | ||
| 19 | * call schedule_rt_mutex_test() instead of schedule() for the tasks which | ||
| 20 | * belong to the tester. That way we can delay the wakeup path of those | ||
| 21 | * threads to provoke lock stealing and testing of complex boosting scenarios. | ||
| 22 | */ | ||
| 23 | #ifdef CONFIG_RT_MUTEX_TESTER | ||
| 24 | |||
| 25 | extern void schedule_rt_mutex_test(struct rt_mutex *lock); | ||
| 26 | |||
| 27 | #define schedule_rt_mutex(_lock) \ | ||
| 28 | do { \ | ||
| 29 | if (!(current->flags & PF_MUTEX_TESTER)) \ | ||
| 30 | schedule(); \ | ||
| 31 | else \ | ||
| 32 | schedule_rt_mutex_test(_lock); \ | ||
| 33 | } while (0) | ||
| 34 | |||
| 35 | #else | ||
| 36 | # define schedule_rt_mutex(_lock) schedule() | ||
| 37 | #endif | ||
| 38 | |||
| 39 | /* | ||
| 40 | * This is the control structure for tasks blocked on a rt_mutex, | ||
| 41 | * which is allocated on the kernel stack of the blocked task. | ||
| 42 | * | ||
| 43 | * @list_entry: pi node to enqueue into the mutex waiters list | ||
| 44 | * @pi_list_entry: pi node to enqueue into the mutex owner waiters list | ||
| 45 | * @task: task reference to the blocked task | ||
| 46 | */ | ||
| 47 | struct rt_mutex_waiter { | ||
| 48 | struct plist_node list_entry; | ||
| 49 | struct plist_node pi_list_entry; | ||
| 50 | struct task_struct *task; | ||
| 51 | struct rt_mutex *lock; | ||
| 52 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
| 53 | unsigned long ip; | ||
| 54 | pid_t deadlock_task_pid; | ||
| 55 | struct rt_mutex *deadlock_lock; | ||
| 56 | #endif | ||
| 57 | }; | ||
| 58 | |||
| 59 | /* | ||
| 60 | * Various helpers to access the waiters-plist: | ||
| 61 | */ | ||
| 62 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) | ||
| 63 | { | ||
| 64 | return !plist_head_empty(&lock->wait_list); | ||
| 65 | } | ||
| 66 | |||
| 67 | static inline struct rt_mutex_waiter * | ||
| 68 | rt_mutex_top_waiter(struct rt_mutex *lock) | ||
| 69 | { | ||
| 70 | struct rt_mutex_waiter *w; | ||
| 71 | |||
| 72 | w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, | ||
| 73 | list_entry); | ||
| 74 | BUG_ON(w->lock != lock); | ||
| 75 | |||
| 76 | return w; | ||
| 77 | } | ||
| 78 | |||
| 79 | static inline int task_has_pi_waiters(struct task_struct *p) | ||
| 80 | { | ||
| 81 | return !plist_head_empty(&p->pi_waiters); | ||
| 82 | } | ||
| 83 | |||
| 84 | static inline struct rt_mutex_waiter * | ||
| 85 | task_top_pi_waiter(struct task_struct *p) | ||
| 86 | { | ||
| 87 | return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, | ||
| 88 | pi_list_entry); | ||
| 89 | } | ||
| 90 | |||
| 91 | /* | ||
| 92 | * lock->owner state tracking: | ||
| 93 | */ | ||
| 94 | #define RT_MUTEX_OWNER_PENDING 1UL | ||
| 95 | #define RT_MUTEX_HAS_WAITERS 2UL | ||
| 96 | #define RT_MUTEX_OWNER_MASKALL 3UL | ||
| 97 | |||
| 98 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | ||
| 99 | { | ||
| 100 | return (struct task_struct *) | ||
| 101 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); | ||
| 102 | } | ||
| 103 | |||
| 104 | static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) | ||
| 105 | { | ||
| 106 | return (struct task_struct *) | ||
| 107 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
| 108 | } | ||
| 109 | |||
| 110 | static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) | ||
| 111 | { | ||
| 112 | return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; | ||
| 113 | } | ||
| 114 | |||
| 115 | /* | ||
| 116 | * PI-futex support (proxy locking functions, etc.): | ||
| 117 | */ | ||
| 118 | extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); | ||
| 119 | extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | ||
| 120 | struct task_struct *proxy_owner); | ||
| 121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | ||
| 122 | struct task_struct *proxy_owner); | ||
| 123 | #endif | ||
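
To make the owner-field encoding behind the accessors above concrete, here is a small decoding sketch. decode_rt_mutex_owner() is not part of the patch; it just spells out what rt_mutex_owner(), rt_mutex_real_owner() and rt_mutex_owner_pending() compute.

    /* Illustration only: how lock->owner packs the task pointer and the two state bits. */
    #include "rtmutex_common.h"

    static void decode_rt_mutex_owner(struct rt_mutex *lock)
    {
            unsigned long val = (unsigned long)lock->owner;
            struct task_struct *owner = (struct task_struct *)(val & ~RT_MUTEX_OWNER_MASKALL);
            int pending     = !!(val & RT_MUTEX_OWNER_PENDING);
            int has_waiters = !!(val & RT_MUTEX_HAS_WAITERS);

            /*
             * owner == NULL and no bits set:  lock is free, cmpxchg fast path possible
             * pending set:                    the woken top waiter may still lose the lock
             * has_waiters set:                unlock has to take the slow path
             */
            (void)owner; (void)pending; (void)has_waiters;
    }
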
diff --git a/kernel/rwsem.c b/kernel/rwsem.c new file mode 100644 index 000000000000..291ded556aa0 --- /dev/null +++ b/kernel/rwsem.c | |||
| @@ -0,0 +1,147 @@ | |||
| 1 | /* kernel/rwsem.c: R/W semaphores, public implementation | ||
| 2 | * | ||
| 3 | * Written by David Howells (dhowells@redhat.com). | ||
| 4 | * Derived from asm-i386/semaphore.h | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/types.h> | ||
| 8 | #include <linux/kernel.h> | ||
| 9 | #include <linux/module.h> | ||
| 10 | #include <linux/rwsem.h> | ||
| 11 | |||
| 12 | #include <asm/system.h> | ||
| 13 | #include <asm/atomic.h> | ||
| 14 | |||
| 15 | /* | ||
| 16 | * lock for reading | ||
| 17 | */ | ||
| 18 | void down_read(struct rw_semaphore *sem) | ||
| 19 | { | ||
| 20 | might_sleep(); | ||
| 21 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | ||
| 22 | |||
| 23 | __down_read(sem); | ||
| 24 | } | ||
| 25 | |||
| 26 | EXPORT_SYMBOL(down_read); | ||
| 27 | |||
| 28 | /* | ||
| 29 | * trylock for reading -- returns 1 if successful, 0 if contention | ||
| 30 | */ | ||
| 31 | int down_read_trylock(struct rw_semaphore *sem) | ||
| 32 | { | ||
| 33 | int ret = __down_read_trylock(sem); | ||
| 34 | |||
| 35 | if (ret == 1) | ||
| 36 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); | ||
| 37 | return ret; | ||
| 38 | } | ||
| 39 | |||
| 40 | EXPORT_SYMBOL(down_read_trylock); | ||
| 41 | |||
| 42 | /* | ||
| 43 | * lock for writing | ||
| 44 | */ | ||
| 45 | void down_write(struct rw_semaphore *sem) | ||
| 46 | { | ||
| 47 | might_sleep(); | ||
| 48 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | ||
| 49 | |||
| 50 | __down_write(sem); | ||
| 51 | } | ||
| 52 | |||
| 53 | EXPORT_SYMBOL(down_write); | ||
| 54 | |||
| 55 | /* | ||
| 56 | * trylock for writing -- returns 1 if successful, 0 if contention | ||
| 57 | */ | ||
| 58 | int down_write_trylock(struct rw_semaphore *sem) | ||
| 59 | { | ||
| 60 | int ret = __down_write_trylock(sem); | ||
| 61 | |||
| 62 | if (ret == 1) | ||
| 63 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | ||
| 64 | return ret; | ||
| 65 | } | ||
| 66 | |||
| 67 | EXPORT_SYMBOL(down_write_trylock); | ||
| 68 | |||
| 69 | /* | ||
| 70 | * release a read lock | ||
| 71 | */ | ||
| 72 | void up_read(struct rw_semaphore *sem) | ||
| 73 | { | ||
| 74 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | ||
| 75 | |||
| 76 | __up_read(sem); | ||
| 77 | } | ||
| 78 | |||
| 79 | EXPORT_SYMBOL(up_read); | ||
| 80 | |||
| 81 | /* | ||
| 82 | * release a write lock | ||
| 83 | */ | ||
| 84 | void up_write(struct rw_semaphore *sem) | ||
| 85 | { | ||
| 86 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | ||
| 87 | |||
| 88 | __up_write(sem); | ||
| 89 | } | ||
| 90 | |||
| 91 | EXPORT_SYMBOL(up_write); | ||
| 92 | |||
| 93 | /* | ||
| 94 | * downgrade write lock to read lock | ||
| 95 | */ | ||
| 96 | void downgrade_write(struct rw_semaphore *sem) | ||
| 97 | { | ||
| 98 | /* | ||
| 99 | * lockdep: a downgraded write will live on as a write | ||
| 100 | * dependency. | ||
| 101 | */ | ||
| 102 | __downgrade_write(sem); | ||
| 103 | } | ||
| 104 | |||
| 105 | EXPORT_SYMBOL(downgrade_write); | ||
| 106 | |||
| 107 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 108 | |||
| 109 | void down_read_nested(struct rw_semaphore *sem, int subclass) | ||
| 110 | { | ||
| 111 | might_sleep(); | ||
| 112 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); | ||
| 113 | |||
| 114 | __down_read(sem); | ||
| 115 | } | ||
| 116 | |||
| 117 | EXPORT_SYMBOL(down_read_nested); | ||
| 118 | |||
| 119 | void down_read_non_owner(struct rw_semaphore *sem) | ||
| 120 | { | ||
| 121 | might_sleep(); | ||
| 122 | |||
| 123 | __down_read(sem); | ||
| 124 | } | ||
| 125 | |||
| 126 | EXPORT_SYMBOL(down_read_non_owner); | ||
| 127 | |||
| 128 | void down_write_nested(struct rw_semaphore *sem, int subclass) | ||
| 129 | { | ||
| 130 | might_sleep(); | ||
| 131 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | ||
| 132 | |||
| 133 | __down_write_nested(sem, subclass); | ||
| 134 | } | ||
| 135 | |||
| 136 | EXPORT_SYMBOL(down_write_nested); | ||
| 137 | |||
| 138 | void up_read_non_owner(struct rw_semaphore *sem) | ||
| 139 | { | ||
| 140 | __up_read(sem); | ||
| 141 | } | ||
| 142 | |||
| 143 | EXPORT_SYMBOL(up_read_non_owner); | ||
| 144 | |||
| 145 | #endif | ||
| 146 | |||
| 147 | |||
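The new kernel/rwsem.c is deliberately thin: each entry point brackets the architecture fast path (__down_read() and friends) with a might_sleep() check and the matching lockdep annotation, and the _nested variants exist so callers can pass a subclass when two rwsems of the same lock class legitimately nest. A hedged usage sketch — the two semaphores and their nesting are invented for illustration, with SINGLE_DEPTH_NESTING as the conventional subclass for one level of nesting:

#include <linux/lockdep.h>
#include <linux/rwsem.h>

/*
 * Illustrative only: two rwsems of the same lock class taken in a fixed
 * parent -> child order.  Without the _nested annotation lockdep would
 * treat the second acquisition as a possible recursive deadlock.
 */
static DECLARE_RWSEM(parent_sem);
static DECLARE_RWSEM(child_sem);

static void update_both(void)
{
	down_write(&parent_sem);
	down_write_nested(&child_sem, SINGLE_DEPTH_NESTING);

	/* ... modify state protected by both semaphores ... */

	up_write(&child_sem);
	up_write(&parent_sem);
}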
diff --git a/kernel/sched.c b/kernel/sched.c index 5dbc42694477..74f169ac0773 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/capability.h> | 30 | #include <linux/capability.h> |
| 31 | #include <linux/completion.h> | 31 | #include <linux/completion.h> |
| 32 | #include <linux/kernel_stat.h> | 32 | #include <linux/kernel_stat.h> |
| 33 | #include <linux/debug_locks.h> | ||
| 33 | #include <linux/security.h> | 34 | #include <linux/security.h> |
| 34 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
| 35 | #include <linux/profile.h> | 36 | #include <linux/profile.h> |
| @@ -50,6 +51,7 @@ | |||
| 50 | #include <linux/times.h> | 51 | #include <linux/times.h> |
| 51 | #include <linux/acct.h> | 52 | #include <linux/acct.h> |
| 52 | #include <linux/kprobes.h> | 53 | #include <linux/kprobes.h> |
| 54 | #include <linux/delayacct.h> | ||
| 53 | #include <asm/tlb.h> | 55 | #include <asm/tlb.h> |
| 54 | 56 | ||
| 55 | #include <asm/unistd.h> | 57 | #include <asm/unistd.h> |
| @@ -168,29 +170,28 @@ | |||
| 168 | */ | 170 | */ |
| 169 | 171 | ||
| 170 | #define SCALE_PRIO(x, prio) \ | 172 | #define SCALE_PRIO(x, prio) \ |
| 171 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 173 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
| 172 | 174 | ||
| 173 | static unsigned int task_timeslice(task_t *p) | 175 | static unsigned int static_prio_timeslice(int static_prio) |
| 174 | { | 176 | { |
| 175 | if (p->static_prio < NICE_TO_PRIO(0)) | 177 | if (static_prio < NICE_TO_PRIO(0)) |
| 176 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 178 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
| 177 | else | 179 | else |
| 178 | return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); | 180 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
| 181 | } | ||
| 182 | |||
| 183 | static inline unsigned int task_timeslice(struct task_struct *p) | ||
| 184 | { | ||
| 185 | return static_prio_timeslice(p->static_prio); | ||
| 179 | } | 186 | } |
| 180 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | ||
| 181 | < (long long) (sd)->cache_hot_time) | ||
| 182 | 187 | ||
| 183 | /* | 188 | /* |
| 184 | * These are the runqueue data structures: | 189 | * These are the runqueue data structures: |
| 185 | */ | 190 | */ |
| 186 | 191 | ||
| 187 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | ||
| 188 | |||
| 189 | typedef struct runqueue runqueue_t; | ||
| 190 | |||
| 191 | struct prio_array { | 192 | struct prio_array { |
| 192 | unsigned int nr_active; | 193 | unsigned int nr_active; |
| 193 | unsigned long bitmap[BITMAP_SIZE]; | 194 | DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ |
| 194 | struct list_head queue[MAX_PRIO]; | 195 | struct list_head queue[MAX_PRIO]; |
| 195 | }; | 196 | }; |
| 196 | 197 | ||
| @@ -201,7 +202,7 @@ struct prio_array { | |||
| 201 | * (such as the load balancing or the thread migration code), lock | 202 | * (such as the load balancing or the thread migration code), lock |
| 202 | * acquire operations must be ordered by ascending &runqueue. | 203 | * acquire operations must be ordered by ascending &runqueue. |
| 203 | */ | 204 | */ |
| 204 | struct runqueue { | 205 | struct rq { |
| 205 | spinlock_t lock; | 206 | spinlock_t lock; |
| 206 | 207 | ||
| 207 | /* | 208 | /* |
| @@ -209,6 +210,7 @@ struct runqueue { | |||
| 209 | * remote CPUs use both these fields when doing load calculation. | 210 | * remote CPUs use both these fields when doing load calculation. |
| 210 | */ | 211 | */ |
| 211 | unsigned long nr_running; | 212 | unsigned long nr_running; |
| 213 | unsigned long raw_weighted_load; | ||
| 212 | #ifdef CONFIG_SMP | 214 | #ifdef CONFIG_SMP |
| 213 | unsigned long cpu_load[3]; | 215 | unsigned long cpu_load[3]; |
| 214 | #endif | 216 | #endif |
| @@ -224,9 +226,9 @@ struct runqueue { | |||
| 224 | 226 | ||
| 225 | unsigned long expired_timestamp; | 227 | unsigned long expired_timestamp; |
| 226 | unsigned long long timestamp_last_tick; | 228 | unsigned long long timestamp_last_tick; |
| 227 | task_t *curr, *idle; | 229 | struct task_struct *curr, *idle; |
| 228 | struct mm_struct *prev_mm; | 230 | struct mm_struct *prev_mm; |
| 229 | prio_array_t *active, *expired, arrays[2]; | 231 | struct prio_array *active, *expired, arrays[2]; |
| 230 | int best_expired_prio; | 232 | int best_expired_prio; |
| 231 | atomic_t nr_iowait; | 233 | atomic_t nr_iowait; |
| 232 | 234 | ||
| @@ -236,10 +238,10 @@ struct runqueue { | |||
| 236 | /* For active balancing */ | 238 | /* For active balancing */ |
| 237 | int active_balance; | 239 | int active_balance; |
| 238 | int push_cpu; | 240 | int push_cpu; |
| 241 | int cpu; /* cpu of this runqueue */ | ||
| 239 | 242 | ||
| 240 | task_t *migration_thread; | 243 | struct task_struct *migration_thread; |
| 241 | struct list_head migration_queue; | 244 | struct list_head migration_queue; |
| 242 | int cpu; | ||
| 243 | #endif | 245 | #endif |
| 244 | 246 | ||
| 245 | #ifdef CONFIG_SCHEDSTATS | 247 | #ifdef CONFIG_SCHEDSTATS |
| @@ -261,9 +263,19 @@ struct runqueue { | |||
| 261 | unsigned long ttwu_cnt; | 263 | unsigned long ttwu_cnt; |
| 262 | unsigned long ttwu_local; | 264 | unsigned long ttwu_local; |
| 263 | #endif | 265 | #endif |
| 266 | struct lock_class_key rq_lock_key; | ||
| 264 | }; | 267 | }; |
| 265 | 268 | ||
| 266 | static DEFINE_PER_CPU(struct runqueue, runqueues); | 269 | static DEFINE_PER_CPU(struct rq, runqueues); |
| 270 | |||
| 271 | static inline int cpu_of(struct rq *rq) | ||
| 272 | { | ||
| 273 | #ifdef CONFIG_SMP | ||
| 274 | return rq->cpu; | ||
| 275 | #else | ||
| 276 | return 0; | ||
| 277 | #endif | ||
| 278 | } | ||
| 267 | 279 | ||
| 268 | /* | 280 | /* |
| 269 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 281 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
| @@ -272,8 +284,8 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); | |||
| 272 | * The domain tree of any CPU may only be accessed from within | 284 | * The domain tree of any CPU may only be accessed from within |
| 273 | * preempt-disabled sections. | 285 | * preempt-disabled sections. |
| 274 | */ | 286 | */ |
| 275 | #define for_each_domain(cpu, domain) \ | 287 | #define for_each_domain(cpu, __sd) \ |
| 276 | for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) | 288 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
| 277 | 289 | ||
| 278 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 290 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
| 279 | #define this_rq() (&__get_cpu_var(runqueues)) | 291 | #define this_rq() (&__get_cpu_var(runqueues)) |
| @@ -288,26 +300,33 @@ for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) | |||
| 288 | #endif | 300 | #endif |
| 289 | 301 | ||
| 290 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 302 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
| 291 | static inline int task_running(runqueue_t *rq, task_t *p) | 303 | static inline int task_running(struct rq *rq, struct task_struct *p) |
| 292 | { | 304 | { |
| 293 | return rq->curr == p; | 305 | return rq->curr == p; |
| 294 | } | 306 | } |
| 295 | 307 | ||
| 296 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | 308 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
| 297 | { | 309 | { |
| 298 | } | 310 | } |
| 299 | 311 | ||
| 300 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 312 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
| 301 | { | 313 | { |
| 302 | #ifdef CONFIG_DEBUG_SPINLOCK | 314 | #ifdef CONFIG_DEBUG_SPINLOCK |
| 303 | /* this is a valid case when another task releases the spinlock */ | 315 | /* this is a valid case when another task releases the spinlock */ |
| 304 | rq->lock.owner = current; | 316 | rq->lock.owner = current; |
| 305 | #endif | 317 | #endif |
| 318 | /* | ||
| 319 | * If we are tracking spinlock dependencies then we have to | ||
| 320 | * fix up the runqueue lock - which gets 'carried over' from | ||
| 321 | * prev into current: | ||
| 322 | */ | ||
| 323 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | ||
| 324 | |||
| 306 | spin_unlock_irq(&rq->lock); | 325 | spin_unlock_irq(&rq->lock); |
| 307 | } | 326 | } |
| 308 | 327 | ||
| 309 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 328 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
| 310 | static inline int task_running(runqueue_t *rq, task_t *p) | 329 | static inline int task_running(struct rq *rq, struct task_struct *p) |
| 311 | { | 330 | { |
| 312 | #ifdef CONFIG_SMP | 331 | #ifdef CONFIG_SMP |
| 313 | return p->oncpu; | 332 | return p->oncpu; |
| @@ -316,7 +335,7 @@ static inline int task_running(runqueue_t *rq, task_t *p) | |||
| 316 | #endif | 335 | #endif |
| 317 | } | 336 | } |
| 318 | 337 | ||
| 319 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | 338 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
| 320 | { | 339 | { |
| 321 | #ifdef CONFIG_SMP | 340 | #ifdef CONFIG_SMP |
| 322 | /* | 341 | /* |
| @@ -333,7 +352,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | |||
| 333 | #endif | 352 | #endif |
| 334 | } | 353 | } |
| 335 | 354 | ||
| 336 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 355 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
| 337 | { | 356 | { |
| 338 | #ifdef CONFIG_SMP | 357 | #ifdef CONFIG_SMP |
| 339 | /* | 358 | /* |
| @@ -351,14 +370,33 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | |||
| 351 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 370 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
| 352 | 371 | ||
| 353 | /* | 372 | /* |
| 373 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
| 374 | * Must be called with interrupts disabled. | ||

| 375 | */ | ||
| 376 | static inline struct rq *__task_rq_lock(struct task_struct *p) | ||
| 377 | __acquires(rq->lock) | ||
| 378 | { | ||
| 379 | struct rq *rq; | ||
| 380 | |||
| 381 | repeat_lock_task: | ||
| 382 | rq = task_rq(p); | ||
| 383 | spin_lock(&rq->lock); | ||
| 384 | if (unlikely(rq != task_rq(p))) { | ||
| 385 | spin_unlock(&rq->lock); | ||
| 386 | goto repeat_lock_task; | ||
| 387 | } | ||
| 388 | return rq; | ||
| 389 | } | ||
| 390 | |||
| 391 | /* | ||
| 354 | * task_rq_lock - lock the runqueue a given task resides on and disable | 392 | * task_rq_lock - lock the runqueue a given task resides on and disable |
| 355 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 393 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
| 356 | * explicitly disabling preemption. | 394 | * explicitly disabling preemption. |
| 357 | */ | 395 | */ |
| 358 | static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) | 396 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
| 359 | __acquires(rq->lock) | 397 | __acquires(rq->lock) |
| 360 | { | 398 | { |
| 361 | struct runqueue *rq; | 399 | struct rq *rq; |
| 362 | 400 | ||
| 363 | repeat_lock_task: | 401 | repeat_lock_task: |
| 364 | local_irq_save(*flags); | 402 | local_irq_save(*flags); |
| @@ -371,7 +409,13 @@ repeat_lock_task: | |||
| 371 | return rq; | 409 | return rq; |
| 372 | } | 410 | } |
| 373 | 411 | ||
| 374 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | 412 | static inline void __task_rq_unlock(struct rq *rq) |
| 413 | __releases(rq->lock) | ||
| 414 | { | ||
| 415 | spin_unlock(&rq->lock); | ||
| 416 | } | ||
| 417 | |||
| 418 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | ||
| 375 | __releases(rq->lock) | 419 | __releases(rq->lock) |
| 376 | { | 420 | { |
| 377 | spin_unlock_irqrestore(&rq->lock, *flags); | 421 | spin_unlock_irqrestore(&rq->lock, *flags); |
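__task_rq_lock() and task_rq_lock() above both use the same lock-then-revalidate loop: read which runqueue the task currently belongs to, take that runqueue's lock, then re-check the association, since the task may have been migrated between the lookup and the lock acquisition. A stripped-down, user-space flavoured sketch of the pattern (names invented; a real implementation would also want an atomic or otherwise annotated read of the association):

#include <pthread.h>

struct queue { pthread_mutex_t lock; };
struct item  { struct queue *q; };	/* may be changed by a migrator */

/* Returns with the item's current queue locked. */
static struct queue *item_queue_lock(struct item *it)
{
	struct queue *q;

	for (;;) {
		q = it->q;			/* racy snapshot */
		pthread_mutex_lock(&q->lock);
		if (q == it->q)			/* still the right queue? */
			return q;
		pthread_mutex_unlock(&q->lock);	/* raced with migration */
	}
}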
| @@ -391,7 +435,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 391 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 435 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
| 392 | seq_printf(seq, "timestamp %lu\n", jiffies); | 436 | seq_printf(seq, "timestamp %lu\n", jiffies); |
| 393 | for_each_online_cpu(cpu) { | 437 | for_each_online_cpu(cpu) { |
| 394 | runqueue_t *rq = cpu_rq(cpu); | 438 | struct rq *rq = cpu_rq(cpu); |
| 395 | #ifdef CONFIG_SMP | 439 | #ifdef CONFIG_SMP |
| 396 | struct sched_domain *sd; | 440 | struct sched_domain *sd; |
| 397 | int dcnt = 0; | 441 | int dcnt = 0; |
| @@ -468,9 +512,36 @@ struct file_operations proc_schedstat_operations = { | |||
| 468 | .release = single_release, | 512 | .release = single_release, |
| 469 | }; | 513 | }; |
| 470 | 514 | ||
| 515 | /* | ||
| 516 | * Expects runqueue lock to be held for atomicity of update | ||
| 517 | */ | ||
| 518 | static inline void | ||
| 519 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | ||
| 520 | { | ||
| 521 | if (rq) { | ||
| 522 | rq->rq_sched_info.run_delay += delta_jiffies; | ||
| 523 | rq->rq_sched_info.pcnt++; | ||
| 524 | } | ||
| 525 | } | ||
| 526 | |||
| 527 | /* | ||
| 528 | * Expects runqueue lock to be held for atomicity of update | ||
| 529 | */ | ||
| 530 | static inline void | ||
| 531 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | ||
| 532 | { | ||
| 533 | if (rq) | ||
| 534 | rq->rq_sched_info.cpu_time += delta_jiffies; | ||
| 535 | } | ||
| 471 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 536 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) |
| 472 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 537 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) |
| 473 | #else /* !CONFIG_SCHEDSTATS */ | 538 | #else /* !CONFIG_SCHEDSTATS */ |
| 539 | static inline void | ||
| 540 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | ||
| 541 | {} | ||
| 542 | static inline void | ||
| 543 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | ||
| 544 | {} | ||
| 474 | # define schedstat_inc(rq, field) do { } while (0) | 545 | # define schedstat_inc(rq, field) do { } while (0) |
| 475 | # define schedstat_add(rq, field, amt) do { } while (0) | 546 | # define schedstat_add(rq, field, amt) do { } while (0) |
| 476 | #endif | 547 | #endif |
| @@ -478,10 +549,10 @@ struct file_operations proc_schedstat_operations = { | |||
| 478 | /* | 549 | /* |
| 479 | * rq_lock - lock a given runqueue and disable interrupts. | 550 | * rq_lock - lock a given runqueue and disable interrupts. |
| 480 | */ | 551 | */ |
| 481 | static inline runqueue_t *this_rq_lock(void) | 552 | static inline struct rq *this_rq_lock(void) |
| 482 | __acquires(rq->lock) | 553 | __acquires(rq->lock) |
| 483 | { | 554 | { |
| 484 | runqueue_t *rq; | 555 | struct rq *rq; |
| 485 | 556 | ||
| 486 | local_irq_disable(); | 557 | local_irq_disable(); |
| 487 | rq = this_rq(); | 558 | rq = this_rq(); |
| @@ -490,7 +561,7 @@ static inline runqueue_t *this_rq_lock(void) | |||
| 490 | return rq; | 561 | return rq; |
| 491 | } | 562 | } |
| 492 | 563 | ||
| 493 | #ifdef CONFIG_SCHEDSTATS | 564 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 494 | /* | 565 | /* |
| 495 | * Called when a process is dequeued from the active array and given | 566 | * Called when a process is dequeued from the active array and given |
| 496 | * the cpu. We should note that with the exception of interactive | 567 | * the cpu. We should note that with the exception of interactive |
| @@ -506,7 +577,7 @@ static inline runqueue_t *this_rq_lock(void) | |||
| 506 | * long it was from the *first* time it was queued to the time that it | 577 | * long it was from the *first* time it was queued to the time that it |
| 507 | * finally hit a cpu. | 578 | * finally hit a cpu. |
| 508 | */ | 579 | */ |
| 509 | static inline void sched_info_dequeued(task_t *t) | 580 | static inline void sched_info_dequeued(struct task_struct *t) |
| 510 | { | 581 | { |
| 511 | t->sched_info.last_queued = 0; | 582 | t->sched_info.last_queued = 0; |
| 512 | } | 583 | } |
| @@ -516,23 +587,18 @@ static inline void sched_info_dequeued(task_t *t) | |||
| 516 | * long it was waiting to run. We also note when it began so that we | 587 | * long it was waiting to run. We also note when it began so that we |
| 517 | * can keep stats on how long its timeslice is. | 588 | * can keep stats on how long its timeslice is. |
| 518 | */ | 589 | */ |
| 519 | static void sched_info_arrive(task_t *t) | 590 | static void sched_info_arrive(struct task_struct *t) |
| 520 | { | 591 | { |
| 521 | unsigned long now = jiffies, diff = 0; | 592 | unsigned long now = jiffies, delta_jiffies = 0; |
| 522 | struct runqueue *rq = task_rq(t); | ||
| 523 | 593 | ||
| 524 | if (t->sched_info.last_queued) | 594 | if (t->sched_info.last_queued) |
| 525 | diff = now - t->sched_info.last_queued; | 595 | delta_jiffies = now - t->sched_info.last_queued; |
| 526 | sched_info_dequeued(t); | 596 | sched_info_dequeued(t); |
| 527 | t->sched_info.run_delay += diff; | 597 | t->sched_info.run_delay += delta_jiffies; |
| 528 | t->sched_info.last_arrival = now; | 598 | t->sched_info.last_arrival = now; |
| 529 | t->sched_info.pcnt++; | 599 | t->sched_info.pcnt++; |
| 530 | 600 | ||
| 531 | if (!rq) | 601 | rq_sched_info_arrive(task_rq(t), delta_jiffies); |
| 532 | return; | ||
| 533 | |||
| 534 | rq->rq_sched_info.run_delay += diff; | ||
| 535 | rq->rq_sched_info.pcnt++; | ||
| 536 | } | 602 | } |
| 537 | 603 | ||
| 538 | /* | 604 | /* |
| @@ -550,25 +616,23 @@ static void sched_info_arrive(task_t *t) | |||
| 550 | * the timestamp if it is not already set. It's assumed that | 616 | * the timestamp if it is not already set. It's assumed that |
| 551 | * sched_info_dequeued() will clear that stamp when appropriate. | 617 | * sched_info_dequeued() will clear that stamp when appropriate. |
| 552 | */ | 618 | */ |
| 553 | static inline void sched_info_queued(task_t *t) | 619 | static inline void sched_info_queued(struct task_struct *t) |
| 554 | { | 620 | { |
| 555 | if (!t->sched_info.last_queued) | 621 | if (unlikely(sched_info_on())) |
| 556 | t->sched_info.last_queued = jiffies; | 622 | if (!t->sched_info.last_queued) |
| 623 | t->sched_info.last_queued = jiffies; | ||
| 557 | } | 624 | } |
| 558 | 625 | ||
| 559 | /* | 626 | /* |
| 560 | * Called when a process ceases being the active-running process, either | 627 | * Called when a process ceases being the active-running process, either |
| 561 | * voluntarily or involuntarily. Now we can calculate how long we ran. | 628 | * voluntarily or involuntarily. Now we can calculate how long we ran. |
| 562 | */ | 629 | */ |
| 563 | static inline void sched_info_depart(task_t *t) | 630 | static inline void sched_info_depart(struct task_struct *t) |
| 564 | { | 631 | { |
| 565 | struct runqueue *rq = task_rq(t); | 632 | unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; |
| 566 | unsigned long diff = jiffies - t->sched_info.last_arrival; | ||
| 567 | |||
| 568 | t->sched_info.cpu_time += diff; | ||
| 569 | 633 | ||
| 570 | if (rq) | 634 | t->sched_info.cpu_time += delta_jiffies; |
| 571 | rq->rq_sched_info.cpu_time += diff; | 635 | rq_sched_info_depart(task_rq(t), delta_jiffies); |
| 572 | } | 636 | } |
| 573 | 637 | ||
| 574 | /* | 638 | /* |
| @@ -576,9 +640,10 @@ static inline void sched_info_depart(task_t *t) | |||
| 576 | * their time slice. (This may also be called when switching to or from | 640 | * their time slice. (This may also be called when switching to or from |
| 577 | * the idle task.) We are only called when prev != next. | 641 | * the idle task.) We are only called when prev != next. |
| 578 | */ | 642 | */ |
| 579 | static inline void sched_info_switch(task_t *prev, task_t *next) | 643 | static inline void |
| 644 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
| 580 | { | 645 | { |
| 581 | struct runqueue *rq = task_rq(prev); | 646 | struct rq *rq = task_rq(prev); |
| 582 | 647 | ||
| 583 | /* | 648 | /* |
| 584 | * prev now departs the cpu. It's not interesting to record | 649 | * prev now departs the cpu. It's not interesting to record |
| @@ -591,15 +656,21 @@ static inline void sched_info_switch(task_t *prev, task_t *next) | |||
| 591 | if (next != rq->idle) | 656 | if (next != rq->idle) |
| 592 | sched_info_arrive(next); | 657 | sched_info_arrive(next); |
| 593 | } | 658 | } |
| 659 | static inline void | ||
| 660 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
| 661 | { | ||
| 662 | if (unlikely(sched_info_on())) | ||
| 663 | __sched_info_switch(prev, next); | ||
| 664 | } | ||
| 594 | #else | 665 | #else |
| 595 | #define sched_info_queued(t) do { } while (0) | 666 | #define sched_info_queued(t) do { } while (0) |
| 596 | #define sched_info_switch(t, next) do { } while (0) | 667 | #define sched_info_switch(t, next) do { } while (0) |
| 597 | #endif /* CONFIG_SCHEDSTATS */ | 668 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
| 598 | 669 | ||
| 599 | /* | 670 | /* |
| 600 | * Adding/removing a task to/from a priority array: | 671 | * Adding/removing a task to/from a priority array: |
| 601 | */ | 672 | */ |
| 602 | static void dequeue_task(struct task_struct *p, prio_array_t *array) | 673 | static void dequeue_task(struct task_struct *p, struct prio_array *array) |
| 603 | { | 674 | { |
| 604 | array->nr_active--; | 675 | array->nr_active--; |
| 605 | list_del(&p->run_list); | 676 | list_del(&p->run_list); |
| @@ -607,7 +678,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array) | |||
| 607 | __clear_bit(p->prio, array->bitmap); | 678 | __clear_bit(p->prio, array->bitmap); |
| 608 | } | 679 | } |
| 609 | 680 | ||
| 610 | static void enqueue_task(struct task_struct *p, prio_array_t *array) | 681 | static void enqueue_task(struct task_struct *p, struct prio_array *array) |
| 611 | { | 682 | { |
| 612 | sched_info_queued(p); | 683 | sched_info_queued(p); |
| 613 | list_add_tail(&p->run_list, array->queue + p->prio); | 684 | list_add_tail(&p->run_list, array->queue + p->prio); |
| @@ -620,12 +691,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) | |||
| 620 | * Put task to the end of the run list without the overhead of dequeue | 691 | * Put task to the end of the run list without the overhead of dequeue |
| 621 | * followed by enqueue. | 692 | * followed by enqueue. |
| 622 | */ | 693 | */ |
| 623 | static void requeue_task(struct task_struct *p, prio_array_t *array) | 694 | static void requeue_task(struct task_struct *p, struct prio_array *array) |
| 624 | { | 695 | { |
| 625 | list_move_tail(&p->run_list, array->queue + p->prio); | 696 | list_move_tail(&p->run_list, array->queue + p->prio); |
| 626 | } | 697 | } |
| 627 | 698 | ||
| 628 | static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | 699 | static inline void |
| 700 | enqueue_task_head(struct task_struct *p, struct prio_array *array) | ||
| 629 | { | 701 | { |
| 630 | list_add(&p->run_list, array->queue + p->prio); | 702 | list_add(&p->run_list, array->queue + p->prio); |
| 631 | __set_bit(p->prio, array->bitmap); | 703 | __set_bit(p->prio, array->bitmap); |
| @@ -634,7 +706,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
| 634 | } | 706 | } |
| 635 | 707 | ||
| 636 | /* | 708 | /* |
| 637 | * effective_prio - return the priority that is based on the static | 709 | * __normal_prio - return the priority that is based on the static |
| 638 | * priority but is modified by bonuses/penalties. | 710 | * priority but is modified by bonuses/penalties. |
| 639 | * | 711 | * |
| 640 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | 712 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
| @@ -647,13 +719,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
| 647 | * | 719 | * |
| 648 | * Both properties are important to certain workloads. | 720 | * Both properties are important to certain workloads. |
| 649 | */ | 721 | */ |
| 650 | static int effective_prio(task_t *p) | 722 | |
| 723 | static inline int __normal_prio(struct task_struct *p) | ||
| 651 | { | 724 | { |
| 652 | int bonus, prio; | 725 | int bonus, prio; |
| 653 | 726 | ||
| 654 | if (rt_task(p)) | ||
| 655 | return p->prio; | ||
| 656 | |||
| 657 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | 727 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
| 658 | 728 | ||
| 659 | prio = p->static_prio - bonus; | 729 | prio = p->static_prio - bonus; |
| @@ -665,57 +735,165 @@ static int effective_prio(task_t *p) | |||
| 665 | } | 735 | } |
| 666 | 736 | ||
| 667 | /* | 737 | /* |
| 738 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
| 739 | * of tasks with abnormal "nice" values across CPUs the contribution that | ||
| 740 | * each task makes to its run queue's load is weighted according to its | ||
| 741 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
| 742 | * scaled version of the new time slice allocation that they receive on time | ||
| 743 | * slice expiry etc. | ||
| 744 | */ | ||
| 745 | |||
| 746 | /* | ||
| 747 | * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE | ||
| 748 | * If static_prio_timeslice() is ever changed to break this assumption then | ||
| 749 | * this code will need modification | ||
| 750 | */ | ||
| 751 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
| 752 | #define LOAD_WEIGHT(lp) \ | ||
| 753 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
| 754 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
| 755 | LOAD_WEIGHT(static_prio_timeslice(prio)) | ||
| 756 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
| 757 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | ||
| 758 | |||
| 759 | static void set_load_weight(struct task_struct *p) | ||
| 760 | { | ||
| 761 | if (has_rt_policy(p)) { | ||
| 762 | #ifdef CONFIG_SMP | ||
| 763 | if (p == task_rq(p)->migration_thread) | ||
| 764 | /* | ||
| 765 | * The migration thread does the actual balancing. | ||
| 766 | * Giving its load any weight will skew balancing | ||
| 767 | * adversely. | ||
| 768 | */ | ||
| 769 | p->load_weight = 0; | ||
| 770 | else | ||
| 771 | #endif | ||
| 772 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
| 773 | } else | ||
| 774 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
| 775 | } | ||
| 776 | |||
| 777 | static inline void | ||
| 778 | inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) | ||
| 779 | { | ||
| 780 | rq->raw_weighted_load += p->load_weight; | ||
| 781 | } | ||
| 782 | |||
| 783 | static inline void | ||
| 784 | dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) | ||
| 785 | { | ||
| 786 | rq->raw_weighted_load -= p->load_weight; | ||
| 787 | } | ||
| 788 | |||
| 789 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
| 790 | { | ||
| 791 | rq->nr_running++; | ||
| 792 | inc_raw_weighted_load(rq, p); | ||
| 793 | } | ||
| 794 | |||
| 795 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq) | ||
| 796 | { | ||
| 797 | rq->nr_running--; | ||
| 798 | dec_raw_weighted_load(rq, p); | ||
| 799 | } | ||
| 800 | |||
| 801 | /* | ||
| 802 | * Calculate the expected normal priority: i.e. priority | ||
| 803 | * without taking RT-inheritance into account. Might be | ||
| 804 | * boosted by interactivity modifiers. Changes upon fork, | ||
| 805 | * setprio syscalls, and whenever the interactivity | ||
| 806 | * estimator recalculates. | ||
| 807 | */ | ||
| 808 | static inline int normal_prio(struct task_struct *p) | ||
| 809 | { | ||
| 810 | int prio; | ||
| 811 | |||
| 812 | if (has_rt_policy(p)) | ||
| 813 | prio = MAX_RT_PRIO-1 - p->rt_priority; | ||
| 814 | else | ||
| 815 | prio = __normal_prio(p); | ||
| 816 | return prio; | ||
| 817 | } | ||
| 818 | |||
| 819 | /* | ||
| 820 | * Calculate the current priority, i.e. the priority | ||
| 821 | * taken into account by the scheduler. This value might | ||
| 822 | * be boosted by RT tasks, or might be boosted by | ||
| 823 | * interactivity modifiers. Will be RT if the task got | ||
| 824 | * RT-boosted. If not then it returns p->normal_prio. | ||
| 825 | */ | ||
| 826 | static int effective_prio(struct task_struct *p) | ||
| 827 | { | ||
| 828 | p->normal_prio = normal_prio(p); | ||
| 829 | /* | ||
| 830 | * If we are an RT task, or were boosted to RT priority, | ||
| 831 | * keep the priority unchanged. Otherwise, update priority | ||
| 832 | * to the normal priority: | ||
| 833 | */ | ||
| 834 | if (!rt_prio(p->prio)) | ||
| 835 | return p->normal_prio; | ||
| 836 | return p->prio; | ||
| 837 | } | ||
| 838 | |||
| 839 | /* | ||
| 668 | * __activate_task - move a task to the runqueue. | 840 | * __activate_task - move a task to the runqueue. |
| 669 | */ | 841 | */ |
| 670 | static void __activate_task(task_t *p, runqueue_t *rq) | 842 | static void __activate_task(struct task_struct *p, struct rq *rq) |
| 671 | { | 843 | { |
| 672 | prio_array_t *target = rq->active; | 844 | struct prio_array *target = rq->active; |
| 673 | 845 | ||
| 674 | if (batch_task(p)) | 846 | if (batch_task(p)) |
| 675 | target = rq->expired; | 847 | target = rq->expired; |
| 676 | enqueue_task(p, target); | 848 | enqueue_task(p, target); |
| 677 | rq->nr_running++; | 849 | inc_nr_running(p, rq); |
| 678 | } | 850 | } |
| 679 | 851 | ||
| 680 | /* | 852 | /* |
| 681 | * __activate_idle_task - move idle task to the _front_ of runqueue. | 853 | * __activate_idle_task - move idle task to the _front_ of runqueue. |
| 682 | */ | 854 | */ |
| 683 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 855 | static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) |
| 684 | { | 856 | { |
| 685 | enqueue_task_head(p, rq->active); | 857 | enqueue_task_head(p, rq->active); |
| 686 | rq->nr_running++; | 858 | inc_nr_running(p, rq); |
| 687 | } | 859 | } |
| 688 | 860 | ||
| 689 | static int recalc_task_prio(task_t *p, unsigned long long now) | 861 | /* |
| 862 | * Recalculate p->normal_prio and p->prio after having slept, | ||
| 863 | * updating the sleep-average too: | ||
| 864 | */ | ||
| 865 | static int recalc_task_prio(struct task_struct *p, unsigned long long now) | ||
| 690 | { | 866 | { |
| 691 | /* Caller must always ensure 'now >= p->timestamp' */ | 867 | /* Caller must always ensure 'now >= p->timestamp' */ |
| 692 | unsigned long long __sleep_time = now - p->timestamp; | 868 | unsigned long sleep_time = now - p->timestamp; |
| 693 | unsigned long sleep_time; | ||
| 694 | 869 | ||
| 695 | if (batch_task(p)) | 870 | if (batch_task(p)) |
| 696 | sleep_time = 0; | 871 | sleep_time = 0; |
| 697 | else { | ||
| 698 | if (__sleep_time > NS_MAX_SLEEP_AVG) | ||
| 699 | sleep_time = NS_MAX_SLEEP_AVG; | ||
| 700 | else | ||
| 701 | sleep_time = (unsigned long)__sleep_time; | ||
| 702 | } | ||
| 703 | 872 | ||
| 704 | if (likely(sleep_time > 0)) { | 873 | if (likely(sleep_time > 0)) { |
| 705 | /* | 874 | /* |
| 706 | * User tasks that sleep a long time are categorised as | 875 | * This ceiling is set to the lowest priority that would allow |
| 707 | * idle. They will only have their sleep_avg increased to a | 876 | * a task to be reinserted into the active array on timeslice |
| 708 | * level that makes them just interactive priority to stay | 877 | * completion. |
| 709 | * active yet prevent them suddenly becoming cpu hogs and | ||
| 710 | * starving other processes. | ||
| 711 | */ | 878 | */ |
| 712 | if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { | 879 | unsigned long ceiling = INTERACTIVE_SLEEP(p); |
| 713 | unsigned long ceiling; | ||
| 714 | 880 | ||
| 715 | ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - | 881 | if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { |
| 716 | DEF_TIMESLICE); | 882 | /* |
| 717 | if (p->sleep_avg < ceiling) | 883 | * Prevents user tasks from achieving best priority |
| 718 | p->sleep_avg = ceiling; | 884 | * with one single large enough sleep. |
| 885 | */ | ||
| 886 | p->sleep_avg = ceiling; | ||
| 887 | /* | ||
| 888 | * Using INTERACTIVE_SLEEP() as a ceiling places a | ||
| 889 | * nice(0) task 1ms sleep away from promotion, and | ||
| 890 | * gives it 700ms to round-robin with no chance of | ||
| 891 | * being demoted. This is more than generous, so | ||
| 892 | * mark this sleep as non-interactive to prevent the | ||
| 893 | * on-runqueue bonus logic from intervening should | ||
| 894 | * this task not receive cpu immediately. | ||
| 895 | */ | ||
| 896 | p->sleep_type = SLEEP_NONINTERACTIVE; | ||
| 719 | } else { | 897 | } else { |
| 720 | /* | 898 | /* |
| 721 | * Tasks waking from uninterruptible sleep are | 899 | * Tasks waking from uninterruptible sleep are |
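The load-weight macros introduced in the hunk above scale each task's contribution to rq->raw_weighted_load by its timeslice relative to the nice-0 timeslice, so a runqueue full of nice 19 tasks no longer looks as loaded as one full of nice 0 tasks, and RT tasks always weigh at least as much as the heaviest SCHED_NORMAL task. A worked sketch of the arithmetic, taking SCHED_LOAD_SCALE as 128 and reusing the timeslices from the earlier example purely for illustration:

#include <stdio.h>

#define SCHED_LOAD_SCALE	128UL	/* assumed value, for illustration */
#define TIME_SLICE_NICE_ZERO	100UL	/* DEF_TIMESLICE in ms, assumed */

#define LOAD_WEIGHT(lp) (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)

int main(void)
{
	/* timeslices from the earlier sketch:
	 * nice -20 -> 800 ms, nice 0 -> 100 ms, nice 19 -> 5 ms */
	printf("nice -20: %lu\n", LOAD_WEIGHT(800));	/* 1024 */
	printf("nice   0: %lu\n", LOAD_WEIGHT(100));	/* 128 == SCHED_LOAD_SCALE */
	printf("nice  19: %lu\n", LOAD_WEIGHT(5));	/* 6 */
	return 0;
}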
| @@ -723,12 +901,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
| 723 | * are likely to be waiting on I/O | 901 | * are likely to be waiting on I/O |
| 724 | */ | 902 | */ |
| 725 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | 903 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
| 726 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | 904 | if (p->sleep_avg >= ceiling) |
| 727 | sleep_time = 0; | 905 | sleep_time = 0; |
| 728 | else if (p->sleep_avg + sleep_time >= | 906 | else if (p->sleep_avg + sleep_time >= |
| 729 | INTERACTIVE_SLEEP(p)) { | 907 | ceiling) { |
| 730 | p->sleep_avg = INTERACTIVE_SLEEP(p); | 908 | p->sleep_avg = ceiling; |
| 731 | sleep_time = 0; | 909 | sleep_time = 0; |
| 732 | } | 910 | } |
| 733 | } | 911 | } |
| 734 | 912 | ||
| @@ -742,9 +920,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
| 742 | */ | 920 | */ |
| 743 | p->sleep_avg += sleep_time; | 921 | p->sleep_avg += sleep_time; |
| 744 | 922 | ||
| 745 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
| 746 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
| 747 | } | 923 | } |
| 924 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
| 925 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
| 748 | } | 926 | } |
| 749 | 927 | ||
| 750 | return effective_prio(p); | 928 | return effective_prio(p); |
| @@ -756,7 +934,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
| 756 | * Update all the scheduling statistics stuff. (sleep average | 934 | * Update all the scheduling statistics stuff. (sleep average |
| 757 | * calculation, priority modifiers, etc.) | 935 | * calculation, priority modifiers, etc.) |
| 758 | */ | 936 | */ |
| 759 | static void activate_task(task_t *p, runqueue_t *rq, int local) | 937 | static void activate_task(struct task_struct *p, struct rq *rq, int local) |
| 760 | { | 938 | { |
| 761 | unsigned long long now; | 939 | unsigned long long now; |
| 762 | 940 | ||
| @@ -764,7 +942,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
| 764 | #ifdef CONFIG_SMP | 942 | #ifdef CONFIG_SMP |
| 765 | if (!local) { | 943 | if (!local) { |
| 766 | /* Compensate for drifting sched_clock */ | 944 | /* Compensate for drifting sched_clock */ |
| 767 | runqueue_t *this_rq = this_rq(); | 945 | struct rq *this_rq = this_rq(); |
| 768 | now = (now - this_rq->timestamp_last_tick) | 946 | now = (now - this_rq->timestamp_last_tick) |
| 769 | + rq->timestamp_last_tick; | 947 | + rq->timestamp_last_tick; |
| 770 | } | 948 | } |
| @@ -803,9 +981,9 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
| 803 | /* | 981 | /* |
| 804 | * deactivate_task - remove a task from the runqueue. | 982 | * deactivate_task - remove a task from the runqueue. |
| 805 | */ | 983 | */ |
| 806 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 984 | static void deactivate_task(struct task_struct *p, struct rq *rq) |
| 807 | { | 985 | { |
| 808 | rq->nr_running--; | 986 | dec_nr_running(p, rq); |
| 809 | dequeue_task(p, p->array); | 987 | dequeue_task(p, p->array); |
| 810 | p->array = NULL; | 988 | p->array = NULL; |
| 811 | } | 989 | } |
| @@ -818,7 +996,12 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) | |||
| 818 | * the target CPU. | 996 | * the target CPU. |
| 819 | */ | 997 | */ |
| 820 | #ifdef CONFIG_SMP | 998 | #ifdef CONFIG_SMP |
| 821 | static void resched_task(task_t *p) | 999 | |
| 1000 | #ifndef tsk_is_polling | ||
| 1001 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | ||
| 1002 | #endif | ||
| 1003 | |||
| 1004 | static void resched_task(struct task_struct *p) | ||
| 822 | { | 1005 | { |
| 823 | int cpu; | 1006 | int cpu; |
| 824 | 1007 | ||
| @@ -833,13 +1016,13 @@ static void resched_task(task_t *p) | |||
| 833 | if (cpu == smp_processor_id()) | 1016 | if (cpu == smp_processor_id()) |
| 834 | return; | 1017 | return; |
| 835 | 1018 | ||
| 836 | /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ | 1019 | /* NEED_RESCHED must be visible before we test polling */ |
| 837 | smp_mb(); | 1020 | smp_mb(); |
| 838 | if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) | 1021 | if (!tsk_is_polling(p)) |
| 839 | smp_send_reschedule(cpu); | 1022 | smp_send_reschedule(cpu); |
| 840 | } | 1023 | } |
| 841 | #else | 1024 | #else |
| 842 | static inline void resched_task(task_t *p) | 1025 | static inline void resched_task(struct task_struct *p) |
| 843 | { | 1026 | { |
| 844 | assert_spin_locked(&task_rq(p)->lock); | 1027 | assert_spin_locked(&task_rq(p)->lock); |
| 845 | set_tsk_need_resched(p); | 1028 | set_tsk_need_resched(p); |
| @@ -850,28 +1033,35 @@ static inline void resched_task(task_t *p) | |||
| 850 | * task_curr - is this task currently executing on a CPU? | 1033 | * task_curr - is this task currently executing on a CPU? |
| 851 | * @p: the task in question. | 1034 | * @p: the task in question. |
| 852 | */ | 1035 | */ |
| 853 | inline int task_curr(const task_t *p) | 1036 | inline int task_curr(const struct task_struct *p) |
| 854 | { | 1037 | { |
| 855 | return cpu_curr(task_cpu(p)) == p; | 1038 | return cpu_curr(task_cpu(p)) == p; |
| 856 | } | 1039 | } |
| 857 | 1040 | ||
| 1041 | /* Used instead of source_load when we know the type == 0 */ | ||
| 1042 | unsigned long weighted_cpuload(const int cpu) | ||
| 1043 | { | ||
| 1044 | return cpu_rq(cpu)->raw_weighted_load; | ||
| 1045 | } | ||
| 1046 | |||
| 858 | #ifdef CONFIG_SMP | 1047 | #ifdef CONFIG_SMP |
| 859 | typedef struct { | 1048 | struct migration_req { |
| 860 | struct list_head list; | 1049 | struct list_head list; |
| 861 | 1050 | ||
| 862 | task_t *task; | 1051 | struct task_struct *task; |
| 863 | int dest_cpu; | 1052 | int dest_cpu; |
| 864 | 1053 | ||
| 865 | struct completion done; | 1054 | struct completion done; |
| 866 | } migration_req_t; | 1055 | }; |
| 867 | 1056 | ||
| 868 | /* | 1057 | /* |
| 869 | * The task's runqueue lock must be held. | 1058 | * The task's runqueue lock must be held. |
| 870 | * Returns true if you have to wait for migration thread. | 1059 | * Returns true if you have to wait for migration thread. |
| 871 | */ | 1060 | */ |
| 872 | static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | 1061 | static int |
| 1062 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | ||
| 873 | { | 1063 | { |
| 874 | runqueue_t *rq = task_rq(p); | 1064 | struct rq *rq = task_rq(p); |
| 875 | 1065 | ||
| 876 | /* | 1066 | /* |
| 877 | * If the task is not on a runqueue (and not running), then | 1067 | * If the task is not on a runqueue (and not running), then |
| @@ -886,6 +1076,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
| 886 | req->task = p; | 1076 | req->task = p; |
| 887 | req->dest_cpu = dest_cpu; | 1077 | req->dest_cpu = dest_cpu; |
| 888 | list_add(&req->list, &rq->migration_queue); | 1078 | list_add(&req->list, &rq->migration_queue); |
| 1079 | |||
| 889 | return 1; | 1080 | return 1; |
| 890 | } | 1081 | } |
| 891 | 1082 | ||
| @@ -898,10 +1089,10 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
| 898 | * smp_call_function() if an IPI is sent by the same process we are | 1089 | * smp_call_function() if an IPI is sent by the same process we are |
| 899 | * waiting to become inactive. | 1090 | * waiting to become inactive. |
| 900 | */ | 1091 | */ |
| 901 | void wait_task_inactive(task_t *p) | 1092 | void wait_task_inactive(struct task_struct *p) |
| 902 | { | 1093 | { |
| 903 | unsigned long flags; | 1094 | unsigned long flags; |
| 904 | runqueue_t *rq; | 1095 | struct rq *rq; |
| 905 | int preempted; | 1096 | int preempted; |
| 906 | 1097 | ||
| 907 | repeat: | 1098 | repeat: |
| @@ -932,7 +1123,7 @@ repeat: | |||
| 932 | * to another CPU then no harm is done and the purpose has been | 1123 | * to another CPU then no harm is done and the purpose has been |
| 933 | * achieved as well. | 1124 | * achieved as well. |
| 934 | */ | 1125 | */ |
| 935 | void kick_process(task_t *p) | 1126 | void kick_process(struct task_struct *p) |
| 936 | { | 1127 | { |
| 937 | int cpu; | 1128 | int cpu; |
| 938 | 1129 | ||
| @@ -944,32 +1135,45 @@ void kick_process(task_t *p) | |||
| 944 | } | 1135 | } |
| 945 | 1136 | ||
| 946 | /* | 1137 | /* |
| 947 | * Return a low guess at the load of a migration-source cpu. | 1138 | * Return a low guess at the load of a migration-source cpu weighted |
| 1139 | * according to the scheduling class and "nice" value. | ||
| 948 | * | 1140 | * |
| 949 | * We want to under-estimate the load of migration sources, to | 1141 | * We want to under-estimate the load of migration sources, to |
| 950 | * balance conservatively. | 1142 | * balance conservatively. |
| 951 | */ | 1143 | */ |
| 952 | static inline unsigned long source_load(int cpu, int type) | 1144 | static inline unsigned long source_load(int cpu, int type) |
| 953 | { | 1145 | { |
| 954 | runqueue_t *rq = cpu_rq(cpu); | 1146 | struct rq *rq = cpu_rq(cpu); |
| 955 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1147 | |
| 956 | if (type == 0) | 1148 | if (type == 0) |
| 957 | return load_now; | 1149 | return rq->raw_weighted_load; |
| 958 | 1150 | ||
| 959 | return min(rq->cpu_load[type-1], load_now); | 1151 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); |
| 960 | } | 1152 | } |
| 961 | 1153 | ||
| 962 | /* | 1154 | /* |
| 963 | * Return a high guess at the load of a migration-target cpu | 1155 | * Return a high guess at the load of a migration-target cpu weighted |
| 1156 | * according to the scheduling class and "nice" value. | ||
| 964 | */ | 1157 | */ |
| 965 | static inline unsigned long target_load(int cpu, int type) | 1158 | static inline unsigned long target_load(int cpu, int type) |
| 966 | { | 1159 | { |
| 967 | runqueue_t *rq = cpu_rq(cpu); | 1160 | struct rq *rq = cpu_rq(cpu); |
| 968 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1161 | |
| 969 | if (type == 0) | 1162 | if (type == 0) |
| 970 | return load_now; | 1163 | return rq->raw_weighted_load; |
| 1164 | |||
| 1165 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); | ||
| 1166 | } | ||
| 971 | 1167 | ||
| 972 | return max(rq->cpu_load[type-1], load_now); | 1168 | /* |
| 1169 | * Return the average load per task on the cpu's run queue | ||
| 1170 | */ | ||
| 1171 | static inline unsigned long cpu_avg_load_per_task(int cpu) | ||
| 1172 | { | ||
| 1173 | struct rq *rq = cpu_rq(cpu); | ||
| 1174 | unsigned long n = rq->nr_running; | ||
| 1175 | |||
| 1176 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; | ||
| 973 | } | 1177 | } |
| 974 | 1178 | ||
| 975 | /* | 1179 | /* |
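source_load() and target_load() now return weighted figures: the migration-source guess is the minimum of the decayed cpu_load[] history and the instantaneous raw_weighted_load (a deliberate under-estimate), the migration-target guess is the maximum (a deliberate over-estimate), and cpu_avg_load_per_task() divides the instantaneous weighted load by nr_running, falling back to SCHED_LOAD_SCALE for an empty queue. A small numeric sketch with invented values:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed, for illustration */

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
	unsigned long raw_weighted_load = 256;	/* e.g. two nice-0 tasks */
	unsigned long cpu_load_hist = 384;	/* decayed cpu_load[type-1] */
	unsigned long nr_running = 2;

	printf("source_load = %lu\n", min_ul(cpu_load_hist, raw_weighted_load)); /* 256 */
	printf("target_load = %lu\n", max_ul(cpu_load_hist, raw_weighted_load)); /* 384 */
	printf("avg/task    = %lu\n",
	       nr_running ? raw_weighted_load / nr_running : SCHED_LOAD_SCALE);  /* 128 */
	return 0;
}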
| @@ -1042,7 +1246,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
| 1042 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1246 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
| 1043 | 1247 | ||
| 1044 | for_each_cpu_mask(i, tmp) { | 1248 | for_each_cpu_mask(i, tmp) { |
| 1045 | load = source_load(i, 0); | 1249 | load = weighted_cpuload(i); |
| 1046 | 1250 | ||
| 1047 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1251 | if (load < min_load || (load == min_load && i == this_cpu)) { |
| 1048 | min_load = load; | 1252 | min_load = load; |
| @@ -1069,9 +1273,15 @@ static int sched_balance_self(int cpu, int flag) | |||
| 1069 | struct task_struct *t = current; | 1273 | struct task_struct *t = current; |
| 1070 | struct sched_domain *tmp, *sd = NULL; | 1274 | struct sched_domain *tmp, *sd = NULL; |
| 1071 | 1275 | ||
| 1072 | for_each_domain(cpu, tmp) | 1276 | for_each_domain(cpu, tmp) { |
| 1277 | /* | ||
| 1278 | * If power savings logic is enabled for a domain, stop there. | ||
| 1279 | */ | ||
| 1280 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
| 1281 | break; | ||
| 1073 | if (tmp->flags & flag) | 1282 | if (tmp->flags & flag) |
| 1074 | sd = tmp; | 1283 | sd = tmp; |
| 1284 | } | ||
| 1075 | 1285 | ||
| 1076 | while (sd) { | 1286 | while (sd) { |
| 1077 | cpumask_t span; | 1287 | cpumask_t span; |
| @@ -1116,7 +1326,7 @@ nextlevel: | |||
| 1116 | * Returns the CPU we should wake onto. | 1326 | * Returns the CPU we should wake onto. |
| 1117 | */ | 1327 | */ |
| 1118 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | 1328 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) |
| 1119 | static int wake_idle(int cpu, task_t *p) | 1329 | static int wake_idle(int cpu, struct task_struct *p) |
| 1120 | { | 1330 | { |
| 1121 | cpumask_t tmp; | 1331 | cpumask_t tmp; |
| 1122 | struct sched_domain *sd; | 1332 | struct sched_domain *sd; |
| @@ -1139,7 +1349,7 @@ static int wake_idle(int cpu, task_t *p) | |||
| 1139 | return cpu; | 1349 | return cpu; |
| 1140 | } | 1350 | } |
| 1141 | #else | 1351 | #else |
| 1142 | static inline int wake_idle(int cpu, task_t *p) | 1352 | static inline int wake_idle(int cpu, struct task_struct *p) |
| 1143 | { | 1353 | { |
| 1144 | return cpu; | 1354 | return cpu; |
| 1145 | } | 1355 | } |
| @@ -1159,15 +1369,15 @@ static inline int wake_idle(int cpu, task_t *p) | |||
| 1159 | * | 1369 | * |
| 1160 | * returns failure only if the task is already active. | 1370 | * returns failure only if the task is already active. |
| 1161 | */ | 1371 | */ |
| 1162 | static int try_to_wake_up(task_t *p, unsigned int state, int sync) | 1372 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
| 1163 | { | 1373 | { |
| 1164 | int cpu, this_cpu, success = 0; | 1374 | int cpu, this_cpu, success = 0; |
| 1165 | unsigned long flags; | 1375 | unsigned long flags; |
| 1166 | long old_state; | 1376 | long old_state; |
| 1167 | runqueue_t *rq; | 1377 | struct rq *rq; |
| 1168 | #ifdef CONFIG_SMP | 1378 | #ifdef CONFIG_SMP |
| 1169 | unsigned long load, this_load; | ||
| 1170 | struct sched_domain *sd, *this_sd = NULL; | 1379 | struct sched_domain *sd, *this_sd = NULL; |
| 1380 | unsigned long load, this_load; | ||
| 1171 | int new_cpu; | 1381 | int new_cpu; |
| 1172 | #endif | 1382 | #endif |
| 1173 | 1383 | ||
| @@ -1221,17 +1431,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
| 1221 | 1431 | ||
| 1222 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1432 | if (this_sd->flags & SD_WAKE_AFFINE) { |
| 1223 | unsigned long tl = this_load; | 1433 | unsigned long tl = this_load; |
| 1434 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 1435 | |||
| 1224 | /* | 1436 | /* |
| 1225 | * If sync wakeup then subtract the (maximum possible) | 1437 | * If sync wakeup then subtract the (maximum possible) |
| 1226 | * effect of the currently running task from the load | 1438 | * effect of the currently running task from the load |
| 1227 | * of the current CPU: | 1439 | * of the current CPU: |
| 1228 | */ | 1440 | */ |
| 1229 | if (sync) | 1441 | if (sync) |
| 1230 | tl -= SCHED_LOAD_SCALE; | 1442 | tl -= current->load_weight; |
| 1231 | 1443 | ||
| 1232 | if ((tl <= load && | 1444 | if ((tl <= load && |
| 1233 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | 1445 | tl + target_load(cpu, idx) <= tl_per_task) || |
| 1234 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | 1446 | 100*(tl + p->load_weight) <= imbalance*load) { |
| 1235 | /* | 1447 | /* |
| 1236 | * This domain has SD_WAKE_AFFINE and | 1448 | * This domain has SD_WAKE_AFFINE and |
| 1237 | * p is cache cold in this domain, and | 1449 | * p is cache cold in this domain, and |
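The wake-affine test in the hunk above now works in load-weight units rather than raw SCHED_LOAD_SCALE: a sync wakeup subtracts the current task's weight from this CPU's load, and the task is pulled to the waking CPU if either that CPU is no more loaded than the task's old CPU and their combined load fits within one average task's weight, or the imbalance comparison still favours waking locally. A worked example with invented numbers (the imbalance factor is derived from the domain's imbalance_pct elsewhere in try_to_wake_up(); 112 is used here only for illustration):

#include <stdio.h>

int main(void)
{
	unsigned long load = 256;		/* source_load() of the task's old CPU */
	unsigned long target = 256;		/* target_load() of the old CPU */
	unsigned long tl = 128;			/* this_load on the waking CPU */
	unsigned long tl_per_task = 128;	/* cpu_avg_load_per_task(this_cpu) */
	unsigned long p_weight = 128;		/* waking task's load_weight (nice 0) */
	unsigned long cur_weight = 128;		/* current task's load_weight */
	unsigned long imbalance = 112;		/* scaled imbalance factor, invented */
	int sync = 1;

	if (sync)
		tl -= cur_weight;		/* current will sleep right after */

	int affine = (tl <= load && tl + target <= tl_per_task) ||
		     100 * (tl + p_weight) <= imbalance * load;

	printf("wake affine: %s\n", affine ? "yes" : "no");	/* yes */
	return 0;
}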
| @@ -1315,15 +1527,14 @@ out: | |||
| 1315 | return success; | 1527 | return success; |
| 1316 | } | 1528 | } |
| 1317 | 1529 | ||
| 1318 | int fastcall wake_up_process(task_t *p) | 1530 | int fastcall wake_up_process(struct task_struct *p) |
| 1319 | { | 1531 | { |
| 1320 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1532 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | |
| 1321 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | 1533 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); |
| 1322 | } | 1534 | } |
| 1323 | |||
| 1324 | EXPORT_SYMBOL(wake_up_process); | 1535 | EXPORT_SYMBOL(wake_up_process); |
| 1325 | 1536 | ||
| 1326 | int fastcall wake_up_state(task_t *p, unsigned int state) | 1537 | int fastcall wake_up_state(struct task_struct *p, unsigned int state) |
| 1327 | { | 1538 | { |
| 1328 | return try_to_wake_up(p, state, 0); | 1539 | return try_to_wake_up(p, state, 0); |
| 1329 | } | 1540 | } |
| @@ -1332,7 +1543,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state) | |||
| 1332 | * Perform scheduler related setup for a newly forked process p. | 1543 | * Perform scheduler related setup for a newly forked process p. |
| 1333 | * p is forked by current. | 1544 | * p is forked by current. |
| 1334 | */ | 1545 | */ |
| 1335 | void fastcall sched_fork(task_t *p, int clone_flags) | 1546 | void fastcall sched_fork(struct task_struct *p, int clone_flags) |
| 1336 | { | 1547 | { |
| 1337 | int cpu = get_cpu(); | 1548 | int cpu = get_cpu(); |
| 1338 | 1549 | ||
| @@ -1348,10 +1559,17 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
| 1348 | * event cannot wake it up and insert it on the runqueue either. | 1559 | * event cannot wake it up and insert it on the runqueue either. |
| 1349 | */ | 1560 | */ |
| 1350 | p->state = TASK_RUNNING; | 1561 | p->state = TASK_RUNNING; |
| 1562 | |||
| 1563 | /* | ||
| 1564 | * Make sure we do not leak PI boosting priority to the child: | ||
| 1565 | */ | ||
| 1566 | p->prio = current->normal_prio; | ||
| 1567 | |||
| 1351 | INIT_LIST_HEAD(&p->run_list); | 1568 | INIT_LIST_HEAD(&p->run_list); |
| 1352 | p->array = NULL; | 1569 | p->array = NULL; |
| 1353 | #ifdef CONFIG_SCHEDSTATS | 1570 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 1354 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1571 | if (unlikely(sched_info_on())) |
| 1572 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | ||
| 1355 | #endif | 1573 | #endif |
| 1356 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1574 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
| 1357 | p->oncpu = 0; | 1575 | p->oncpu = 0; |
| @@ -1394,11 +1612,11 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
| 1394 | * that must be done for every newly created context, then puts the task | 1612 | * that must be done for every newly created context, then puts the task |
| 1395 | * on the runqueue and wakes it. | 1613 | * on the runqueue and wakes it. |
| 1396 | */ | 1614 | */ |
| 1397 | void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | 1615 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
| 1398 | { | 1616 | { |
| 1617 | struct rq *rq, *this_rq; | ||
| 1399 | unsigned long flags; | 1618 | unsigned long flags; |
| 1400 | int this_cpu, cpu; | 1619 | int this_cpu, cpu; |
| 1401 | runqueue_t *rq, *this_rq; | ||
| 1402 | 1620 | ||
| 1403 | rq = task_rq_lock(p, &flags); | 1621 | rq = task_rq_lock(p, &flags); |
| 1404 | BUG_ON(p->state != TASK_RUNNING); | 1622 | BUG_ON(p->state != TASK_RUNNING); |
| @@ -1427,10 +1645,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
| 1427 | __activate_task(p, rq); | 1645 | __activate_task(p, rq); |
| 1428 | else { | 1646 | else { |
| 1429 | p->prio = current->prio; | 1647 | p->prio = current->prio; |
| 1648 | p->normal_prio = current->normal_prio; | ||
| 1430 | list_add_tail(&p->run_list, ¤t->run_list); | 1649 | list_add_tail(&p->run_list, ¤t->run_list); |
| 1431 | p->array = current->array; | 1650 | p->array = current->array; |
| 1432 | p->array->nr_active++; | 1651 | p->array->nr_active++; |
| 1433 | rq->nr_running++; | 1652 | inc_nr_running(p, rq); |
| 1434 | } | 1653 | } |
| 1435 | set_need_resched(); | 1654 | set_need_resched(); |
| 1436 | } else | 1655 | } else |
| @@ -1477,10 +1696,10 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
| 1477 | * artificially, because any timeslice recovered here | 1696 | * artificially, because any timeslice recovered here |
| 1478 | * was given away by the parent in the first place.) | 1697 | * was given away by the parent in the first place.) |
| 1479 | */ | 1698 | */ |
| 1480 | void fastcall sched_exit(task_t *p) | 1699 | void fastcall sched_exit(struct task_struct *p) |
| 1481 | { | 1700 | { |
| 1482 | unsigned long flags; | 1701 | unsigned long flags; |
| 1483 | runqueue_t *rq; | 1702 | struct rq *rq; |
| 1484 | 1703 | ||
| 1485 | /* | 1704 | /* |
| 1486 | * If the child was a (relative-) CPU hog then decrease | 1705 | * If the child was a (relative-) CPU hog then decrease |
| @@ -1511,7 +1730,7 @@ void fastcall sched_exit(task_t *p) | |||
| 1511 | * prepare_task_switch sets up locking and calls architecture specific | 1730 | * prepare_task_switch sets up locking and calls architecture specific |
| 1512 | * hooks. | 1731 | * hooks. |
| 1513 | */ | 1732 | */ |
| 1514 | static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | 1733 | static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) |
| 1515 | { | 1734 | { |
| 1516 | prepare_lock_switch(rq, next); | 1735 | prepare_lock_switch(rq, next); |
| 1517 | prepare_arch_switch(next); | 1736 | prepare_arch_switch(next); |
| @@ -1532,31 +1751,31 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | |||
| 1532 | * with the lock held can cause deadlocks; see schedule() for | 1751 | * with the lock held can cause deadlocks; see schedule() for |
| 1533 | * details.) | 1752 | * details.) |
| 1534 | */ | 1753 | */ |
| 1535 | static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | 1754 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) |
| 1536 | __releases(rq->lock) | 1755 | __releases(rq->lock) |
| 1537 | { | 1756 | { |
| 1538 | struct mm_struct *mm = rq->prev_mm; | 1757 | struct mm_struct *mm = rq->prev_mm; |
| 1539 | unsigned long prev_task_flags; | 1758 | long prev_state; |
| 1540 | 1759 | ||
| 1541 | rq->prev_mm = NULL; | 1760 | rq->prev_mm = NULL; |
| 1542 | 1761 | ||
| 1543 | /* | 1762 | /* |
| 1544 | * A task struct has one reference for the use as "current". | 1763 | * A task struct has one reference for the use as "current". |
| 1545 | * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and | 1764 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls |
| 1546 | * calls schedule one last time. The schedule call will never return, | 1765 | * schedule one last time. The schedule call will never return, and |
| 1547 | * and the scheduled task must drop that reference. | 1766 | * the scheduled task must drop that reference. |
| 1548 | * The test for EXIT_ZOMBIE must occur while the runqueue locks are | 1767 | * The test for TASK_DEAD must occur while the runqueue locks are |
| 1549 | * still held, otherwise prev could be scheduled on another cpu, die | 1768 | * still held, otherwise prev could be scheduled on another cpu, die |
| 1550 | * there before we look at prev->state, and then the reference would | 1769 | * there before we look at prev->state, and then the reference would |
| 1551 | * be dropped twice. | 1770 | * be dropped twice. |
| 1552 | * Manfred Spraul <manfred@colorfullife.com> | 1771 | * Manfred Spraul <manfred@colorfullife.com> |
| 1553 | */ | 1772 | */ |
| 1554 | prev_task_flags = prev->flags; | 1773 | prev_state = prev->state; |
| 1555 | finish_arch_switch(prev); | 1774 | finish_arch_switch(prev); |
| 1556 | finish_lock_switch(rq, prev); | 1775 | finish_lock_switch(rq, prev); |
| 1557 | if (mm) | 1776 | if (mm) |
| 1558 | mmdrop(mm); | 1777 | mmdrop(mm); |
| 1559 | if (unlikely(prev_task_flags & PF_DEAD)) { | 1778 | if (unlikely(prev_state == TASK_DEAD)) { |
| 1560 | /* | 1779 | /* |
| 1561 | * Remove function-return probe instances associated with this | 1780 | * Remove function-return probe instances associated with this |
| 1562 | * task and put them back on the free list. | 1781 | * task and put them back on the free list. |
| @@ -1570,10 +1789,11 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | |||
| 1570 | * schedule_tail - first thing a freshly forked thread must call. | 1789 | * schedule_tail - first thing a freshly forked thread must call. |
| 1571 | * @prev: the thread we just switched away from. | 1790 | * @prev: the thread we just switched away from. |
| 1572 | */ | 1791 | */ |
| 1573 | asmlinkage void schedule_tail(task_t *prev) | 1792 | asmlinkage void schedule_tail(struct task_struct *prev) |
| 1574 | __releases(rq->lock) | 1793 | __releases(rq->lock) |
| 1575 | { | 1794 | { |
| 1576 | runqueue_t *rq = this_rq(); | 1795 | struct rq *rq = this_rq(); |
| 1796 | |||
| 1577 | finish_task_switch(rq, prev); | 1797 | finish_task_switch(rq, prev); |
| 1578 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 1798 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
| 1579 | /* In this case, finish_task_switch does not reenable preemption */ | 1799 | /* In this case, finish_task_switch does not reenable preemption */ |
| @@ -1587,8 +1807,9 @@ asmlinkage void schedule_tail(task_t *prev) | |||
| 1587 | * context_switch - switch to the new MM and the new | 1807 | * context_switch - switch to the new MM and the new |
| 1588 | * thread's register state. | 1808 | * thread's register state. |
| 1589 | */ | 1809 | */ |
| 1590 | static inline | 1810 | static inline struct task_struct * |
| 1591 | task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) | 1811 | context_switch(struct rq *rq, struct task_struct *prev, |
| 1812 | struct task_struct *next) | ||
| 1592 | { | 1813 | { |
| 1593 | struct mm_struct *mm = next->mm; | 1814 | struct mm_struct *mm = next->mm; |
| 1594 | struct mm_struct *oldmm = prev->active_mm; | 1815 | struct mm_struct *oldmm = prev->active_mm; |
| @@ -1605,6 +1826,15 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) | |||
| 1605 | WARN_ON(rq->prev_mm); | 1826 | WARN_ON(rq->prev_mm); |
| 1606 | rq->prev_mm = oldmm; | 1827 | rq->prev_mm = oldmm; |
| 1607 | } | 1828 | } |
| 1829 | /* | ||
| 1830 | * Since the runqueue lock will be released by the next | ||
| 1831 | * task (which is an invalid locking op but in the case | ||
| 1832 | * of the scheduler it's an obvious special-case), we | ||

| 1833 | * do an early lockdep release here: | ||
| 1834 | */ | ||
| 1835 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 1836 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | ||
| 1837 | #endif | ||
| 1608 | 1838 | ||
| 1609 | /* Here we just switch the register state and the stack. */ | 1839 | /* Here we just switch the register state and the stack. */ |
| 1610 | switch_to(prev, next, prev); | 1840 | switch_to(prev, next, prev); |
| @@ -1648,7 +1878,8 @@ unsigned long nr_uninterruptible(void) | |||
| 1648 | 1878 | ||
| 1649 | unsigned long long nr_context_switches(void) | 1879 | unsigned long long nr_context_switches(void) |
| 1650 | { | 1880 | { |
| 1651 | unsigned long long i, sum = 0; | 1881 | int i; |
| 1882 | unsigned long long sum = 0; | ||
| 1652 | 1883 | ||
| 1653 | for_each_possible_cpu(i) | 1884 | for_each_possible_cpu(i) |
| 1654 | sum += cpu_rq(i)->nr_switches; | 1885 | sum += cpu_rq(i)->nr_switches; |
| @@ -1684,15 +1915,21 @@ unsigned long nr_active(void) | |||
| 1684 | #ifdef CONFIG_SMP | 1915 | #ifdef CONFIG_SMP |
| 1685 | 1916 | ||
| 1686 | /* | 1917 | /* |
| 1918 | * Is this task likely cache-hot: | ||
| 1919 | */ | ||
| 1920 | static inline int | ||
| 1921 | task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) | ||
| 1922 | { | ||
| 1923 | return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; | ||
| 1924 | } | ||
| 1925 | |||
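task_hot() above treats a task as cache-hot when it last ran more recently than the domain's cache_hot_time, using a signed delta so slightly skewed per-CPU clocks do not blow up the comparison. A standalone sketch of the same check; field and struct names here are simplified stand-ins:

#include <stdio.h>

struct sched_domain_model { unsigned long long cache_hot_time; }; /* ns */
struct task_model         { unsigned long long last_ran; };       /* ns */

/* Same comparison as the patch: cast to signed so a 'now' that lags
 * last_ran (cross-CPU clock skew) still behaves sensibly. */
static int task_hot_model(const struct task_model *p, unsigned long long now,
			  const struct sched_domain_model *sd)
{
	return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
}

int main(void)
{
	struct sched_domain_model sd = { .cache_hot_time = 10000000ULL }; /* 10 ms */
	struct task_model p = { .last_ran = 50000000ULL };

	printf("hot at 52ms: %d\n", task_hot_model(&p, 52000000ULL, &sd)); /* 1 */
	printf("hot at 90ms: %d\n", task_hot_model(&p, 90000000ULL, &sd)); /* 0 */
	return 0;
}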
| 1926 | /* | ||
| 1687 | * double_rq_lock - safely lock two runqueues | 1927 | * double_rq_lock - safely lock two runqueues |
| 1688 | * | 1928 | * |
| 1689 | * We must take them in cpu order to match code in | ||
| 1690 | * dependent_sleeper and wake_dependent_sleeper. | ||
| 1691 | * | ||
| 1692 | * Note this does not disable interrupts like task_rq_lock, | 1929 | * Note this does not disable interrupts like task_rq_lock, |
| 1693 | * you need to do so manually before calling. | 1930 | * you need to do so manually before calling. |
| 1694 | */ | 1931 | */ |
| 1695 | static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | 1932 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) |
| 1696 | __acquires(rq1->lock) | 1933 | __acquires(rq1->lock) |
| 1697 | __acquires(rq2->lock) | 1934 | __acquires(rq2->lock) |
| 1698 | { | 1935 | { |
| @@ -1700,7 +1937,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
| 1700 | spin_lock(&rq1->lock); | 1937 | spin_lock(&rq1->lock); |
| 1701 | __acquire(rq2->lock); /* Fake it out ;) */ | 1938 | __acquire(rq2->lock); /* Fake it out ;) */ |
| 1702 | } else { | 1939 | } else { |
| 1703 | if (rq1->cpu < rq2->cpu) { | 1940 | if (rq1 < rq2) { |
| 1704 | spin_lock(&rq1->lock); | 1941 | spin_lock(&rq1->lock); |
| 1705 | spin_lock(&rq2->lock); | 1942 | spin_lock(&rq2->lock); |
| 1706 | } else { | 1943 | } else { |
| @@ -1716,7 +1953,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
| 1716 | * Note this does not restore interrupts like task_rq_unlock, | 1953 | * Note this does not restore interrupts like task_rq_unlock, |
| 1717 | * you need to do so manually after calling. | 1954 | * you need to do so manually after calling. |
| 1718 | */ | 1955 | */ |
| 1719 | static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | 1956 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) |
| 1720 | __releases(rq1->lock) | 1957 | __releases(rq1->lock) |
| 1721 | __releases(rq2->lock) | 1958 | __releases(rq2->lock) |
| 1722 | { | 1959 | { |
| @@ -1730,13 +1967,13 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | |||
| 1730 | /* | 1967 | /* |
| 1731 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 1968 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
| 1732 | */ | 1969 | */ |
| 1733 | static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | 1970 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) |
| 1734 | __releases(this_rq->lock) | 1971 | __releases(this_rq->lock) |
| 1735 | __acquires(busiest->lock) | 1972 | __acquires(busiest->lock) |
| 1736 | __acquires(this_rq->lock) | 1973 | __acquires(this_rq->lock) |
| 1737 | { | 1974 | { |
| 1738 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1975 | if (unlikely(!spin_trylock(&busiest->lock))) { |
| 1739 | if (busiest->cpu < this_rq->cpu) { | 1976 | if (busiest < this_rq) { |
| 1740 | spin_unlock(&this_rq->lock); | 1977 | spin_unlock(&this_rq->lock); |
| 1741 | spin_lock(&busiest->lock); | 1978 | spin_lock(&busiest->lock); |
| 1742 | spin_lock(&this_rq->lock); | 1979 | spin_lock(&this_rq->lock); |
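double_rq_lock() and double_lock_balance() above switch from ordering the two runqueue locks by CPU number to ordering them by pointer value; any fixed global order is enough to rule out an AB/BA deadlock. A userspace pthread sketch of the same pattern (names are illustrative; comparing unrelated pointers assumes a flat address space, as the kernel code does):

#include <pthread.h>
#include <stdio.h>

/* Always take the two locks in ascending address order, so two threads
 * locking the same pair can never deadlock against each other. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
	} else if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t y = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&x, &y);   /* same acquisition order ... */
	unlock_pair(&x, &y);
	lock_pair(&y, &x);   /* ... no matter how the caller names them */
	unlock_pair(&y, &x);
	puts("ok");
	return 0;
}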
| @@ -1751,11 +1988,11 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
| 1751 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 1988 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
| 1752 | * the cpu_allowed mask is restored. | 1989 | * the cpu_allowed mask is restored. |
| 1753 | */ | 1990 | */ |
| 1754 | static void sched_migrate_task(task_t *p, int dest_cpu) | 1991 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) |
| 1755 | { | 1992 | { |
| 1756 | migration_req_t req; | 1993 | struct migration_req req; |
| 1757 | runqueue_t *rq; | ||
| 1758 | unsigned long flags; | 1994 | unsigned long flags; |
| 1995 | struct rq *rq; | ||
| 1759 | 1996 | ||
| 1760 | rq = task_rq_lock(p, &flags); | 1997 | rq = task_rq_lock(p, &flags); |
| 1761 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | 1998 | if (!cpu_isset(dest_cpu, p->cpus_allowed) |
| @@ -1766,11 +2003,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu) | |||
| 1766 | if (migrate_task(p, dest_cpu, &req)) { | 2003 | if (migrate_task(p, dest_cpu, &req)) { |
| 1767 | /* Need to wait for migration thread (might exit: take ref). */ | 2004 | /* Need to wait for migration thread (might exit: take ref). */ |
| 1768 | struct task_struct *mt = rq->migration_thread; | 2005 | struct task_struct *mt = rq->migration_thread; |
| 2006 | |||
| 1769 | get_task_struct(mt); | 2007 | get_task_struct(mt); |
| 1770 | task_rq_unlock(rq, &flags); | 2008 | task_rq_unlock(rq, &flags); |
| 1771 | wake_up_process(mt); | 2009 | wake_up_process(mt); |
| 1772 | put_task_struct(mt); | 2010 | put_task_struct(mt); |
| 1773 | wait_for_completion(&req.done); | 2011 | wait_for_completion(&req.done); |
| 2012 | |||
| 1774 | return; | 2013 | return; |
| 1775 | } | 2014 | } |
| 1776 | out: | 2015 | out: |
| @@ -1794,14 +2033,14 @@ void sched_exec(void) | |||
| 1794 | * pull_task - move a task from a remote runqueue to the local runqueue. | 2033 | * pull_task - move a task from a remote runqueue to the local runqueue. |
| 1795 | * Both runqueues must be locked. | 2034 | * Both runqueues must be locked. |
| 1796 | */ | 2035 | */ |
| 1797 | static | 2036 | static void pull_task(struct rq *src_rq, struct prio_array *src_array, |
| 1798 | void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | 2037 | struct task_struct *p, struct rq *this_rq, |
| 1799 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 2038 | struct prio_array *this_array, int this_cpu) |
| 1800 | { | 2039 | { |
| 1801 | dequeue_task(p, src_array); | 2040 | dequeue_task(p, src_array); |
| 1802 | src_rq->nr_running--; | 2041 | dec_nr_running(p, src_rq); |
| 1803 | set_task_cpu(p, this_cpu); | 2042 | set_task_cpu(p, this_cpu); |
| 1804 | this_rq->nr_running++; | 2043 | inc_nr_running(p, this_rq); |
| 1805 | enqueue_task(p, this_array); | 2044 | enqueue_task(p, this_array); |
| 1806 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 2045 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
| 1807 | + this_rq->timestamp_last_tick; | 2046 | + this_rq->timestamp_last_tick; |
| @@ -1817,7 +2056,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
| 1817 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 2056 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
| 1818 | */ | 2057 | */ |
| 1819 | static | 2058 | static |
| 1820 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 2059 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
| 1821 | struct sched_domain *sd, enum idle_type idle, | 2060 | struct sched_domain *sd, enum idle_type idle, |
| 1822 | int *all_pinned) | 2061 | int *all_pinned) |
| 1823 | { | 2062 | { |
| @@ -1848,26 +2087,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
| 1848 | return 1; | 2087 | return 1; |
| 1849 | } | 2088 | } |
| 1850 | 2089 | ||
| 2090 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | ||
| 2091 | |||
| 1851 | /* | 2092 | /* |
| 1852 | * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, | 2093 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted |
| 1853 | * as part of a balancing operation within "domain". Returns the number of | 2094 | * load from busiest to this_rq, as part of a balancing operation within |
| 1854 | * tasks moved. | 2095 | * "domain". Returns the number of tasks moved. |
| 1855 | * | 2096 | * |
| 1856 | * Called with both runqueues locked. | 2097 | * Called with both runqueues locked. |
| 1857 | */ | 2098 | */ |
| 1858 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 2099 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 1859 | unsigned long max_nr_move, struct sched_domain *sd, | 2100 | unsigned long max_nr_move, unsigned long max_load_move, |
| 1860 | enum idle_type idle, int *all_pinned) | 2101 | struct sched_domain *sd, enum idle_type idle, |
| 2102 | int *all_pinned) | ||
| 1861 | { | 2103 | { |
| 1862 | prio_array_t *array, *dst_array; | 2104 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, |
| 2105 | best_prio_seen, skip_for_load; | ||
| 2106 | struct prio_array *array, *dst_array; | ||
| 1863 | struct list_head *head, *curr; | 2107 | struct list_head *head, *curr; |
| 1864 | int idx, pulled = 0, pinned = 0; | 2108 | struct task_struct *tmp; |
| 1865 | task_t *tmp; | 2109 | long rem_load_move; |
| 1866 | 2110 | ||
| 1867 | if (max_nr_move == 0) | 2111 | if (max_nr_move == 0 || max_load_move == 0) |
| 1868 | goto out; | 2112 | goto out; |
| 1869 | 2113 | ||
| 2114 | rem_load_move = max_load_move; | ||
| 1870 | pinned = 1; | 2115 | pinned = 1; |
| 2116 | this_best_prio = rq_best_prio(this_rq); | ||
| 2117 | best_prio = rq_best_prio(busiest); | ||
| 2118 | /* | ||
| 2119 | * Enable handling of the case where there is more than one task | ||
| 2120 | * with the best priority. If the current running task is one | ||
| 2121 | * of those with prio==best_prio we know it won't be moved | ||
| 2122 | * and therefore it's safe to override the skip (based on load) of | ||
| 2123 | * any task we find with that prio. | ||
| 2124 | */ | ||
| 2125 | best_prio_seen = best_prio == busiest->curr->prio; | ||
| 1871 | 2126 | ||
| 1872 | /* | 2127 | /* |
| 1873 | * We first consider expired tasks. Those will likely not be | 2128 | * We first consider expired tasks. Those will likely not be |
| @@ -1903,11 +2158,22 @@ skip_bitmap: | |||
| 1903 | head = array->queue + idx; | 2158 | head = array->queue + idx; |
| 1904 | curr = head->prev; | 2159 | curr = head->prev; |
| 1905 | skip_queue: | 2160 | skip_queue: |
| 1906 | tmp = list_entry(curr, task_t, run_list); | 2161 | tmp = list_entry(curr, struct task_struct, run_list); |
| 1907 | 2162 | ||
| 1908 | curr = curr->prev; | 2163 | curr = curr->prev; |
| 1909 | 2164 | ||
| 1910 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2165 | /* |
| 2166 | * To help distribute high priority tasks across CPUs we don't | ||
| 2167 | * skip a task if it will be the highest priority task (i.e. smallest | ||
| 2168 | * prio value) on its new queue regardless of its load weight | ||
| 2169 | */ | ||
| 2170 | skip_for_load = tmp->load_weight > rem_load_move; | ||
| 2171 | if (skip_for_load && idx < this_best_prio) | ||
| 2172 | skip_for_load = !best_prio_seen && idx == best_prio; | ||
| 2173 | if (skip_for_load || | ||
| 2174 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | ||
| 2175 | |||
| 2176 | best_prio_seen |= idx == best_prio; | ||
| 1911 | if (curr != head) | 2177 | if (curr != head) |
| 1912 | goto skip_queue; | 2178 | goto skip_queue; |
| 1913 | idx++; | 2179 | idx++; |
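The skip_for_load decision above normally skips a candidate whose load_weight exceeds the remaining weighted load to move, but lets it through anyway when it would become the highest-priority task on the destination queue, unless it is the busiest queue's not-yet-seen best-priority task. A small sketch of just that decision (all arguments are simplified stand-ins; lower prio value means higher priority):

#include <stdio.h>

static int skip_candidate(unsigned long load_weight, unsigned long rem_load_move,
			  int idx, int this_best_prio,
			  int best_prio, int best_prio_seen)
{
	/* Too heavy for the remaining budget: skip by default. */
	int skip_for_load = load_weight > rem_load_move;

	/* ...unless it would be top priority on the destination, in which
	 * case only hold it back if it is the busiest queue's sole
	 * best-prio task seen so far. */
	if (skip_for_load && idx < this_best_prio)
		skip_for_load = !best_prio_seen && idx == best_prio;
	return skip_for_load;
}

int main(void)
{
	/* Heavy, but top priority on the destination and not the unseen
	 * best-prio task of the busiest queue: pull it anyway. */
	printf("%d\n", skip_candidate(2048, 1024, 100, 120, 110, 0)); /* 0 */
	/* Heavy and it is the only best-prio task seen so far: skip. */
	printf("%d\n", skip_candidate(2048, 1024, 100, 120, 100, 0)); /* 1 */
	/* Heavy with no priority justification: skip. */
	printf("%d\n", skip_candidate(2048, 1024, 125, 120, 100, 1)); /* 1 */
	return 0;
}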
| @@ -1921,9 +2187,15 @@ skip_queue: | |||
| 1921 | 2187 | ||
| 1922 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2188 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
| 1923 | pulled++; | 2189 | pulled++; |
| 2190 | rem_load_move -= tmp->load_weight; | ||
| 1924 | 2191 | ||
| 1925 | /* We only want to steal up to the prescribed number of tasks. */ | 2192 | /* |
| 1926 | if (pulled < max_nr_move) { | 2193 | * We only want to steal up to the prescribed number of tasks |
| 2194 | * and the prescribed amount of weighted load. | ||
| 2195 | */ | ||
| 2196 | if (pulled < max_nr_move && rem_load_move > 0) { | ||
| 2197 | if (idx < this_best_prio) | ||
| 2198 | this_best_prio = idx; | ||
| 1927 | if (curr != head) | 2199 | if (curr != head) |
| 1928 | goto skip_queue; | 2200 | goto skip_queue; |
| 1929 | idx++; | 2201 | idx++; |
| @@ -1944,19 +2216,30 @@ out: | |||
| 1944 | 2216 | ||
| 1945 | /* | 2217 | /* |
| 1946 | * find_busiest_group finds and returns the busiest CPU group within the | 2218 | * find_busiest_group finds and returns the busiest CPU group within the |
| 1947 | * domain. It calculates and returns the number of tasks which should be | 2219 | * domain. It calculates and returns the amount of weighted load which |
| 1948 | * moved to restore balance via the imbalance parameter. | 2220 | * should be moved to restore balance via the imbalance parameter. |
| 1949 | */ | 2221 | */ |
| 1950 | static struct sched_group * | 2222 | static struct sched_group * |
| 1951 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2223 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
| 1952 | unsigned long *imbalance, enum idle_type idle, int *sd_idle) | 2224 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, |
| 2225 | cpumask_t *cpus) | ||
| 1953 | { | 2226 | { |
| 1954 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2227 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
| 1955 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2228 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
| 1956 | unsigned long max_pull; | 2229 | unsigned long max_pull; |
| 2230 | unsigned long busiest_load_per_task, busiest_nr_running; | ||
| 2231 | unsigned long this_load_per_task, this_nr_running; | ||
| 1957 | int load_idx; | 2232 | int load_idx; |
| 2233 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 2234 | int power_savings_balance = 1; | ||
| 2235 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | ||
| 2236 | unsigned long min_nr_running = ULONG_MAX; | ||
| 2237 | struct sched_group *group_min = NULL, *group_leader = NULL; | ||
| 2238 | #endif | ||
| 1958 | 2239 | ||
| 1959 | max_load = this_load = total_load = total_pwr = 0; | 2240 | max_load = this_load = total_load = total_pwr = 0; |
| 2241 | busiest_load_per_task = busiest_nr_running = 0; | ||
| 2242 | this_load_per_task = this_nr_running = 0; | ||
| 1960 | if (idle == NOT_IDLE) | 2243 | if (idle == NOT_IDLE) |
| 1961 | load_idx = sd->busy_idx; | 2244 | load_idx = sd->busy_idx; |
| 1962 | else if (idle == NEWLY_IDLE) | 2245 | else if (idle == NEWLY_IDLE) |
| @@ -1965,16 +2248,24 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1965 | load_idx = sd->idle_idx; | 2248 | load_idx = sd->idle_idx; |
| 1966 | 2249 | ||
| 1967 | do { | 2250 | do { |
| 1968 | unsigned long load; | 2251 | unsigned long load, group_capacity; |
| 1969 | int local_group; | 2252 | int local_group; |
| 1970 | int i; | 2253 | int i; |
| 2254 | unsigned long sum_nr_running, sum_weighted_load; | ||
| 1971 | 2255 | ||
| 1972 | local_group = cpu_isset(this_cpu, group->cpumask); | 2256 | local_group = cpu_isset(this_cpu, group->cpumask); |
| 1973 | 2257 | ||
| 1974 | /* Tally up the load of all CPUs in the group */ | 2258 | /* Tally up the load of all CPUs in the group */ |
| 1975 | avg_load = 0; | 2259 | sum_weighted_load = sum_nr_running = avg_load = 0; |
| 1976 | 2260 | ||
| 1977 | for_each_cpu_mask(i, group->cpumask) { | 2261 | for_each_cpu_mask(i, group->cpumask) { |
| 2262 | struct rq *rq; | ||
| 2263 | |||
| 2264 | if (!cpu_isset(i, *cpus)) | ||
| 2265 | continue; | ||
| 2266 | |||
| 2267 | rq = cpu_rq(i); | ||
| 2268 | |||
| 1978 | if (*sd_idle && !idle_cpu(i)) | 2269 | if (*sd_idle && !idle_cpu(i)) |
| 1979 | *sd_idle = 0; | 2270 | *sd_idle = 0; |
| 1980 | 2271 | ||
| @@ -1985,6 +2276,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1985 | load = source_load(i, load_idx); | 2276 | load = source_load(i, load_idx); |
| 1986 | 2277 | ||
| 1987 | avg_load += load; | 2278 | avg_load += load; |
| 2279 | sum_nr_running += rq->nr_running; | ||
| 2280 | sum_weighted_load += rq->raw_weighted_load; | ||
| 1988 | } | 2281 | } |
| 1989 | 2282 | ||
| 1990 | total_load += avg_load; | 2283 | total_load += avg_load; |
| @@ -1993,17 +2286,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 1993 | /* Adjust by relative CPU power of the group */ | 2286 | /* Adjust by relative CPU power of the group */ |
| 1994 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2287 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
| 1995 | 2288 | ||
| 2289 | group_capacity = group->cpu_power / SCHED_LOAD_SCALE; | ||
| 2290 | |||
| 1996 | if (local_group) { | 2291 | if (local_group) { |
| 1997 | this_load = avg_load; | 2292 | this_load = avg_load; |
| 1998 | this = group; | 2293 | this = group; |
| 1999 | } else if (avg_load > max_load) { | 2294 | this_nr_running = sum_nr_running; |
| 2295 | this_load_per_task = sum_weighted_load; | ||
| 2296 | } else if (avg_load > max_load && | ||
| 2297 | sum_nr_running > group_capacity) { | ||
| 2000 | max_load = avg_load; | 2298 | max_load = avg_load; |
| 2001 | busiest = group; | 2299 | busiest = group; |
| 2300 | busiest_nr_running = sum_nr_running; | ||
| 2301 | busiest_load_per_task = sum_weighted_load; | ||
| 2302 | } | ||
| 2303 | |||
| 2304 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 2305 | /* | ||
| 2306 | * Busy processors will not participate in power savings | ||
| 2307 | * balance. | ||
| 2308 | */ | ||
| 2309 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
| 2310 | goto group_next; | ||
| 2311 | |||
| 2312 | /* | ||
| 2313 | * If the local group is idle or completely loaded | ||
| 2314 | * no need to do power savings balance at this domain | ||
| 2315 | */ | ||
| 2316 | if (local_group && (this_nr_running >= group_capacity || | ||
| 2317 | !this_nr_running)) | ||
| 2318 | power_savings_balance = 0; | ||
| 2319 | |||
| 2320 | /* | ||
| 2321 | * If a group is already running at full capacity or idle, | ||
| 2322 | * don't include that group in power savings calculations | ||
| 2323 | */ | ||
| 2324 | if (!power_savings_balance || sum_nr_running >= group_capacity | ||
| 2325 | || !sum_nr_running) | ||
| 2326 | goto group_next; | ||
| 2327 | |||
| 2328 | /* | ||
| 2329 | * Calculate the group which has the least non-idle load. | ||
| 2330 | * This is the group from where we need to pick up the load | ||
| 2331 | * for saving power | ||
| 2332 | */ | ||
| 2333 | if ((sum_nr_running < min_nr_running) || | ||
| 2334 | (sum_nr_running == min_nr_running && | ||
| 2335 | first_cpu(group->cpumask) < | ||
| 2336 | first_cpu(group_min->cpumask))) { | ||
| 2337 | group_min = group; | ||
| 2338 | min_nr_running = sum_nr_running; | ||
| 2339 | min_load_per_task = sum_weighted_load / | ||
| 2340 | sum_nr_running; | ||
| 2341 | } | ||
| 2342 | |||
| 2343 | /* | ||
| 2344 | * Calculate the group which is running near its | ||
| 2345 | * capacity but still has some space to pick up load | ||
| 2346 | * from other groups and save more power | ||
| 2347 | */ | ||
| 2348 | if (sum_nr_running <= group_capacity - 1) { | ||
| 2349 | if (sum_nr_running > leader_nr_running || | ||
| 2350 | (sum_nr_running == leader_nr_running && | ||
| 2351 | first_cpu(group->cpumask) > | ||
| 2352 | first_cpu(group_leader->cpumask))) { | ||
| 2353 | group_leader = group; | ||
| 2354 | leader_nr_running = sum_nr_running; | ||
| 2355 | } | ||
| 2002 | } | 2356 | } |
| 2357 | group_next: | ||
| 2358 | #endif | ||
| 2003 | group = group->next; | 2359 | group = group->next; |
| 2004 | } while (group != sd->groups); | 2360 | } while (group != sd->groups); |
| 2005 | 2361 | ||
| 2006 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) | 2362 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) |
| 2007 | goto out_balanced; | 2363 | goto out_balanced; |
| 2008 | 2364 | ||
| 2009 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 2365 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
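When CONFIG_SCHED_MC/CONFIG_SCHED_SMT power-savings balancing is enabled, the group loop above tracks two candidates: group_min, the group with the fewest running tasks (the one to drain), and group_leader, the most loaded group that still has a free slot (the one to fill). A simplified standalone selection over an array of group summaries; struct and field names are illustrative, and the patch's tie-break on each group's first CPU is omitted here:

#include <limits.h>
#include <stdio.h>

struct group_sum {
	const char   *name;
	unsigned long nr_running; /* tasks currently running in the group */
	unsigned long capacity;   /* how many tasks the group can hold    */
};

int main(void)
{
	struct group_sum g[] = {
		{ "A", 1, 2 },  /* nearly idle: candidate to drain      */
		{ "B", 2, 2 },  /* full: excluded from power balancing  */
		{ "C", 1, 2 },  /* same load as A, later in the list    */
	};
	struct group_sum *group_min = NULL, *group_leader = NULL;
	unsigned long min_nr = ULONG_MAX, leader_nr = 0;

	for (unsigned int i = 0; i < sizeof(g) / sizeof(g[0]); i++) {
		/* Full or idle groups take no part in power-savings balance. */
		if (g[i].nr_running >= g[i].capacity || !g[i].nr_running)
			continue;
		/* Group with the least non-idle load: pull its tasks away. */
		if (g[i].nr_running < min_nr) {
			group_min = &g[i];
			min_nr = g[i].nr_running;
		}
		/* Busiest group that still has a free slot: push tasks here. */
		if (g[i].nr_running <= g[i].capacity - 1 &&
		    g[i].nr_running >= leader_nr) {
			group_leader = &g[i];
			leader_nr = g[i].nr_running;
		}
	}

	printf("drain %s, fill %s\n",                /* prints: drain A, fill C */
	       group_min ? group_min->name : "-",
	       group_leader ? group_leader->name : "-");
	return 0;
}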
| @@ -2012,6 +2368,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2012 | 100*max_load <= sd->imbalance_pct*this_load) | 2368 | 100*max_load <= sd->imbalance_pct*this_load) |
| 2013 | goto out_balanced; | 2369 | goto out_balanced; |
| 2014 | 2370 | ||
| 2371 | busiest_load_per_task /= busiest_nr_running; | ||
| 2015 | /* | 2372 | /* |
| 2016 | * We're trying to get all the cpus to the average_load, so we don't | 2373 | * We're trying to get all the cpus to the average_load, so we don't |
| 2017 | * want to push ourselves above the average load, nor do we wish to | 2374 | * want to push ourselves above the average load, nor do we wish to |
| @@ -2023,21 +2380,49 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2023 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2380 | * by pulling tasks to us. Be careful of negative numbers as they'll |
| 2024 | * appear as very large values with unsigned longs. | 2381 | * appear as very large values with unsigned longs. |
| 2025 | */ | 2382 | */ |
| 2383 | if (max_load <= busiest_load_per_task) | ||
| 2384 | goto out_balanced; | ||
| 2385 | |||
| 2386 | /* | ||
| 2387 | * In the presence of smp nice balancing, certain scenarios can have | ||
| 2388 | * max load less than avg load (as we skip the groups at or below | ||
| 2389 | * its cpu_power while calculating max_load) | ||
| 2390 | */ | ||
| 2391 | if (max_load < avg_load) { | ||
| 2392 | *imbalance = 0; | ||
| 2393 | goto small_imbalance; | ||
| 2394 | } | ||
| 2026 | 2395 | ||
| 2027 | /* Don't want to pull so many tasks that a group would go idle */ | 2396 | /* Don't want to pull so many tasks that a group would go idle */ |
| 2028 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); | 2397 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
| 2029 | 2398 | ||
| 2030 | /* How much load to actually move to equalise the imbalance */ | 2399 | /* How much load to actually move to equalise the imbalance */ |
| 2031 | *imbalance = min(max_pull * busiest->cpu_power, | 2400 | *imbalance = min(max_pull * busiest->cpu_power, |
| 2032 | (avg_load - this_load) * this->cpu_power) | 2401 | (avg_load - this_load) * this->cpu_power) |
| 2033 | / SCHED_LOAD_SCALE; | 2402 | / SCHED_LOAD_SCALE; |
| 2034 | 2403 | ||
| 2035 | if (*imbalance < SCHED_LOAD_SCALE) { | 2404 | /* |
| 2036 | unsigned long pwr_now = 0, pwr_move = 0; | 2405 | * if *imbalance is less than the average load per runnable task |
| 2037 | unsigned long tmp; | 2406 | * there is no guarantee that any tasks will be moved, so we'll have |
| 2407 | * a think about bumping its value to force at least one task to be | ||
| 2408 | * moved | ||
| 2409 | */ | ||
| 2410 | if (*imbalance < busiest_load_per_task) { | ||
| 2411 | unsigned long tmp, pwr_now, pwr_move; | ||
| 2412 | unsigned int imbn; | ||
| 2413 | |||
| 2414 | small_imbalance: | ||
| 2415 | pwr_move = pwr_now = 0; | ||
| 2416 | imbn = 2; | ||
| 2417 | if (this_nr_running) { | ||
| 2418 | this_load_per_task /= this_nr_running; | ||
| 2419 | if (busiest_load_per_task > this_load_per_task) | ||
| 2420 | imbn = 1; | ||
| 2421 | } else | ||
| 2422 | this_load_per_task = SCHED_LOAD_SCALE; | ||
| 2038 | 2423 | ||
| 2039 | if (max_load - this_load >= SCHED_LOAD_SCALE*2) { | 2424 | if (max_load - this_load >= busiest_load_per_task * imbn) { |
| 2040 | *imbalance = 1; | 2425 | *imbalance = busiest_load_per_task; |
| 2041 | return busiest; | 2426 | return busiest; |
| 2042 | } | 2427 | } |
| 2043 | 2428 | ||
| @@ -2047,39 +2432,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2047 | * moving them. | 2432 | * moving them. |
| 2048 | */ | 2433 | */ |
| 2049 | 2434 | ||
| 2050 | pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); | 2435 | pwr_now += busiest->cpu_power * |
| 2051 | pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); | 2436 | min(busiest_load_per_task, max_load); |
| 2437 | pwr_now += this->cpu_power * | ||
| 2438 | min(this_load_per_task, this_load); | ||
| 2052 | pwr_now /= SCHED_LOAD_SCALE; | 2439 | pwr_now /= SCHED_LOAD_SCALE; |
| 2053 | 2440 | ||
| 2054 | /* Amount of load we'd subtract */ | 2441 | /* Amount of load we'd subtract */ |
| 2055 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; | 2442 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; |
| 2056 | if (max_load > tmp) | 2443 | if (max_load > tmp) |
| 2057 | pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, | 2444 | pwr_move += busiest->cpu_power * |
| 2058 | max_load - tmp); | 2445 | min(busiest_load_per_task, max_load - tmp); |
| 2059 | 2446 | ||
| 2060 | /* Amount of load we'd add */ | 2447 | /* Amount of load we'd add */ |
| 2061 | if (max_load*busiest->cpu_power < | 2448 | if (max_load*busiest->cpu_power < |
| 2062 | SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) | 2449 | busiest_load_per_task*SCHED_LOAD_SCALE) |
| 2063 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2450 | tmp = max_load*busiest->cpu_power/this->cpu_power; |
| 2064 | else | 2451 | else |
| 2065 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; | 2452 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; |
| 2066 | pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); | 2453 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); |
| 2067 | pwr_move /= SCHED_LOAD_SCALE; | 2454 | pwr_move /= SCHED_LOAD_SCALE; |
| 2068 | 2455 | ||
| 2069 | /* Move if we gain throughput */ | 2456 | /* Move if we gain throughput */ |
| 2070 | if (pwr_move <= pwr_now) | 2457 | if (pwr_move <= pwr_now) |
| 2071 | goto out_balanced; | 2458 | goto out_balanced; |
| 2072 | 2459 | ||
| 2073 | *imbalance = 1; | 2460 | *imbalance = busiest_load_per_task; |
| 2074 | return busiest; | ||
| 2075 | } | 2461 | } |
| 2076 | 2462 | ||
| 2077 | /* Get rid of the scaling factor, rounding down as we divide */ | ||
| 2078 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | ||
| 2079 | return busiest; | 2463 | return busiest; |
| 2080 | 2464 | ||
| 2081 | out_balanced: | 2465 | out_balanced: |
| 2466 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 2467 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
| 2468 | goto ret; | ||
| 2082 | 2469 | ||
| 2470 | if (this == group_leader && group_leader != group_min) { | ||
| 2471 | *imbalance = min_load_per_task; | ||
| 2472 | return group_min; | ||
| 2473 | } | ||
| 2474 | ret: | ||
| 2475 | #endif | ||
| 2083 | *imbalance = 0; | 2476 | *imbalance = 0; |
| 2084 | return NULL; | 2477 | return NULL; |
| 2085 | } | 2478 | } |
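When the quick small-imbalance check does not fire, the code above estimates total throughput with and without moving one task of busiest_load_per_task weight (pwr_now vs pwr_move) and only reports an imbalance when the move would gain something. A standalone run of the same arithmetic; the input loads are made up and SCHED_LOAD_SCALE is an assumed value for the sketch:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL /* assumed scale factor for this sketch */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Made-up inputs: equal cpu_power, a saturated busiest group and
	 * a nearly idle local group. */
	unsigned long busiest_power = SCHED_LOAD_SCALE, this_power = SCHED_LOAD_SCALE;
	unsigned long max_load = 256, this_load = 32;
	unsigned long busiest_load_per_task = 96, this_load_per_task = 96;
	unsigned long pwr_now = 0, pwr_move = 0, tmp;

	/* Throughput with the tasks where they are now. */
	pwr_now += busiest_power * min_ul(busiest_load_per_task, max_load);
	pwr_now += this_power * min_ul(this_load_per_task, this_load);
	pwr_now /= SCHED_LOAD_SCALE;

	/* Load we would subtract from the busiest group... */
	tmp = busiest_load_per_task * SCHED_LOAD_SCALE / busiest_power;
	if (max_load > tmp)
		pwr_move += busiest_power *
			min_ul(busiest_load_per_task, max_load - tmp);

	/* ...and the load we would add to this group. */
	if (max_load * busiest_power < busiest_load_per_task * SCHED_LOAD_SCALE)
		tmp = max_load * busiest_power / this_power;
	else
		tmp = busiest_load_per_task * SCHED_LOAD_SCALE / this_power;
	pwr_move += this_power * min_ul(this_load_per_task, this_load + tmp);
	pwr_move /= SCHED_LOAD_SCALE;

	/* prints: pwr_now=128 pwr_move=192 -> move one task */
	printf("pwr_now=%lu pwr_move=%lu -> %s\n", pwr_now, pwr_move,
	       pwr_move > pwr_now ? "move one task" : "stay balanced");
	return 0;
}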
| @@ -2087,19 +2480,27 @@ out_balanced: | |||
| 2087 | /* | 2480 | /* |
| 2088 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2481 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
| 2089 | */ | 2482 | */ |
| 2090 | static runqueue_t *find_busiest_queue(struct sched_group *group, | 2483 | static struct rq * |
| 2091 | enum idle_type idle) | 2484 | find_busiest_queue(struct sched_group *group, enum idle_type idle, |
| 2485 | unsigned long imbalance, cpumask_t *cpus) | ||
| 2092 | { | 2486 | { |
| 2093 | unsigned long load, max_load = 0; | 2487 | struct rq *busiest = NULL, *rq; |
| 2094 | runqueue_t *busiest = NULL; | 2488 | unsigned long max_load = 0; |
| 2095 | int i; | 2489 | int i; |
| 2096 | 2490 | ||
| 2097 | for_each_cpu_mask(i, group->cpumask) { | 2491 | for_each_cpu_mask(i, group->cpumask) { |
| 2098 | load = source_load(i, 0); | ||
| 2099 | 2492 | ||
| 2100 | if (load > max_load) { | 2493 | if (!cpu_isset(i, *cpus)) |
| 2101 | max_load = load; | 2494 | continue; |
| 2102 | busiest = cpu_rq(i); | 2495 | |
| 2496 | rq = cpu_rq(i); | ||
| 2497 | |||
| 2498 | if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) | ||
| 2499 | continue; | ||
| 2500 | |||
| 2501 | if (rq->raw_weighted_load > max_load) { | ||
| 2502 | max_load = rq->raw_weighted_load; | ||
| 2503 | busiest = rq; | ||
| 2103 | } | 2504 | } |
| 2104 | } | 2505 | } |
| 2105 | 2506 | ||
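find_busiest_queue() now picks the runqueue with the largest raw_weighted_load within the allowed CPU mask, and skips any queue whose single task alone outweighs the requested imbalance, since pulling it would overshoot. A simplified array-based version; the structs are stand-ins for the kernel's runqueues and cpumask:

#include <stdio.h>

struct rq_sum {
	int           cpu;
	unsigned long nr_running;
	unsigned long raw_weighted_load;
	int           allowed; /* stand-in for the cpus mask test */
};

static const struct rq_sum *pick_busiest(const struct rq_sum *rq, int n,
					 unsigned long imbalance)
{
	const struct rq_sum *busiest = NULL;
	unsigned long max_load = 0;

	for (int i = 0; i < n; i++) {
		if (!rq[i].allowed)
			continue;
		/* A lone task heavier than the imbalance would overshoot. */
		if (rq[i].nr_running == 1 && rq[i].raw_weighted_load > imbalance)
			continue;
		if (rq[i].raw_weighted_load > max_load) {
			max_load = rq[i].raw_weighted_load;
			busiest = &rq[i];
		}
	}
	return busiest;
}

int main(void)
{
	struct rq_sum rqs[] = {
		{ 0, 1, 400, 1 }, /* one task heavier than the imbalance */
		{ 1, 3, 300, 1 }, /* several lighter, movable tasks      */
		{ 2, 2, 350, 0 }, /* excluded by the cpus mask            */
	};
	const struct rq_sum *b = pick_busiest(rqs, 3, 256);

	printf("busiest cpu: %d\n", b ? b->cpu : -1); /* prints 1 */
	return 0;
}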
| @@ -2112,34 +2513,41 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, | |||
| 2112 | */ | 2513 | */ |
| 2113 | #define MAX_PINNED_INTERVAL 512 | 2514 | #define MAX_PINNED_INTERVAL 512 |
| 2114 | 2515 | ||
| 2516 | static inline unsigned long minus_1_or_zero(unsigned long n) | ||
| 2517 | { | ||
| 2518 | return n > 0 ? n - 1 : 0; | ||
| 2519 | } | ||
| 2520 | |||
| 2115 | /* | 2521 | /* |
| 2116 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2522 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
| 2117 | * tasks if there is an imbalance. | 2523 | * tasks if there is an imbalance. |
| 2118 | * | 2524 | * |
| 2119 | * Called with this_rq unlocked. | 2525 | * Called with this_rq unlocked. |
| 2120 | */ | 2526 | */ |
| 2121 | static int load_balance(int this_cpu, runqueue_t *this_rq, | 2527 | static int load_balance(int this_cpu, struct rq *this_rq, |
| 2122 | struct sched_domain *sd, enum idle_type idle) | 2528 | struct sched_domain *sd, enum idle_type idle) |
| 2123 | { | 2529 | { |
| 2530 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
| 2124 | struct sched_group *group; | 2531 | struct sched_group *group; |
| 2125 | runqueue_t *busiest; | ||
| 2126 | unsigned long imbalance; | 2532 | unsigned long imbalance; |
| 2127 | int nr_moved, all_pinned = 0; | 2533 | struct rq *busiest; |
| 2128 | int active_balance = 0; | 2534 | cpumask_t cpus = CPU_MASK_ALL; |
| 2129 | int sd_idle = 0; | ||
| 2130 | 2535 | ||
| 2131 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) | 2536 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
| 2537 | !sched_smt_power_savings) | ||
| 2132 | sd_idle = 1; | 2538 | sd_idle = 1; |
| 2133 | 2539 | ||
| 2134 | schedstat_inc(sd, lb_cnt[idle]); | 2540 | schedstat_inc(sd, lb_cnt[idle]); |
| 2135 | 2541 | ||
| 2136 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); | 2542 | redo: |
| 2543 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | ||
| 2544 | &cpus); | ||
| 2137 | if (!group) { | 2545 | if (!group) { |
| 2138 | schedstat_inc(sd, lb_nobusyg[idle]); | 2546 | schedstat_inc(sd, lb_nobusyg[idle]); |
| 2139 | goto out_balanced; | 2547 | goto out_balanced; |
| 2140 | } | 2548 | } |
| 2141 | 2549 | ||
| 2142 | busiest = find_busiest_queue(group, idle); | 2550 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); |
| 2143 | if (!busiest) { | 2551 | if (!busiest) { |
| 2144 | schedstat_inc(sd, lb_nobusyq[idle]); | 2552 | schedstat_inc(sd, lb_nobusyq[idle]); |
| 2145 | goto out_balanced; | 2553 | goto out_balanced; |
| @@ -2159,12 +2567,17 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 2159 | */ | 2567 | */ |
| 2160 | double_rq_lock(this_rq, busiest); | 2568 | double_rq_lock(this_rq, busiest); |
| 2161 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2569 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
| 2162 | imbalance, sd, idle, &all_pinned); | 2570 | minus_1_or_zero(busiest->nr_running), |
| 2571 | imbalance, sd, idle, &all_pinned); | ||
| 2163 | double_rq_unlock(this_rq, busiest); | 2572 | double_rq_unlock(this_rq, busiest); |
| 2164 | 2573 | ||
| 2165 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2574 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 2166 | if (unlikely(all_pinned)) | 2575 | if (unlikely(all_pinned)) { |
| 2576 | cpu_clear(cpu_of(busiest), cpus); | ||
| 2577 | if (!cpus_empty(cpus)) | ||
| 2578 | goto redo; | ||
| 2167 | goto out_balanced; | 2579 | goto out_balanced; |
| 2580 | } | ||
| 2168 | } | 2581 | } |
| 2169 | 2582 | ||
| 2170 | if (!nr_moved) { | 2583 | if (!nr_moved) { |
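When every task on the chosen busiest runqueue turns out to be pinned by CPU affinity, load_balance() above now clears that CPU from a working cpumask and retries the group/queue selection instead of giving up. A small sketch of just that retry structure; the balance step is a dummy that reports "all pinned" for some CPUs, and the mask is a plain bitmask rather than a cpumask_t:

#include <stdio.h>

#define NCPUS 4

/* Dummy stand-in for one balance attempt against a chosen busiest CPU:
 * returns tasks moved, or 0 with *all_pinned set when nothing can move. */
static int try_balance(int busiest_cpu, int *all_pinned)
{
	static const int pinned[NCPUS] = { 1, 1, 0, 0 }; /* cpus 0,1 hopeless */

	*all_pinned = pinned[busiest_cpu];
	return *all_pinned ? 0 : 2;
}

/* Pick the "busiest" allowed CPU; here simply the lowest one still set. */
static int pick_busiest(unsigned int mask)
{
	for (int cpu = 0; cpu < NCPUS; cpu++)
		if (mask & (1u << cpu))
			return cpu;
	return -1;
}

int main(void)
{
	unsigned int cpus = (1u << NCPUS) - 1; /* CPU_MASK_ALL equivalent */
	int moved, busiest, all_pinned;

redo:
	busiest = pick_busiest(cpus);
	if (busiest < 0) {
		puts("balanced: nothing movable anywhere");
		return 0;
	}
	moved = try_balance(busiest, &all_pinned);
	if (!moved && all_pinned) {
		cpus &= ~(1u << busiest);   /* cpu_clear(busiest, cpus) */
		if (cpus)
			goto redo;          /* retry without that CPU  */
	}
	printf("moved %d task(s) from cpu %d\n", moved, busiest); /* cpu 2 */
	return 0;
}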
| @@ -2216,7 +2629,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
| 2216 | sd->balance_interval *= 2; | 2629 | sd->balance_interval *= 2; |
| 2217 | } | 2630 | } |
| 2218 | 2631 | ||
| 2219 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2632 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 2633 | !sched_smt_power_savings) | ||
| 2220 | return -1; | 2634 | return -1; |
| 2221 | return nr_moved; | 2635 | return nr_moved; |
| 2222 | 2636 | ||
| @@ -2231,7 +2645,8 @@ out_one_pinned: | |||
| 2231 | (sd->balance_interval < sd->max_interval)) | 2645 | (sd->balance_interval < sd->max_interval)) |
| 2232 | sd->balance_interval *= 2; | 2646 | sd->balance_interval *= 2; |
| 2233 | 2647 | ||
| 2234 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2648 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 2649 | !sched_smt_power_savings) | ||
| 2235 | return -1; | 2650 | return -1; |
| 2236 | return 0; | 2651 | return 0; |
| 2237 | } | 2652 | } |
| @@ -2243,26 +2658,30 @@ out_one_pinned: | |||
| 2243 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). | 2658 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). |
| 2244 | * this_rq is locked. | 2659 | * this_rq is locked. |
| 2245 | */ | 2660 | */ |
| 2246 | static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | 2661 | static int |
| 2247 | struct sched_domain *sd) | 2662 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) |
| 2248 | { | 2663 | { |
| 2249 | struct sched_group *group; | 2664 | struct sched_group *group; |
| 2250 | runqueue_t *busiest = NULL; | 2665 | struct rq *busiest = NULL; |
| 2251 | unsigned long imbalance; | 2666 | unsigned long imbalance; |
| 2252 | int nr_moved = 0; | 2667 | int nr_moved = 0; |
| 2253 | int sd_idle = 0; | 2668 | int sd_idle = 0; |
| 2669 | cpumask_t cpus = CPU_MASK_ALL; | ||
| 2254 | 2670 | ||
| 2255 | if (sd->flags & SD_SHARE_CPUPOWER) | 2671 | if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
| 2256 | sd_idle = 1; | 2672 | sd_idle = 1; |
| 2257 | 2673 | ||
| 2258 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2674 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
| 2259 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); | 2675 | redo: |
| 2676 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, | ||
| 2677 | &sd_idle, &cpus); | ||
| 2260 | if (!group) { | 2678 | if (!group) { |
| 2261 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2679 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
| 2262 | goto out_balanced; | 2680 | goto out_balanced; |
| 2263 | } | 2681 | } |
| 2264 | 2682 | ||
| 2265 | busiest = find_busiest_queue(group, NEWLY_IDLE); | 2683 | busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, |
| 2684 | &cpus); | ||
| 2266 | if (!busiest) { | 2685 | if (!busiest) { |
| 2267 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2686 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
| 2268 | goto out_balanced; | 2687 | goto out_balanced; |
| @@ -2277,8 +2696,15 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
| 2277 | /* Attempt to move tasks */ | 2696 | /* Attempt to move tasks */ |
| 2278 | double_lock_balance(this_rq, busiest); | 2697 | double_lock_balance(this_rq, busiest); |
| 2279 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2698 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
| 2699 | minus_1_or_zero(busiest->nr_running), | ||
| 2280 | imbalance, sd, NEWLY_IDLE, NULL); | 2700 | imbalance, sd, NEWLY_IDLE, NULL); |
| 2281 | spin_unlock(&busiest->lock); | 2701 | spin_unlock(&busiest->lock); |
| 2702 | |||
| 2703 | if (!nr_moved) { | ||
| 2704 | cpu_clear(cpu_of(busiest), cpus); | ||
| 2705 | if (!cpus_empty(cpus)) | ||
| 2706 | goto redo; | ||
| 2707 | } | ||
| 2282 | } | 2708 | } |
| 2283 | 2709 | ||
| 2284 | if (!nr_moved) { | 2710 | if (!nr_moved) { |
| @@ -2292,9 +2718,11 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
| 2292 | 2718 | ||
| 2293 | out_balanced: | 2719 | out_balanced: |
| 2294 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2720 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
| 2295 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2721 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
| 2722 | !sched_smt_power_savings) | ||
| 2296 | return -1; | 2723 | return -1; |
| 2297 | sd->nr_balance_failed = 0; | 2724 | sd->nr_balance_failed = 0; |
| 2725 | |||
| 2298 | return 0; | 2726 | return 0; |
| 2299 | } | 2727 | } |
| 2300 | 2728 | ||
| @@ -2302,16 +2730,15 @@ out_balanced: | |||
| 2302 | * idle_balance is called by schedule() if this_cpu is about to become | 2730 | * idle_balance is called by schedule() if this_cpu is about to become |
| 2303 | * idle. Attempts to pull tasks from other CPUs. | 2731 | * idle. Attempts to pull tasks from other CPUs. |
| 2304 | */ | 2732 | */ |
| 2305 | static void idle_balance(int this_cpu, runqueue_t *this_rq) | 2733 | static void idle_balance(int this_cpu, struct rq *this_rq) |
| 2306 | { | 2734 | { |
| 2307 | struct sched_domain *sd; | 2735 | struct sched_domain *sd; |
| 2308 | 2736 | ||
| 2309 | for_each_domain(this_cpu, sd) { | 2737 | for_each_domain(this_cpu, sd) { |
| 2310 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 2738 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
| 2311 | if (load_balance_newidle(this_cpu, this_rq, sd)) { | 2739 | /* If we've pulled tasks over, stop searching: */ |
| 2312 | /* We've pulled tasks over so stop searching */ | 2740 | if (load_balance_newidle(this_cpu, this_rq, sd)) |
| 2313 | break; | 2741 | break; |
| 2314 | } | ||
| 2315 | } | 2742 | } |
| 2316 | } | 2743 | } |
| 2317 | } | 2744 | } |
| @@ -2324,14 +2751,14 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq) | |||
| 2324 | * | 2751 | * |
| 2325 | * Called with busiest_rq locked. | 2752 | * Called with busiest_rq locked. |
| 2326 | */ | 2753 | */ |
| 2327 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | 2754 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) |
| 2328 | { | 2755 | { |
| 2329 | struct sched_domain *sd; | ||
| 2330 | runqueue_t *target_rq; | ||
| 2331 | int target_cpu = busiest_rq->push_cpu; | 2756 | int target_cpu = busiest_rq->push_cpu; |
| 2757 | struct sched_domain *sd; | ||
| 2758 | struct rq *target_rq; | ||
| 2332 | 2759 | ||
| 2760 | /* Is there any task to move? */ | ||
| 2333 | if (busiest_rq->nr_running <= 1) | 2761 | if (busiest_rq->nr_running <= 1) |
| 2334 | /* no task to move */ | ||
| 2335 | return; | 2762 | return; |
| 2336 | 2763 | ||
| 2337 | target_rq = cpu_rq(target_cpu); | 2764 | target_rq = cpu_rq(target_cpu); |
| @@ -2347,21 +2774,22 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | |||
| 2347 | double_lock_balance(busiest_rq, target_rq); | 2774 | double_lock_balance(busiest_rq, target_rq); |
| 2348 | 2775 | ||
| 2349 | /* Search for an sd spanning us and the target CPU. */ | 2776 | /* Search for an sd spanning us and the target CPU. */ |
| 2350 | for_each_domain(target_cpu, sd) | 2777 | for_each_domain(target_cpu, sd) { |
| 2351 | if ((sd->flags & SD_LOAD_BALANCE) && | 2778 | if ((sd->flags & SD_LOAD_BALANCE) && |
| 2352 | cpu_isset(busiest_cpu, sd->span)) | 2779 | cpu_isset(busiest_cpu, sd->span)) |
| 2353 | break; | 2780 | break; |
| 2781 | } | ||
| 2354 | 2782 | ||
| 2355 | if (unlikely(sd == NULL)) | 2783 | if (likely(sd)) { |
| 2356 | goto out; | 2784 | schedstat_inc(sd, alb_cnt); |
| 2357 | |||
| 2358 | schedstat_inc(sd, alb_cnt); | ||
| 2359 | 2785 | ||
| 2360 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | 2786 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, |
| 2361 | schedstat_inc(sd, alb_pushed); | 2787 | RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, |
| 2362 | else | 2788 | NULL)) |
| 2363 | schedstat_inc(sd, alb_failed); | 2789 | schedstat_inc(sd, alb_pushed); |
| 2364 | out: | 2790 | else |
| 2791 | schedstat_inc(sd, alb_failed); | ||
| 2792 | } | ||
| 2365 | spin_unlock(&target_rq->lock); | 2793 | spin_unlock(&target_rq->lock); |
| 2366 | } | 2794 | } |
| 2367 | 2795 | ||
| @@ -2374,23 +2802,27 @@ out: | |||
| 2374 | * Balancing parameters are set up in arch_init_sched_domains. | 2802 | * Balancing parameters are set up in arch_init_sched_domains. |
| 2375 | */ | 2803 | */ |
| 2376 | 2804 | ||
| 2377 | /* Don't have all balancing operations going off at once */ | 2805 | /* Don't have all balancing operations going off at once: */ |
| 2378 | #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) | 2806 | static inline unsigned long cpu_offset(int cpu) |
| 2807 | { | ||
| 2808 | return jiffies + cpu * HZ / NR_CPUS; | ||
| 2809 | } | ||
| 2379 | 2810 | ||
| 2380 | static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | 2811 | static void |
| 2381 | enum idle_type idle) | 2812 | rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) |
| 2382 | { | 2813 | { |
| 2383 | unsigned long old_load, this_load; | 2814 | unsigned long this_load, interval, j = cpu_offset(this_cpu); |
| 2384 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); | ||
| 2385 | struct sched_domain *sd; | 2815 | struct sched_domain *sd; |
| 2386 | int i; | 2816 | int i, scale; |
| 2817 | |||
| 2818 | this_load = this_rq->raw_weighted_load; | ||
| 2819 | |||
| 2820 | /* Update our load: */ | ||
| 2821 | for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { | ||
| 2822 | unsigned long old_load, new_load; | ||
| 2387 | 2823 | ||
| 2388 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | ||
| 2389 | /* Update our load */ | ||
| 2390 | for (i = 0; i < 3; i++) { | ||
| 2391 | unsigned long new_load = this_load; | ||
| 2392 | int scale = 1 << i; | ||
| 2393 | old_load = this_rq->cpu_load[i]; | 2824 | old_load = this_rq->cpu_load[i]; |
| 2825 | new_load = this_load; | ||
| 2394 | /* | 2826 | /* |
| 2395 | * Round up the averaging division if load is increasing. This | 2827 | * Round up the averaging division if load is increasing. This |
| 2396 | * prevents us from getting stuck on 9 if the load is 10, for | 2828 | * prevents us from getting stuck on 9 if the load is 10, for |
| @@ -2402,8 +2834,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
| 2402 | } | 2834 | } |
| 2403 | 2835 | ||
| 2404 | for_each_domain(this_cpu, sd) { | 2836 | for_each_domain(this_cpu, sd) { |
| 2405 | unsigned long interval; | ||
| 2406 | |||
| 2407 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2837 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 2408 | continue; | 2838 | continue; |
| 2409 | 2839 | ||
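rebalance_tick() above folds the instantaneous raw_weighted_load into three cpu_load[] averages with progressively longer horizons (scale 1, 2, 4), rounding up when load is increasing so the average can actually reach the new value instead of stalling just below it. The exact update expression falls outside this hunk, so the following is a plausible reconstruction of the idea, stated as an assumption rather than the patch's literal formula:

#include <stdio.h>

/* Assumed decaying-average update: slot i keeps weight (scale-1)/scale on
 * history; the +scale-1 bump implements the "round up if increasing"
 * rule described in the comment above. */
static void update_cpu_load(unsigned long cpu_load[3], unsigned long this_load)
{
	for (int i = 0, scale = 1; i < 3; i++, scale <<= 1) {
		unsigned long old_load = cpu_load[i];
		unsigned long new_load = this_load;

		if (new_load > old_load)
			new_load += scale - 1;
		cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
	}
}

int main(void)
{
	unsigned long cpu_load[3] = { 0, 0, 0 };

	/* Constant load of 10: the short average tracks it immediately,
	 * the longer ones converge without getting stuck at 9. */
	for (int tick = 0; tick < 5; tick++) {
		update_cpu_load(cpu_load, 10);
		printf("tick %d: %lu %lu %lu\n", tick,
		       cpu_load[0], cpu_load[1], cpu_load[2]);
	}
	return 0;
}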
| @@ -2433,17 +2863,18 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
| 2433 | /* | 2863 | /* |
| 2434 | * on UP we do not need to balance between CPUs: | 2864 | * on UP we do not need to balance between CPUs: |
| 2435 | */ | 2865 | */ |
| 2436 | static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) | 2866 | static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) |
| 2437 | { | 2867 | { |
| 2438 | } | 2868 | } |
| 2439 | static inline void idle_balance(int cpu, runqueue_t *rq) | 2869 | static inline void idle_balance(int cpu, struct rq *rq) |
| 2440 | { | 2870 | { |
| 2441 | } | 2871 | } |
| 2442 | #endif | 2872 | #endif |
| 2443 | 2873 | ||
| 2444 | static inline int wake_priority_sleeper(runqueue_t *rq) | 2874 | static inline int wake_priority_sleeper(struct rq *rq) |
| 2445 | { | 2875 | { |
| 2446 | int ret = 0; | 2876 | int ret = 0; |
| 2877 | |||
| 2447 | #ifdef CONFIG_SCHED_SMT | 2878 | #ifdef CONFIG_SCHED_SMT |
| 2448 | spin_lock(&rq->lock); | 2879 | spin_lock(&rq->lock); |
| 2449 | /* | 2880 | /* |
| @@ -2467,25 +2898,26 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
| 2467 | * This is called on clock ticks and on context switches. | 2898 | * This is called on clock ticks and on context switches. |
| 2468 | * Bank in p->sched_time the ns elapsed since the last tick or switch. | 2899 | * Bank in p->sched_time the ns elapsed since the last tick or switch. |
| 2469 | */ | 2900 | */ |
| 2470 | static inline void update_cpu_clock(task_t *p, runqueue_t *rq, | 2901 | static inline void |
| 2471 | unsigned long long now) | 2902 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) |
| 2472 | { | 2903 | { |
| 2473 | unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); | 2904 | p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); |
| 2474 | p->sched_time += now - last; | ||
| 2475 | } | 2905 | } |
| 2476 | 2906 | ||
| 2477 | /* | 2907 | /* |
| 2478 | * Return current->sched_time plus any more ns on the sched_clock | 2908 | * Return current->sched_time plus any more ns on the sched_clock |
| 2479 | * that have not yet been banked. | 2909 | * that have not yet been banked. |
| 2480 | */ | 2910 | */ |
| 2481 | unsigned long long current_sched_time(const task_t *tsk) | 2911 | unsigned long long current_sched_time(const struct task_struct *p) |
| 2482 | { | 2912 | { |
| 2483 | unsigned long long ns; | 2913 | unsigned long long ns; |
| 2484 | unsigned long flags; | 2914 | unsigned long flags; |
| 2915 | |||
| 2485 | local_irq_save(flags); | 2916 | local_irq_save(flags); |
| 2486 | ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); | 2917 | ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); |
| 2487 | ns = tsk->sched_time + (sched_clock() - ns); | 2918 | ns = p->sched_time + sched_clock() - ns; |
| 2488 | local_irq_restore(flags); | 2919 | local_irq_restore(flags); |
| 2920 | |||
| 2489 | return ns; | 2921 | return ns; |
| 2490 | } | 2922 | } |
| 2491 | 2923 | ||
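update_cpu_clock() and current_sched_time() above bank nanoseconds into p->sched_time at ticks and context switches, and a reader adds whatever has accumulated since the later of the task's own timestamp and the runqueue's last tick. A toy model of that banking; the clocks, numbers, and struct are simplified stand-ins (in particular, the task timestamp is updated on other paths, not here):

#include <stdio.h>

struct task_clock {
	unsigned long long sched_time; /* ns banked so far                */
	unsigned long long timestamp;  /* last time this task was stamped */
};

/* Bank the ns elapsed since the last tick or switch (update_cpu_clock). */
static void bank(struct task_clock *p, unsigned long long last_tick,
		 unsigned long long now)
{
	unsigned long long last = p->timestamp > last_tick ? p->timestamp
							   : last_tick;
	p->sched_time += now - last;
}

/* Banked time plus the not-yet-banked remainder (current_sched_time). */
static unsigned long long read_sched_time(const struct task_clock *p,
					  unsigned long long last_tick,
					  unsigned long long now)
{
	unsigned long long ns = p->timestamp > last_tick ? p->timestamp
							 : last_tick;
	return p->sched_time + now - ns;
}

int main(void)
{
	struct task_clock t = { 0, 1000 };

	bank(&t, 900, 1500);                    /* banks 1500 - 1000 = 500 ns */
	printf("banked=%llu\n", t.sched_time);
	printf("read=%llu\n", read_sched_time(&t, 1400, 1700)); /* 500 + 300 */
	return 0;
}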
| @@ -2499,11 +2931,16 @@ unsigned long long current_sched_time(const task_t *tsk) | |||
| 2499 | * increasing number of running tasks. We also ignore the interactivity | 2931 | * increasing number of running tasks. We also ignore the interactivity |
| 2500 | * if a better static_prio task has expired: | 2932 | * if a better static_prio task has expired: |
| 2501 | */ | 2933 | */ |
| 2502 | #define EXPIRED_STARVING(rq) \ | 2934 | static inline int expired_starving(struct rq *rq) |
| 2503 | ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ | 2935 | { |
| 2504 | (jiffies - (rq)->expired_timestamp >= \ | 2936 | if (rq->curr->static_prio > rq->best_expired_prio) |
| 2505 | STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ | 2937 | return 1; |
| 2506 | ((rq)->curr->static_prio > (rq)->best_expired_prio)) | 2938 | if (!STARVATION_LIMIT || !rq->expired_timestamp) |
| 2939 | return 0; | ||
| 2940 | if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) | ||
| 2941 | return 1; | ||
| 2942 | return 0; | ||
| 2943 | } | ||
| 2507 | 2944 | ||
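The EXPIRED_STARVING() macro is turned into the expired_starving() helper above with the same conditions: starvation is signalled when a better static priority is waiting in the expired array, or when the expired array has waited longer than STARVATION_LIMIT scaled by the number of runnable tasks. A standalone version with a tiny check; the STARVATION_LIMIT value and the rq fields are stand-ins:

#include <stdio.h>

#define STARVATION_LIMIT 100UL /* assumed value, in jiffies */

struct rq_model {
	int           curr_static_prio;
	int           best_expired_prio;
	unsigned long expired_timestamp; /* 0 = expired array empty */
	unsigned long nr_running;
};

static int expired_starving(const struct rq_model *rq, unsigned long jiffies)
{
	/* A better static priority has already expired: make way for it. */
	if (rq->curr_static_prio > rq->best_expired_prio)
		return 1;
	if (!STARVATION_LIMIT || !rq->expired_timestamp)
		return 0;
	/* Expired tasks have waited too long relative to queue length. */
	if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
		return 1;
	return 0;
}

int main(void)
{
	struct rq_model rq = { 120, 120, 1000, 3 };

	printf("%d\n", expired_starving(&rq, 1200)); /* waited 200 < 300: 0 */
	printf("%d\n", expired_starving(&rq, 1400)); /* waited 400 > 300: 1 */
	return 0;
}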
| 2508 | /* | 2945 | /* |
| 2509 | * Account user cpu time to a process. | 2946 | * Account user cpu time to a process. |
| @@ -2536,7 +2973,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
| 2536 | cputime_t cputime) | 2973 | cputime_t cputime) |
| 2537 | { | 2974 | { |
| 2538 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2975 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
| 2539 | runqueue_t *rq = this_rq(); | 2976 | struct rq *rq = this_rq(); |
| 2540 | cputime64_t tmp; | 2977 | cputime64_t tmp; |
| 2541 | 2978 | ||
| 2542 | p->stime = cputime_add(p->stime, cputime); | 2979 | p->stime = cputime_add(p->stime, cputime); |
| @@ -2566,7 +3003,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
| 2566 | { | 3003 | { |
| 2567 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3004 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
| 2568 | cputime64_t tmp = cputime_to_cputime64(steal); | 3005 | cputime64_t tmp = cputime_to_cputime64(steal); |
| 2569 | runqueue_t *rq = this_rq(); | 3006 | struct rq *rq = this_rq(); |
| 2570 | 3007 | ||
| 2571 | if (p == rq->idle) { | 3008 | if (p == rq->idle) { |
| 2572 | p->stime = cputime_add(p->stime, steal); | 3009 | p->stime = cputime_add(p->stime, steal); |
| @@ -2587,10 +3024,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
| 2587 | */ | 3024 | */ |
| 2588 | void scheduler_tick(void) | 3025 | void scheduler_tick(void) |
| 2589 | { | 3026 | { |
| 2590 | int cpu = smp_processor_id(); | ||
| 2591 | runqueue_t *rq = this_rq(); | ||
| 2592 | task_t *p = current; | ||
| 2593 | unsigned long long now = sched_clock(); | 3027 | unsigned long long now = sched_clock(); |
| 3028 | struct task_struct *p = current; | ||
| 3029 | int cpu = smp_processor_id(); | ||
| 3030 | struct rq *rq = cpu_rq(cpu); | ||
| 2594 | 3031 | ||
| 2595 | update_cpu_clock(p, rq, now); | 3032 | update_cpu_clock(p, rq, now); |
| 2596 | 3033 | ||
| @@ -2640,7 +3077,7 @@ void scheduler_tick(void) | |||
| 2640 | 3077 | ||
| 2641 | if (!rq->expired_timestamp) | 3078 | if (!rq->expired_timestamp) |
| 2642 | rq->expired_timestamp = jiffies; | 3079 | rq->expired_timestamp = jiffies; |
| 2643 | if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { | 3080 | if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { |
| 2644 | enqueue_task(p, rq->expired); | 3081 | enqueue_task(p, rq->expired); |
| 2645 | if (p->static_prio < rq->best_expired_prio) | 3082 | if (p->static_prio < rq->best_expired_prio) |
| 2646 | rq->best_expired_prio = p->static_prio; | 3083 | rq->best_expired_prio = p->static_prio; |
| @@ -2679,55 +3116,42 @@ out: | |||
| 2679 | } | 3116 | } |
| 2680 | 3117 | ||
| 2681 | #ifdef CONFIG_SCHED_SMT | 3118 | #ifdef CONFIG_SCHED_SMT |
| 2682 | static inline void wakeup_busy_runqueue(runqueue_t *rq) | 3119 | static inline void wakeup_busy_runqueue(struct rq *rq) |
| 2683 | { | 3120 | { |
| 2684 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ | 3121 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ |
| 2685 | if (rq->curr == rq->idle && rq->nr_running) | 3122 | if (rq->curr == rq->idle && rq->nr_running) |
| 2686 | resched_task(rq->idle); | 3123 | resched_task(rq->idle); |
| 2687 | } | 3124 | } |
| 2688 | 3125 | ||
| 2689 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3126 | /* |
| 3127 | * Called with interrupt disabled and this_rq's runqueue locked. | ||
| 3128 | */ | ||
| 3129 | static void wake_sleeping_dependent(int this_cpu) | ||
| 2690 | { | 3130 | { |
| 2691 | struct sched_domain *tmp, *sd = NULL; | 3131 | struct sched_domain *tmp, *sd = NULL; |
| 2692 | cpumask_t sibling_map; | ||
| 2693 | int i; | 3132 | int i; |
| 2694 | 3133 | ||
| 2695 | for_each_domain(this_cpu, tmp) | 3134 | for_each_domain(this_cpu, tmp) { |
| 2696 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3135 | if (tmp->flags & SD_SHARE_CPUPOWER) { |
| 2697 | sd = tmp; | 3136 | sd = tmp; |
| 3137 | break; | ||
| 3138 | } | ||
| 3139 | } | ||
| 2698 | 3140 | ||
| 2699 | if (!sd) | 3141 | if (!sd) |
| 2700 | return; | 3142 | return; |
| 2701 | 3143 | ||
| 2702 | /* | 3144 | for_each_cpu_mask(i, sd->span) { |
| 2703 | * Unlock the current runqueue because we have to lock in | 3145 | struct rq *smt_rq = cpu_rq(i); |
| 2704 | * CPU order to avoid deadlocks. Caller knows that we might | ||
| 2705 | * unlock. We keep IRQs disabled. | ||
| 2706 | */ | ||
| 2707 | spin_unlock(&this_rq->lock); | ||
| 2708 | |||
| 2709 | sibling_map = sd->span; | ||
| 2710 | |||
| 2711 | for_each_cpu_mask(i, sibling_map) | ||
| 2712 | spin_lock(&cpu_rq(i)->lock); | ||
| 2713 | /* | ||
| 2714 | * We clear this CPU from the mask. This both simplifies the | ||
| 2715 | * inner loop and keps this_rq locked when we exit: | ||
| 2716 | */ | ||
| 2717 | cpu_clear(this_cpu, sibling_map); | ||
| 2718 | 3146 | ||
| 2719 | for_each_cpu_mask(i, sibling_map) { | 3147 | if (i == this_cpu) |
| 2720 | runqueue_t *smt_rq = cpu_rq(i); | 3148 | continue; |
| 3149 | if (unlikely(!spin_trylock(&smt_rq->lock))) | ||
| 3150 | continue; | ||
| 2721 | 3151 | ||
| 2722 | wakeup_busy_runqueue(smt_rq); | 3152 | wakeup_busy_runqueue(smt_rq); |
| 3153 | spin_unlock(&smt_rq->lock); | ||
| 2723 | } | 3154 | } |
| 2724 | |||
| 2725 | for_each_cpu_mask(i, sibling_map) | ||
| 2726 | spin_unlock(&cpu_rq(i)->lock); | ||
| 2727 | /* | ||
| 2728 | * We exit with this_cpu's rq still held and IRQs | ||
| 2729 | * still disabled: | ||
| 2730 | */ | ||
| 2731 | } | 3155 | } |
| 2732 | 3156 | ||
| 2733 | /* | 3157 | /* |
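The rewritten wake_sleeping_dependent() above no longer drops this_rq->lock and takes every sibling lock in CPU order; it simply trylocks each sibling runqueue and skips the ones it cannot get. A minimal pthreads model of that pattern, not kernel code — the "runqueue" layout and the wakeup are invented for illustration:

#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 4

/* Invented stand-in for a per-CPU runqueue. */
struct toy_rq {
	pthread_mutex_t lock;
	int idle;		/* 1 if the sibling is idling */
	int nr_running;		/* queued work on that sibling */
};

static struct toy_rq rqs[NR_SIBLINGS];

/* Model of wakeup_busy_runqueue(): kick an idling sibling that has work. */
static void wakeup_busy_runqueue(struct toy_rq *rq)
{
	if (rq->idle && rq->nr_running)
		printf("kick sibling %ld\n", (long)(rq - rqs));
}

/*
 * Model of the trylock loop: never block on a sibling's lock, just skip
 * it if the lock is contended.  Because we only trylock, no lock
 * ordering is needed and the caller's own lock can stay held.
 */
static void wake_sleeping_dependent(int this_cpu)
{
	for (int i = 0; i < NR_SIBLINGS; i++) {
		if (i == this_cpu)
			continue;
		if (pthread_mutex_trylock(&rqs[i].lock) != 0)
			continue;	/* contended: bypass this sibling */
		wakeup_busy_runqueue(&rqs[i]);
		pthread_mutex_unlock(&rqs[i].lock);
	}
}

int main(void)
{
	for (int i = 0; i < NR_SIBLINGS; i++) {
		pthread_mutex_init(&rqs[i].lock, NULL);
		rqs[i].idle = 1;
		rqs[i].nr_running = i;	/* siblings 1..3 have queued work */
	}
	wake_sleeping_dependent(0);
	return 0;
}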
| @@ -2735,57 +3159,53 @@ static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
| 2735 | * utilize, if another task runs on a sibling. This models the | 3159 | * utilize, if another task runs on a sibling. This models the |
| 2736 | * slowdown effect of other tasks running on siblings: | 3160 | * slowdown effect of other tasks running on siblings: |
| 2737 | */ | 3161 | */ |
| 2738 | static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | 3162 | static inline unsigned long |
| 3163 | smt_slice(struct task_struct *p, struct sched_domain *sd) | ||
| 2739 | { | 3164 | { |
| 2740 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 3165 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
| 2741 | } | 3166 | } |
| 2742 | 3167 | ||
| 2743 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3168 | /* |
| 3169 | * To minimise lock contention and avoid dropping this_rq's runqueue lock, we | ||
| 3170 | * only trylock the sibling runqueues and bypass any runqueue whose lock we | ||
| 3171 | * fail to acquire. Since we only trylock, the normal locking order does not | ||
| 3172 | * need to be obeyed. | ||
| 3173 | */ | ||
| 3174 | static int | ||
| 3175 | dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) | ||
| 2744 | { | 3176 | { |
| 2745 | struct sched_domain *tmp, *sd = NULL; | 3177 | struct sched_domain *tmp, *sd = NULL; |
| 2746 | cpumask_t sibling_map; | ||
| 2747 | prio_array_t *array; | ||
| 2748 | int ret = 0, i; | 3178 | int ret = 0, i; |
| 2749 | task_t *p; | ||
| 2750 | 3179 | ||
| 2751 | for_each_domain(this_cpu, tmp) | 3180 | /* kernel/rt threads do not participate in dependent sleeping */ |
| 2752 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3181 | if (!p->mm || rt_task(p)) |
| 3182 | return 0; | ||
| 3183 | |||
| 3184 | for_each_domain(this_cpu, tmp) { | ||
| 3185 | if (tmp->flags & SD_SHARE_CPUPOWER) { | ||
| 2753 | sd = tmp; | 3186 | sd = tmp; |
| 3187 | break; | ||
| 3188 | } | ||
| 3189 | } | ||
| 2754 | 3190 | ||
| 2755 | if (!sd) | 3191 | if (!sd) |
| 2756 | return 0; | 3192 | return 0; |
| 2757 | 3193 | ||
| 2758 | /* | 3194 | for_each_cpu_mask(i, sd->span) { |
| 2759 | * The same locking rules and details apply as for | 3195 | struct task_struct *smt_curr; |
| 2760 | * wake_sleeping_dependent(): | 3196 | struct rq *smt_rq; |
| 2761 | */ | ||
| 2762 | spin_unlock(&this_rq->lock); | ||
| 2763 | sibling_map = sd->span; | ||
| 2764 | for_each_cpu_mask(i, sibling_map) | ||
| 2765 | spin_lock(&cpu_rq(i)->lock); | ||
| 2766 | cpu_clear(this_cpu, sibling_map); | ||
| 2767 | 3197 | ||
| 2768 | /* | 3198 | if (i == this_cpu) |
| 2769 | * Establish next task to be run - it might have gone away because | 3199 | continue; |
| 2770 | * we released the runqueue lock above: | ||
| 2771 | */ | ||
| 2772 | if (!this_rq->nr_running) | ||
| 2773 | goto out_unlock; | ||
| 2774 | array = this_rq->active; | ||
| 2775 | if (!array->nr_active) | ||
| 2776 | array = this_rq->expired; | ||
| 2777 | BUG_ON(!array->nr_active); | ||
| 2778 | 3200 | ||
| 2779 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | 3201 | smt_rq = cpu_rq(i); |
| 2780 | task_t, run_list); | 3202 | if (unlikely(!spin_trylock(&smt_rq->lock))) |
| 3203 | continue; | ||
| 2781 | 3204 | ||
| 2782 | for_each_cpu_mask(i, sibling_map) { | 3205 | smt_curr = smt_rq->curr; |
| 2783 | runqueue_t *smt_rq = cpu_rq(i); | ||
| 2784 | task_t *smt_curr = smt_rq->curr; | ||
| 2785 | 3206 | ||
| 2786 | /* Kernel threads do not participate in dependent sleeping */ | 3207 | if (!smt_curr->mm) |
| 2787 | if (!p->mm || !smt_curr->mm || rt_task(p)) | 3208 | goto unlock; |
| 2788 | goto check_smt_task; | ||
| 2789 | 3209 | ||
| 2790 | /* | 3210 | /* |
| 2791 | * If a user task with lower static priority than the | 3211 | * If a user task with lower static priority than the |
| @@ -2803,49 +3223,23 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
| 2803 | if ((jiffies % DEF_TIMESLICE) > | 3223 | if ((jiffies % DEF_TIMESLICE) > |
| 2804 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 3224 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
| 2805 | ret = 1; | 3225 | ret = 1; |
| 2806 | } else | 3226 | } else { |
| 2807 | if (smt_curr->static_prio < p->static_prio && | 3227 | if (smt_curr->static_prio < p->static_prio && |
| 2808 | !TASK_PREEMPTS_CURR(p, smt_rq) && | 3228 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
| 2809 | smt_slice(smt_curr, sd) > task_timeslice(p)) | 3229 | smt_slice(smt_curr, sd) > task_timeslice(p)) |
| 2810 | ret = 1; | 3230 | ret = 1; |
| 2811 | |||
| 2812 | check_smt_task: | ||
| 2813 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
| 2814 | rt_task(smt_curr)) | ||
| 2815 | continue; | ||
| 2816 | if (!p->mm) { | ||
| 2817 | wakeup_busy_runqueue(smt_rq); | ||
| 2818 | continue; | ||
| 2819 | } | ||
| 2820 | |||
| 2821 | /* | ||
| 2822 | * Reschedule a lower priority task on the SMT sibling for | ||
| 2823 | * it to be put to sleep, or wake it up if it has been put to | ||
| 2824 | * sleep for priority reasons to see if it should run now. | ||
| 2825 | */ | ||
| 2826 | if (rt_task(p)) { | ||
| 2827 | if ((jiffies % DEF_TIMESLICE) > | ||
| 2828 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
| 2829 | resched_task(smt_curr); | ||
| 2830 | } else { | ||
| 2831 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
| 2832 | smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
| 2833 | resched_task(smt_curr); | ||
| 2834 | else | ||
| 2835 | wakeup_busy_runqueue(smt_rq); | ||
| 2836 | } | 3231 | } |
| 3232 | unlock: | ||
| 3233 | spin_unlock(&smt_rq->lock); | ||
| 2837 | } | 3234 | } |
| 2838 | out_unlock: | ||
| 2839 | for_each_cpu_mask(i, sibling_map) | ||
| 2840 | spin_unlock(&cpu_rq(i)->lock); | ||
| 2841 | return ret; | 3235 | return ret; |
| 2842 | } | 3236 | } |
| 2843 | #else | 3237 | #else |
| 2844 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3238 | static inline void wake_sleeping_dependent(int this_cpu) |
| 2845 | { | 3239 | { |
| 2846 | } | 3240 | } |
| 2847 | 3241 | static inline int | |
| 2848 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3242 | dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) |
| 2849 | { | 3243 | { |
| 2850 | return 0; | 3244 | return 0; |
| 2851 | } | 3245 | } |
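smt_slice() above scales a task's timeslice by the domain's per_cpu_gain to model how much a sibling effectively "costs" when both SMT threads are busy. A quick arithmetic check of the formula — the 100ms timeslice and the gain of 25 are just values assumed for the example, not something this patch sets:

#include <stdio.h>

/* smt_slice(): the share of a timeslice a sibling effectively consumes. */
static unsigned long smt_slice(unsigned long time_slice, int per_cpu_gain)
{
	return time_slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
	/* assume a 100ms timeslice and a per_cpu_gain of 25 */
	printf("%lu\n", smt_slice(100, 25));	/* prints 75 */
	return 0;
}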
| @@ -2858,12 +3252,13 @@ void fastcall add_preempt_count(int val) | |||
| 2858 | /* | 3252 | /* |
| 2859 | * Underflow? | 3253 | * Underflow? |
| 2860 | */ | 3254 | */ |
| 2861 | BUG_ON((preempt_count() < 0)); | 3255 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
| 3256 | return; | ||
| 2862 | preempt_count() += val; | 3257 | preempt_count() += val; |
| 2863 | /* | 3258 | /* |
| 2864 | * Spinlock count overflowing soon? | 3259 | * Spinlock count overflowing soon? |
| 2865 | */ | 3260 | */ |
| 2866 | BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); | 3261 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); |
| 2867 | } | 3262 | } |
| 2868 | EXPORT_SYMBOL(add_preempt_count); | 3263 | EXPORT_SYMBOL(add_preempt_count); |
| 2869 | 3264 | ||
| @@ -2872,11 +3267,15 @@ void fastcall sub_preempt_count(int val) | |||
| 2872 | /* | 3267 | /* |
| 2873 | * Underflow? | 3268 | * Underflow? |
| 2874 | */ | 3269 | */ |
| 2875 | BUG_ON(val > preempt_count()); | 3270 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) |
| 3271 | return; | ||
| 2876 | /* | 3272 | /* |
| 2877 | * Is the spinlock portion underflowing? | 3273 | * Is the spinlock portion underflowing? |
| 2878 | */ | 3274 | */ |
| 2879 | BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); | 3275 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
| 3276 | !(preempt_count() & PREEMPT_MASK))) | ||
| 3277 | return; | ||
| 3278 | |||
| 2880 | preempt_count() -= val; | 3279 | preempt_count() -= val; |
| 2881 | } | 3280 | } |
| 2882 | EXPORT_SYMBOL(sub_preempt_count); | 3281 | EXPORT_SYMBOL(sub_preempt_count); |
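The preempt-count sanity checks above switch from BUG_ON(), which kills the task outright, to DEBUG_LOCKS_WARN_ON(), which reports the problem and lets the caller bail out before the counter is corrupted further. A freestanding sketch of that "warn once, then refuse to proceed" control flow — the real macro lives in the lock-debugging code and does more (it also turns off further lock debugging); the names below are invented for the model:

#include <stdio.h>

static int debug_locks = 1;	/* assumption: one global "state is still sane" flag */

/* Warn once, remember that state is suspect, and report the condition. */
#define WARN_ON_ONCE_RET(cond)						\
	({								\
		int __c = !!(cond);					\
		if (__c && debug_locks) {				\
			debug_locks = 0;				\
			fprintf(stderr, "warning: %s at %s:%d\n",	\
				#cond, __FILE__, __LINE__);		\
		}							\
		__c;							\
	})

static int preempt_count;

static void add_preempt_count(int val)
{
	if (WARN_ON_ONCE_RET(preempt_count + val < 0))
		return;		/* refuse to corrupt the counter */
	preempt_count += val;
}

int main(void)
{
	add_preempt_count(1);
	add_preempt_count(-5);	/* triggers the warning, counter untouched */
	printf("preempt_count = %d\n", preempt_count);
	return 0;
}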
| @@ -2894,14 +3293,14 @@ static inline int interactive_sleep(enum sleep_type sleep_type) | |||
| 2894 | */ | 3293 | */ |
| 2895 | asmlinkage void __sched schedule(void) | 3294 | asmlinkage void __sched schedule(void) |
| 2896 | { | 3295 | { |
| 2897 | long *switch_count; | 3296 | struct task_struct *prev, *next; |
| 2898 | task_t *prev, *next; | 3297 | struct prio_array *array; |
| 2899 | runqueue_t *rq; | ||
| 2900 | prio_array_t *array; | ||
| 2901 | struct list_head *queue; | 3298 | struct list_head *queue; |
| 2902 | unsigned long long now; | 3299 | unsigned long long now; |
| 2903 | unsigned long run_time; | 3300 | unsigned long run_time; |
| 2904 | int cpu, idx, new_prio; | 3301 | int cpu, idx, new_prio; |
| 3302 | long *switch_count; | ||
| 3303 | struct rq *rq; | ||
| 2905 | 3304 | ||
| 2906 | /* | 3305 | /* |
| 2907 | * Test if we are atomic. Since do_exit() needs to call into | 3306 | * Test if we are atomic. Since do_exit() needs to call into |
| @@ -2949,9 +3348,6 @@ need_resched_nonpreemptible: | |||
| 2949 | 3348 | ||
| 2950 | spin_lock_irq(&rq->lock); | 3349 | spin_lock_irq(&rq->lock); |
| 2951 | 3350 | ||
| 2952 | if (unlikely(prev->flags & PF_DEAD)) | ||
| 2953 | prev->state = EXIT_DEAD; | ||
| 2954 | |||
| 2955 | switch_count = &prev->nivcsw; | 3351 | switch_count = &prev->nivcsw; |
| 2956 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3352 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
| 2957 | switch_count = &prev->nvcsw; | 3353 | switch_count = &prev->nvcsw; |
| @@ -2967,32 +3363,13 @@ need_resched_nonpreemptible: | |||
| 2967 | 3363 | ||
| 2968 | cpu = smp_processor_id(); | 3364 | cpu = smp_processor_id(); |
| 2969 | if (unlikely(!rq->nr_running)) { | 3365 | if (unlikely(!rq->nr_running)) { |
| 2970 | go_idle: | ||
| 2971 | idle_balance(cpu, rq); | 3366 | idle_balance(cpu, rq); |
| 2972 | if (!rq->nr_running) { | 3367 | if (!rq->nr_running) { |
| 2973 | next = rq->idle; | 3368 | next = rq->idle; |
| 2974 | rq->expired_timestamp = 0; | 3369 | rq->expired_timestamp = 0; |
| 2975 | wake_sleeping_dependent(cpu, rq); | 3370 | wake_sleeping_dependent(cpu); |
| 2976 | /* | ||
| 2977 | * wake_sleeping_dependent() might have released | ||
| 2978 | * the runqueue, so break out if we got new | ||
| 2979 | * tasks meanwhile: | ||
| 2980 | */ | ||
| 2981 | if (!rq->nr_running) | ||
| 2982 | goto switch_tasks; | ||
| 2983 | } | ||
| 2984 | } else { | ||
| 2985 | if (dependent_sleeper(cpu, rq)) { | ||
| 2986 | next = rq->idle; | ||
| 2987 | goto switch_tasks; | 3371 | goto switch_tasks; |
| 2988 | } | 3372 | } |
| 2989 | /* | ||
| 2990 | * dependent_sleeper() releases and reacquires the runqueue | ||
| 2991 | * lock, hence go into the idle loop if the rq went | ||
| 2992 | * empty meanwhile: | ||
| 2993 | */ | ||
| 2994 | if (unlikely(!rq->nr_running)) | ||
| 2995 | goto go_idle; | ||
| 2996 | } | 3373 | } |
| 2997 | 3374 | ||
| 2998 | array = rq->active; | 3375 | array = rq->active; |
| @@ -3010,7 +3387,7 @@ go_idle: | |||
| 3010 | 3387 | ||
| 3011 | idx = sched_find_first_bit(array->bitmap); | 3388 | idx = sched_find_first_bit(array->bitmap); |
| 3012 | queue = array->queue + idx; | 3389 | queue = array->queue + idx; |
| 3013 | next = list_entry(queue->next, task_t, run_list); | 3390 | next = list_entry(queue->next, struct task_struct, run_list); |
| 3014 | 3391 | ||
| 3015 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { | 3392 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { |
| 3016 | unsigned long long delta = now - next->timestamp; | 3393 | unsigned long long delta = now - next->timestamp; |
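The pick-next path above is unchanged in substance: sched_find_first_bit() returns the lowest set bit in the active array's priority bitmap, i.e. the highest-priority non-empty queue, and the head of that list runs next. A compilable model using a plain 64-bit bitmap and a count-trailing-zeros builtin in place of sched_find_first_bit():

#include <stdio.h>
#include <stdint.h>

#define NPRIO 64	/* model: 64 priority levels, 0 = highest */

/* One bit per priority level; bit set => that queue is non-empty. */
static uint64_t prio_bitmap;

static void mark_queued(int prio) { prio_bitmap |=  (UINT64_C(1) << prio); }
static void mark_empty(int prio)  { prio_bitmap &= ~(UINT64_C(1) << prio); }

/* Model of sched_find_first_bit(): index of the lowest set bit. */
static int find_first_prio(void)
{
	if (!prio_bitmap)
		return NPRIO;	/* nothing runnable */
	return __builtin_ctzll(prio_bitmap);
}

int main(void)
{
	mark_queued(42);
	mark_queued(7);
	mark_queued(19);
	printf("next prio: %d\n", find_first_prio());	/* 7 */
	mark_empty(7);
	printf("next prio: %d\n", find_first_prio());	/* 19 */
	return 0;
}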
| @@ -3030,6 +3407,8 @@ go_idle: | |||
| 3030 | } | 3407 | } |
| 3031 | } | 3408 | } |
| 3032 | next->sleep_type = SLEEP_NORMAL; | 3409 | next->sleep_type = SLEEP_NORMAL; |
| 3410 | if (dependent_sleeper(cpu, rq, next)) | ||
| 3411 | next = rq->idle; | ||
| 3033 | switch_tasks: | 3412 | switch_tasks: |
| 3034 | if (next == rq->idle) | 3413 | if (next == rq->idle) |
| 3035 | schedstat_inc(rq, sched_goidle); | 3414 | schedstat_inc(rq, sched_goidle); |
| @@ -3071,12 +3450,11 @@ switch_tasks: | |||
| 3071 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3450 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
| 3072 | goto need_resched; | 3451 | goto need_resched; |
| 3073 | } | 3452 | } |
| 3074 | |||
| 3075 | EXPORT_SYMBOL(schedule); | 3453 | EXPORT_SYMBOL(schedule); |
| 3076 | 3454 | ||
| 3077 | #ifdef CONFIG_PREEMPT | 3455 | #ifdef CONFIG_PREEMPT |
| 3078 | /* | 3456 | /* |
| 3079 | * this is is the entry point to schedule() from in-kernel preemption | 3457 | * this is the entry point to schedule() from in-kernel preemption |
| 3080 | * off of preempt_enable. Kernel preemptions off return from interrupt | 3458 | * off of preempt_enable. Kernel preemptions off return from interrupt |
| 3081 | * occur there and call schedule directly. | 3459 | * occur there and call schedule directly. |
| 3082 | */ | 3460 | */ |
| @@ -3116,11 +3494,10 @@ need_resched: | |||
| 3116 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3494 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
| 3117 | goto need_resched; | 3495 | goto need_resched; |
| 3118 | } | 3496 | } |
| 3119 | |||
| 3120 | EXPORT_SYMBOL(preempt_schedule); | 3497 | EXPORT_SYMBOL(preempt_schedule); |
| 3121 | 3498 | ||
| 3122 | /* | 3499 | /* |
| 3123 | * this is is the entry point to schedule() from kernel preemption | 3500 | * this is the entry point to schedule() from kernel preemption |
| 3124 | * off of irq context. | 3501 | * off of irq context. |
| 3125 | * Note that this is called and returns with irqs disabled. This will | 3502 | * Note that this is called and returns with irqs disabled. This will |
| 3126 | * protect us against recursive calling from irq. | 3503 | * protect us against recursive calling from irq. |
| @@ -3132,7 +3509,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
| 3132 | struct task_struct *task = current; | 3509 | struct task_struct *task = current; |
| 3133 | int saved_lock_depth; | 3510 | int saved_lock_depth; |
| 3134 | #endif | 3511 | #endif |
| 3135 | /* Catch callers which need to be fixed*/ | 3512 | /* Catch callers which need to be fixed */ |
| 3136 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3513 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
| 3137 | 3514 | ||
| 3138 | need_resched: | 3515 | need_resched: |
| @@ -3165,10 +3542,8 @@ need_resched: | |||
| 3165 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 3542 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, |
| 3166 | void *key) | 3543 | void *key) |
| 3167 | { | 3544 | { |
| 3168 | task_t *p = curr->private; | 3545 | return try_to_wake_up(curr->private, mode, sync); |
| 3169 | return try_to_wake_up(p, mode, sync); | ||
| 3170 | } | 3546 | } |
| 3171 | |||
| 3172 | EXPORT_SYMBOL(default_wake_function); | 3547 | EXPORT_SYMBOL(default_wake_function); |
| 3173 | 3548 | ||
| 3174 | /* | 3549 | /* |
| @@ -3186,13 +3561,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | |||
| 3186 | struct list_head *tmp, *next; | 3561 | struct list_head *tmp, *next; |
| 3187 | 3562 | ||
| 3188 | list_for_each_safe(tmp, next, &q->task_list) { | 3563 | list_for_each_safe(tmp, next, &q->task_list) { |
| 3189 | wait_queue_t *curr; | 3564 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); |
| 3190 | unsigned flags; | 3565 | unsigned flags = curr->flags; |
| 3191 | curr = list_entry(tmp, wait_queue_t, task_list); | 3566 | |
| 3192 | flags = curr->flags; | ||
| 3193 | if (curr->func(curr, mode, sync, key) && | 3567 | if (curr->func(curr, mode, sync, key) && |
| 3194 | (flags & WQ_FLAG_EXCLUSIVE) && | 3568 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
| 3195 | !--nr_exclusive) | ||
| 3196 | break; | 3569 | break; |
| 3197 | } | 3570 | } |
| 3198 | } | 3571 | } |
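__wake_up_common() is only tidied up here, but the loop is worth unpacking: every waiter's callback is run until nr_exclusive waiters flagged WQ_FLAG_EXCLUSIVE have been woken, which is how "wake at most N exclusive waiters" semantics work. A self-contained model over a plain array instead of a list_head:

#include <stdio.h>
#include <stdbool.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct waiter {
	const char *name;
	unsigned flags;
};

/* Model of the wake callback: report the wakeup, claim success. */
static bool wake_one(const struct waiter *w)
{
	printf("woke %s%s\n", w->name,
	       (w->flags & WQ_FLAG_EXCLUSIVE) ? " (exclusive)" : "");
	return true;
}

/*
 * Model of __wake_up_common(): non-exclusive waiters are always woken,
 * exclusive waiters only until nr_exclusive of them have been woken.
 */
static void wake_up_common(struct waiter *q, int n, int nr_exclusive)
{
	for (int i = 0; i < n; i++) {
		if (wake_one(&q[i]) &&
		    (q[i].flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}
}

int main(void)
{
	struct waiter q[] = {
		{ "poller",  0 },
		{ "reader1", WQ_FLAG_EXCLUSIVE },
		{ "reader2", WQ_FLAG_EXCLUSIVE },
		{ "reader3", WQ_FLAG_EXCLUSIVE },
	};
	/* wake every non-exclusive waiter but at most one exclusive one */
	wake_up_common(q, 4, 1);
	return 0;
}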
| @@ -3213,7 +3586,6 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | |||
| 3213 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 3586 | __wake_up_common(q, mode, nr_exclusive, 0, key); |
| 3214 | spin_unlock_irqrestore(&q->lock, flags); | 3587 | spin_unlock_irqrestore(&q->lock, flags); |
| 3215 | } | 3588 | } |
| 3216 | |||
| 3217 | EXPORT_SYMBOL(__wake_up); | 3589 | EXPORT_SYMBOL(__wake_up); |
| 3218 | 3590 | ||
| 3219 | /* | 3591 | /* |
| @@ -3282,6 +3654,7 @@ EXPORT_SYMBOL(complete_all); | |||
| 3282 | void fastcall __sched wait_for_completion(struct completion *x) | 3654 | void fastcall __sched wait_for_completion(struct completion *x) |
| 3283 | { | 3655 | { |
| 3284 | might_sleep(); | 3656 | might_sleep(); |
| 3657 | |||
| 3285 | spin_lock_irq(&x->wait.lock); | 3658 | spin_lock_irq(&x->wait.lock); |
| 3286 | if (!x->done) { | 3659 | if (!x->done) { |
| 3287 | DECLARE_WAITQUEUE(wait, current); | 3660 | DECLARE_WAITQUEUE(wait, current); |
| @@ -3426,7 +3799,6 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) | |||
| 3426 | schedule(); | 3799 | schedule(); |
| 3427 | SLEEP_ON_TAIL | 3800 | SLEEP_ON_TAIL |
| 3428 | } | 3801 | } |
| 3429 | |||
| 3430 | EXPORT_SYMBOL(interruptible_sleep_on); | 3802 | EXPORT_SYMBOL(interruptible_sleep_on); |
| 3431 | 3803 | ||
| 3432 | long fastcall __sched | 3804 | long fastcall __sched |
| @@ -3442,7 +3814,6 @@ interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
| 3442 | 3814 | ||
| 3443 | return timeout; | 3815 | return timeout; |
| 3444 | } | 3816 | } |
| 3445 | |||
| 3446 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3817 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
| 3447 | 3818 | ||
| 3448 | void fastcall __sched sleep_on(wait_queue_head_t *q) | 3819 | void fastcall __sched sleep_on(wait_queue_head_t *q) |
| @@ -3455,7 +3826,6 @@ void fastcall __sched sleep_on(wait_queue_head_t *q) | |||
| 3455 | schedule(); | 3826 | schedule(); |
| 3456 | SLEEP_ON_TAIL | 3827 | SLEEP_ON_TAIL |
| 3457 | } | 3828 | } |
| 3458 | |||
| 3459 | EXPORT_SYMBOL(sleep_on); | 3829 | EXPORT_SYMBOL(sleep_on); |
| 3460 | 3830 | ||
| 3461 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3831 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
| @@ -3473,12 +3843,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
| 3473 | 3843 | ||
| 3474 | EXPORT_SYMBOL(sleep_on_timeout); | 3844 | EXPORT_SYMBOL(sleep_on_timeout); |
| 3475 | 3845 | ||
| 3476 | void set_user_nice(task_t *p, long nice) | 3846 | #ifdef CONFIG_RT_MUTEXES |
| 3847 | |||
| 3848 | /* | ||
| 3849 | * rt_mutex_setprio - set the current priority of a task | ||
| 3850 | * @p: task | ||
| 3851 | * @prio: prio value (kernel-internal form) | ||
| 3852 | * | ||
| 3853 | * This function changes the 'effective' priority of a task. It does | ||
| 3854 | * not touch ->normal_prio like __setscheduler(). | ||
| 3855 | * | ||
| 3856 | * Used by the rt_mutex code to implement priority inheritance logic. | ||
| 3857 | */ | ||
| 3858 | void rt_mutex_setprio(struct task_struct *p, int prio) | ||
| 3859 | { | ||
| 3860 | struct prio_array *array; | ||
| 3861 | unsigned long flags; | ||
| 3862 | struct rq *rq; | ||
| 3863 | int oldprio; | ||
| 3864 | |||
| 3865 | BUG_ON(prio < 0 || prio > MAX_PRIO); | ||
| 3866 | |||
| 3867 | rq = task_rq_lock(p, &flags); | ||
| 3868 | |||
| 3869 | oldprio = p->prio; | ||
| 3870 | array = p->array; | ||
| 3871 | if (array) | ||
| 3872 | dequeue_task(p, array); | ||
| 3873 | p->prio = prio; | ||
| 3874 | |||
| 3875 | if (array) { | ||
| 3876 | /* | ||
| 3877 | * If changing to an RT priority then queue it | ||
| 3878 | * in the active array! | ||
| 3879 | */ | ||
| 3880 | if (rt_task(p)) | ||
| 3881 | array = rq->active; | ||
| 3882 | enqueue_task(p, array); | ||
| 3883 | /* | ||
| 3884 | * Reschedule if we are currently running on this runqueue and | ||
| 3885 | * our priority decreased, or if we are not currently running on | ||
| 3886 | * this runqueue and our priority is higher than the current's | ||
| 3887 | */ | ||
| 3888 | if (task_running(rq, p)) { | ||
| 3889 | if (p->prio > oldprio) | ||
| 3890 | resched_task(rq->curr); | ||
| 3891 | } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
| 3892 | resched_task(rq->curr); | ||
| 3893 | } | ||
| 3894 | task_rq_unlock(rq, &flags); | ||
| 3895 | } | ||
| 3896 | |||
| 3897 | #endif | ||
| 3898 | |||
| 3899 | void set_user_nice(struct task_struct *p, long nice) | ||
| 3477 | { | 3900 | { |
| 3901 | struct prio_array *array; | ||
| 3902 | int old_prio, delta; | ||
| 3478 | unsigned long flags; | 3903 | unsigned long flags; |
| 3479 | prio_array_t *array; | 3904 | struct rq *rq; |
| 3480 | runqueue_t *rq; | ||
| 3481 | int old_prio, new_prio, delta; | ||
| 3482 | 3905 | ||
| 3483 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3906 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
| 3484 | return; | 3907 | return; |
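rt_mutex_setprio(), added above, is the hook the rt-mutex code uses to boost a lock owner to the priority of its top waiter and to restore it afterwards; it changes only ->prio and never touches ->normal_prio. A small model of the boost/restore arithmetic, where a lower number means higher priority as in the kernel — the task layout below is invented for illustration, not the kernel's:

#include <stdio.h>

/* Invented mini task: normal_prio is the "real" prio, prio the effective one. */
struct toy_task {
	const char *name;
	int normal_prio;	/* from policy + nice, untouched by PI */
	int prio;		/* effective priority, may be boosted */
};

/* Model of the PI rule: run at the higher of own prio and the top waiter's. */
static void pi_boost(struct toy_task *owner, int top_waiter_prio)
{
	int newprio = top_waiter_prio < owner->normal_prio ?
			top_waiter_prio : owner->normal_prio;

	if (newprio != owner->prio) {
		printf("%s: prio %d -> %d\n", owner->name, owner->prio, newprio);
		owner->prio = newprio;	/* rt_mutex_setprio() equivalent */
	}
}

int main(void)
{
	struct toy_task owner = { "owner", 120, 120 };

	pi_boost(&owner, 50);			/* RT waiter appears: boost to 50 */
	pi_boost(&owner, owner.normal_prio);	/* waiter gone: back to 120 */
	return 0;
}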
| @@ -3493,22 +3916,25 @@ void set_user_nice(task_t *p, long nice) | |||
| 3493 | * it wont have any effect on scheduling until the task is | 3916 | * it wont have any effect on scheduling until the task is |
| 3494 | * not SCHED_NORMAL/SCHED_BATCH: | 3917 | * not SCHED_NORMAL/SCHED_BATCH: |
| 3495 | */ | 3918 | */ |
| 3496 | if (rt_task(p)) { | 3919 | if (has_rt_policy(p)) { |
| 3497 | p->static_prio = NICE_TO_PRIO(nice); | 3920 | p->static_prio = NICE_TO_PRIO(nice); |
| 3498 | goto out_unlock; | 3921 | goto out_unlock; |
| 3499 | } | 3922 | } |
| 3500 | array = p->array; | 3923 | array = p->array; |
| 3501 | if (array) | 3924 | if (array) { |
| 3502 | dequeue_task(p, array); | 3925 | dequeue_task(p, array); |
| 3926 | dec_raw_weighted_load(rq, p); | ||
| 3927 | } | ||
| 3503 | 3928 | ||
| 3504 | old_prio = p->prio; | ||
| 3505 | new_prio = NICE_TO_PRIO(nice); | ||
| 3506 | delta = new_prio - old_prio; | ||
| 3507 | p->static_prio = NICE_TO_PRIO(nice); | 3929 | p->static_prio = NICE_TO_PRIO(nice); |
| 3508 | p->prio += delta; | 3930 | set_load_weight(p); |
| 3931 | old_prio = p->prio; | ||
| 3932 | p->prio = effective_prio(p); | ||
| 3933 | delta = p->prio - old_prio; | ||
| 3509 | 3934 | ||
| 3510 | if (array) { | 3935 | if (array) { |
| 3511 | enqueue_task(p, array); | 3936 | enqueue_task(p, array); |
| 3937 | inc_raw_weighted_load(rq, p); | ||
| 3512 | /* | 3938 | /* |
| 3513 | * If the task increased its priority or is running and | 3939 | * If the task increased its priority or is running and |
| 3514 | * lowered its priority, then reschedule its CPU: | 3940 | * lowered its priority, then reschedule its CPU: |
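set_user_nice() now recomputes the load weight and goes through effective_prio() instead of applying a raw delta, so a PI-boosted task keeps its boosted ->prio while only its static priority moves. The static-priority side is plain arithmetic; a sketch using the conventional split of 100 RT levels followed by 40 nice levels (treat the constant as an assumption of this example):

#include <stdio.h>

#define MAX_RT_PRIO 100	/* assumed layout: 0..99 RT, 100..139 normal */

/* nice -20..19 maps onto static priority 100..139 */
static int nice_to_prio(int nice)
{
	return MAX_RT_PRIO + nice + 20;
}

static int prio_to_nice(int prio)
{
	return prio - MAX_RT_PRIO - 20;
}

int main(void)
{
	for (int nice = -20; nice <= 19; nice += 13)
		printf("nice %3d -> static_prio %d (and back: %d)\n",
		       nice, nice_to_prio(nice), prio_to_nice(nice_to_prio(nice)));
	return 0;
}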
| @@ -3519,7 +3945,6 @@ void set_user_nice(task_t *p, long nice) | |||
| 3519 | out_unlock: | 3945 | out_unlock: |
| 3520 | task_rq_unlock(rq, &flags); | 3946 | task_rq_unlock(rq, &flags); |
| 3521 | } | 3947 | } |
| 3522 | |||
| 3523 | EXPORT_SYMBOL(set_user_nice); | 3948 | EXPORT_SYMBOL(set_user_nice); |
| 3524 | 3949 | ||
| 3525 | /* | 3950 | /* |
| @@ -3527,10 +3952,11 @@ EXPORT_SYMBOL(set_user_nice); | |||
| 3527 | * @p: task | 3952 | * @p: task |
| 3528 | * @nice: nice value | 3953 | * @nice: nice value |
| 3529 | */ | 3954 | */ |
| 3530 | int can_nice(const task_t *p, const int nice) | 3955 | int can_nice(const struct task_struct *p, const int nice) |
| 3531 | { | 3956 | { |
| 3532 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3957 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
| 3533 | int nice_rlim = 20 - nice; | 3958 | int nice_rlim = 20 - nice; |
| 3959 | |||
| 3534 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 3960 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || |
| 3535 | capable(CAP_SYS_NICE)); | 3961 | capable(CAP_SYS_NICE)); |
| 3536 | } | 3962 | } |
| @@ -3546,8 +3972,7 @@ int can_nice(const task_t *p, const int nice) | |||
| 3546 | */ | 3972 | */ |
| 3547 | asmlinkage long sys_nice(int increment) | 3973 | asmlinkage long sys_nice(int increment) |
| 3548 | { | 3974 | { |
| 3549 | int retval; | 3975 | long nice, retval; |
| 3550 | long nice; | ||
| 3551 | 3976 | ||
| 3552 | /* | 3977 | /* |
| 3553 | * Setpriority might change our priority at the same moment. | 3978 | * Setpriority might change our priority at the same moment. |
| @@ -3586,7 +4011,7 @@ asmlinkage long sys_nice(int increment) | |||
| 3586 | * RT tasks are offset by -200. Normal tasks are centered | 4011 | * RT tasks are offset by -200. Normal tasks are centered |
| 3587 | * around 0, value goes from -16 to +15. | 4012 | * around 0, value goes from -16 to +15. |
| 3588 | */ | 4013 | */ |
| 3589 | int task_prio(const task_t *p) | 4014 | int task_prio(const struct task_struct *p) |
| 3590 | { | 4015 | { |
| 3591 | return p->prio - MAX_RT_PRIO; | 4016 | return p->prio - MAX_RT_PRIO; |
| 3592 | } | 4017 | } |
| @@ -3595,7 +4020,7 @@ int task_prio(const task_t *p) | |||
| 3595 | * task_nice - return the nice value of a given task. | 4020 | * task_nice - return the nice value of a given task. |
| 3596 | * @p: the task in question. | 4021 | * @p: the task in question. |
| 3597 | */ | 4022 | */ |
| 3598 | int task_nice(const task_t *p) | 4023 | int task_nice(const struct task_struct *p) |
| 3599 | { | 4024 | { |
| 3600 | return TASK_NICE(p); | 4025 | return TASK_NICE(p); |
| 3601 | } | 4026 | } |
| @@ -3614,7 +4039,7 @@ int idle_cpu(int cpu) | |||
| 3614 | * idle_task - return the idle task for a given cpu. | 4039 | * idle_task - return the idle task for a given cpu. |
| 3615 | * @cpu: the processor in question. | 4040 | * @cpu: the processor in question. |
| 3616 | */ | 4041 | */ |
| 3617 | task_t *idle_task(int cpu) | 4042 | struct task_struct *idle_task(int cpu) |
| 3618 | { | 4043 | { |
| 3619 | return cpu_rq(cpu)->idle; | 4044 | return cpu_rq(cpu)->idle; |
| 3620 | } | 4045 | } |
| @@ -3623,7 +4048,7 @@ task_t *idle_task(int cpu) | |||
| 3623 | * find_process_by_pid - find a process with a matching PID value. | 4048 | * find_process_by_pid - find a process with a matching PID value. |
| 3624 | * @pid: the pid in question. | 4049 | * @pid: the pid in question. |
| 3625 | */ | 4050 | */ |
| 3626 | static inline task_t *find_process_by_pid(pid_t pid) | 4051 | static inline struct task_struct *find_process_by_pid(pid_t pid) |
| 3627 | { | 4052 | { |
| 3628 | return pid ? find_task_by_pid(pid) : current; | 4053 | return pid ? find_task_by_pid(pid) : current; |
| 3629 | } | 4054 | } |
| @@ -3632,18 +4057,18 @@ static inline task_t *find_process_by_pid(pid_t pid) | |||
| 3632 | static void __setscheduler(struct task_struct *p, int policy, int prio) | 4057 | static void __setscheduler(struct task_struct *p, int policy, int prio) |
| 3633 | { | 4058 | { |
| 3634 | BUG_ON(p->array); | 4059 | BUG_ON(p->array); |
| 4060 | |||
| 3635 | p->policy = policy; | 4061 | p->policy = policy; |
| 3636 | p->rt_priority = prio; | 4062 | p->rt_priority = prio; |
| 3637 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { | 4063 | p->normal_prio = normal_prio(p); |
| 3638 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; | 4064 | /* we are holding p->pi_lock already */ |
| 3639 | } else { | 4065 | p->prio = rt_mutex_getprio(p); |
| 3640 | p->prio = p->static_prio; | 4066 | /* |
| 3641 | /* | 4067 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
| 3642 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | 4068 | */ |
| 3643 | */ | 4069 | if (policy == SCHED_BATCH) |
| 3644 | if (policy == SCHED_BATCH) | 4070 | p->sleep_avg = 0; |
| 3645 | p->sleep_avg = 0; | 4071 | set_load_weight(p); |
| 3646 | } | ||
| 3647 | } | 4072 | } |
| 3648 | 4073 | ||
| 3649 | /** | 4074 | /** |
| @@ -3652,16 +4077,19 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
| 3652 | * @p: the task in question. | 4077 | * @p: the task in question. |
| 3653 | * @policy: new policy. | 4078 | * @policy: new policy. |
| 3654 | * @param: structure containing the new RT priority. | 4079 | * @param: structure containing the new RT priority. |
| 4080 | * | ||
| 4081 | * NOTE: the task may already be dead | ||
| 3655 | */ | 4082 | */ |
| 3656 | int sched_setscheduler(struct task_struct *p, int policy, | 4083 | int sched_setscheduler(struct task_struct *p, int policy, |
| 3657 | struct sched_param *param) | 4084 | struct sched_param *param) |
| 3658 | { | 4085 | { |
| 3659 | int retval; | 4086 | int retval, oldprio, oldpolicy = -1; |
| 3660 | int oldprio, oldpolicy = -1; | 4087 | struct prio_array *array; |
| 3661 | prio_array_t *array; | ||
| 3662 | unsigned long flags; | 4088 | unsigned long flags; |
| 3663 | runqueue_t *rq; | 4089 | struct rq *rq; |
| 3664 | 4090 | ||
| 4091 | /* may grab non-irq protected spin_locks */ | ||
| 4092 | BUG_ON(in_interrupt()); | ||
| 3665 | recheck: | 4093 | recheck: |
| 3666 | /* double check policy once rq lock held */ | 4094 | /* double check policy once rq lock held */ |
| 3667 | if (policy < 0) | 4095 | if (policy < 0) |
| @@ -3678,28 +4106,32 @@ recheck: | |||
| 3678 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 4106 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
| 3679 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 4107 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
| 3680 | return -EINVAL; | 4108 | return -EINVAL; |
| 3681 | if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) | 4109 | if (is_rt_policy(policy) != (param->sched_priority != 0)) |
| 3682 | != (param->sched_priority == 0)) | ||
| 3683 | return -EINVAL; | 4110 | return -EINVAL; |
| 3684 | 4111 | ||
| 3685 | /* | 4112 | /* |
| 3686 | * Allow unprivileged RT tasks to decrease priority: | 4113 | * Allow unprivileged RT tasks to decrease priority: |
| 3687 | */ | 4114 | */ |
| 3688 | if (!capable(CAP_SYS_NICE)) { | 4115 | if (!capable(CAP_SYS_NICE)) { |
| 3689 | /* | 4116 | if (is_rt_policy(policy)) { |
| 3690 | * can't change policy, except between SCHED_NORMAL | 4117 | unsigned long rlim_rtprio; |
| 3691 | * and SCHED_BATCH: | 4118 | unsigned long flags; |
| 3692 | */ | 4119 | |
| 3693 | if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && | 4120 | if (!lock_task_sighand(p, &flags)) |
| 3694 | (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && | 4121 | return -ESRCH; |
| 3695 | !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | 4122 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; |
| 3696 | return -EPERM; | 4123 | unlock_task_sighand(p, &flags); |
| 3697 | /* can't increase priority */ | 4124 | |
| 3698 | if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && | 4125 | /* can't set/change the rt policy */ |
| 3699 | param->sched_priority > p->rt_priority && | 4126 | if (policy != p->policy && !rlim_rtprio) |
| 3700 | param->sched_priority > | 4127 | return -EPERM; |
| 3701 | p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | 4128 | |
| 3702 | return -EPERM; | 4129 | /* can't increase priority */ |
| 4130 | if (param->sched_priority > p->rt_priority && | ||
| 4131 | param->sched_priority > rlim_rtprio) | ||
| 4132 | return -EPERM; | ||
| 4133 | } | ||
| 4134 | |||
| 3703 | /* can't change other user's priorities */ | 4135 | /* can't change other user's priorities */ |
| 3704 | if ((current->euid != p->euid) && | 4136 | if ((current->euid != p->euid) && |
| 3705 | (current->euid != p->uid)) | 4137 | (current->euid != p->uid)) |
| @@ -3710,14 +4142,20 @@ recheck: | |||
| 3710 | if (retval) | 4142 | if (retval) |
| 3711 | return retval; | 4143 | return retval; |
| 3712 | /* | 4144 | /* |
| 4145 | * make sure no PI-waiters arrive (or leave) while we are | ||
| 4146 | * changing the priority of the task: | ||
| 4147 | */ | ||
| 4148 | spin_lock_irqsave(&p->pi_lock, flags); | ||
| 4149 | /* | ||
| 3713 | * To be able to change p->policy safely, the appropriate | 4150 | * To be able to change p->policy safely, the appropriate |
| 3714 | * runqueue lock must be held. | 4151 | * runqueue lock must be held. |
| 3715 | */ | 4152 | */ |
| 3716 | rq = task_rq_lock(p, &flags); | 4153 | rq = __task_rq_lock(p); |
| 3717 | /* recheck policy now with rq lock held */ | 4154 | /* recheck policy now with rq lock held */ |
| 3718 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4155 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
| 3719 | policy = oldpolicy = -1; | 4156 | policy = oldpolicy = -1; |
| 3720 | task_rq_unlock(rq, &flags); | 4157 | __task_rq_unlock(rq); |
| 4158 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 3721 | goto recheck; | 4159 | goto recheck; |
| 3722 | } | 4160 | } |
| 3723 | array = p->array; | 4161 | array = p->array; |
| @@ -3738,7 +4176,11 @@ recheck: | |||
| 3738 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4176 | } else if (TASK_PREEMPTS_CURR(p, rq)) |
| 3739 | resched_task(rq->curr); | 4177 | resched_task(rq->curr); |
| 3740 | } | 4178 | } |
| 3741 | task_rq_unlock(rq, &flags); | 4179 | __task_rq_unlock(rq); |
| 4180 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 4181 | |||
| 4182 | rt_mutex_adjust_pi(p); | ||
| 4183 | |||
| 3742 | return 0; | 4184 | return 0; |
| 3743 | } | 4185 | } |
| 3744 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 4186 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
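The reworked permission block in sched_setscheduler() lets an unprivileged caller change RT policy or priority only within its RLIMIT_RTPRIO, while still allowing it to lower an existing RT priority. A condensed model of just that decision, with the inputs flattened into plain parameters (the real code reads them under the sighand lock):

#include <stdio.h>
#include <stdbool.h>

/* Model of the CAP_SYS_NICE-less checks in sched_setscheduler(). */
static bool rt_change_allowed(bool same_policy, int cur_rt_prio,
			      int new_rt_prio, unsigned long rlim_rtprio)
{
	/* can't set/change the rt policy at all without an RTPRIO rlimit */
	if (!same_policy && rlim_rtprio == 0)
		return false;
	/* can't raise the priority beyond both the current prio and the rlimit */
	if (new_rt_prio > cur_rt_prio && new_rt_prio > (int)rlim_rtprio)
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", rt_change_allowed(false, 0, 10, 0));	/* 0: no rlimit */
	printf("%d\n", rt_change_allowed(true, 10, 5, 0));	/* 1: lowering */
	printf("%d\n", rt_change_allowed(true, 10, 50, 20));	/* 0: above rlimit */
	printf("%d\n", rt_change_allowed(true, 10, 20, 20));	/* 1: within rlimit */
	return 0;
}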
| @@ -3746,22 +4188,22 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
| 3746 | static int | 4188 | static int |
| 3747 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 4189 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
| 3748 | { | 4190 | { |
| 3749 | int retval; | ||
| 3750 | struct sched_param lparam; | 4191 | struct sched_param lparam; |
| 3751 | struct task_struct *p; | 4192 | struct task_struct *p; |
| 4193 | int retval; | ||
| 3752 | 4194 | ||
| 3753 | if (!param || pid < 0) | 4195 | if (!param || pid < 0) |
| 3754 | return -EINVAL; | 4196 | return -EINVAL; |
| 3755 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) | 4197 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) |
| 3756 | return -EFAULT; | 4198 | return -EFAULT; |
| 3757 | read_lock_irq(&tasklist_lock); | 4199 | |
| 4200 | rcu_read_lock(); | ||
| 4201 | retval = -ESRCH; | ||
| 3758 | p = find_process_by_pid(pid); | 4202 | p = find_process_by_pid(pid); |
| 3759 | if (!p) { | 4203 | if (p != NULL) |
| 3760 | read_unlock_irq(&tasklist_lock); | 4204 | retval = sched_setscheduler(p, policy, &lparam); |
| 3761 | return -ESRCH; | 4205 | rcu_read_unlock(); |
| 3762 | } | 4206 | |
| 3763 | retval = sched_setscheduler(p, policy, &lparam); | ||
| 3764 | read_unlock_irq(&tasklist_lock); | ||
| 3765 | return retval; | 4207 | return retval; |
| 3766 | } | 4208 | } |
| 3767 | 4209 | ||
| @@ -3797,8 +4239,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | |||
| 3797 | */ | 4239 | */ |
| 3798 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4240 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
| 3799 | { | 4241 | { |
| 4242 | struct task_struct *p; | ||
| 3800 | int retval = -EINVAL; | 4243 | int retval = -EINVAL; |
| 3801 | task_t *p; | ||
| 3802 | 4244 | ||
| 3803 | if (pid < 0) | 4245 | if (pid < 0) |
| 3804 | goto out_nounlock; | 4246 | goto out_nounlock; |
| @@ -3825,8 +4267,8 @@ out_nounlock: | |||
| 3825 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | 4267 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) |
| 3826 | { | 4268 | { |
| 3827 | struct sched_param lp; | 4269 | struct sched_param lp; |
| 4270 | struct task_struct *p; | ||
| 3828 | int retval = -EINVAL; | 4271 | int retval = -EINVAL; |
| 3829 | task_t *p; | ||
| 3830 | 4272 | ||
| 3831 | if (!param || pid < 0) | 4273 | if (!param || pid < 0) |
| 3832 | goto out_nounlock; | 4274 | goto out_nounlock; |
| @@ -3859,9 +4301,9 @@ out_unlock: | |||
| 3859 | 4301 | ||
| 3860 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 4302 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) |
| 3861 | { | 4303 | { |
| 3862 | task_t *p; | ||
| 3863 | int retval; | ||
| 3864 | cpumask_t cpus_allowed; | 4304 | cpumask_t cpus_allowed; |
| 4305 | struct task_struct *p; | ||
| 4306 | int retval; | ||
| 3865 | 4307 | ||
| 3866 | lock_cpu_hotplug(); | 4308 | lock_cpu_hotplug(); |
| 3867 | read_lock(&tasklist_lock); | 4309 | read_lock(&tasklist_lock); |
| @@ -3947,8 +4389,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | |||
| 3947 | 4389 | ||
| 3948 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 4390 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
| 3949 | { | 4391 | { |
| 4392 | struct task_struct *p; | ||
| 3950 | int retval; | 4393 | int retval; |
| 3951 | task_t *p; | ||
| 3952 | 4394 | ||
| 3953 | lock_cpu_hotplug(); | 4395 | lock_cpu_hotplug(); |
| 3954 | read_lock(&tasklist_lock); | 4396 | read_lock(&tasklist_lock); |
| @@ -4007,9 +4449,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
| 4007 | */ | 4449 | */ |
| 4008 | asmlinkage long sys_sched_yield(void) | 4450 | asmlinkage long sys_sched_yield(void) |
| 4009 | { | 4451 | { |
| 4010 | runqueue_t *rq = this_rq_lock(); | 4452 | struct rq *rq = this_rq_lock(); |
| 4011 | prio_array_t *array = current->array; | 4453 | struct prio_array *array = current->array, *target = rq->expired; |
| 4012 | prio_array_t *target = rq->expired; | ||
| 4013 | 4454 | ||
| 4014 | schedstat_inc(rq, yld_cnt); | 4455 | schedstat_inc(rq, yld_cnt); |
| 4015 | /* | 4456 | /* |
| @@ -4043,6 +4484,7 @@ asmlinkage long sys_sched_yield(void) | |||
| 4043 | * no need to preempt or enable interrupts: | 4484 | * no need to preempt or enable interrupts: |
| 4044 | */ | 4485 | */ |
| 4045 | __release(rq->lock); | 4486 | __release(rq->lock); |
| 4487 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | ||
| 4046 | _raw_spin_unlock(&rq->lock); | 4488 | _raw_spin_unlock(&rq->lock); |
| 4047 | preempt_enable_no_resched(); | 4489 | preempt_enable_no_resched(); |
| 4048 | 4490 | ||
| @@ -4051,7 +4493,16 @@ asmlinkage long sys_sched_yield(void) | |||
| 4051 | return 0; | 4493 | return 0; |
| 4052 | } | 4494 | } |
| 4053 | 4495 | ||
| 4054 | static inline void __cond_resched(void) | 4496 | static inline int __resched_legal(int expected_preempt_count) |
| 4497 | { | ||
| 4498 | if (unlikely(preempt_count() != expected_preempt_count)) | ||
| 4499 | return 0; | ||
| 4500 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
| 4501 | return 0; | ||
| 4502 | return 1; | ||
| 4503 | } | ||
| 4504 | |||
| 4505 | static void __cond_resched(void) | ||
| 4055 | { | 4506 | { |
| 4056 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 4507 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| 4057 | __might_sleep(__FILE__, __LINE__); | 4508 | __might_sleep(__FILE__, __LINE__); |
| @@ -4061,10 +4512,6 @@ static inline void __cond_resched(void) | |||
| 4061 | * PREEMPT_ACTIVE, which could trigger a second | 4512 | * PREEMPT_ACTIVE, which could trigger a second |
| 4062 | * cond_resched() call. | 4513 | * cond_resched() call. |
| 4063 | */ | 4514 | */ |
| 4064 | if (unlikely(preempt_count())) | ||
| 4065 | return; | ||
| 4066 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
| 4067 | return; | ||
| 4068 | do { | 4515 | do { |
| 4069 | add_preempt_count(PREEMPT_ACTIVE); | 4516 | add_preempt_count(PREEMPT_ACTIVE); |
| 4070 | schedule(); | 4517 | schedule(); |
| @@ -4074,13 +4521,12 @@ static inline void __cond_resched(void) | |||
| 4074 | 4521 | ||
| 4075 | int __sched cond_resched(void) | 4522 | int __sched cond_resched(void) |
| 4076 | { | 4523 | { |
| 4077 | if (need_resched()) { | 4524 | if (need_resched() && __resched_legal(0)) { |
| 4078 | __cond_resched(); | 4525 | __cond_resched(); |
| 4079 | return 1; | 4526 | return 1; |
| 4080 | } | 4527 | } |
| 4081 | return 0; | 4528 | return 0; |
| 4082 | } | 4529 | } |
| 4083 | |||
| 4084 | EXPORT_SYMBOL(cond_resched); | 4530 | EXPORT_SYMBOL(cond_resched); |
| 4085 | 4531 | ||
| 4086 | /* | 4532 | /* |
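cond_resched() and its variants now gate the voluntary reschedule on __resched_legal(): the preempt count must match what the caller expects (0 for cond_resched(), 1 for cond_resched_lock(), which still holds one lock) and the system must be fully booted. A compact, compilable model of that guard:

#include <stdio.h>
#include <stdbool.h>

enum system_state { SYSTEM_BOOTING, SYSTEM_RUNNING };	/* model only */

static int preempt_count;
static enum system_state system_state = SYSTEM_BOOTING;

/* Model of __resched_legal(): only reschedule from a sane context. */
static bool resched_legal(int expected_preempt_count)
{
	if (preempt_count != expected_preempt_count)
		return false;	/* e.g. still inside a spinlocked region */
	if (system_state != SYSTEM_RUNNING)
		return false;	/* too early during boot */
	return true;
}

int main(void)
{
	printf("%d\n", resched_legal(0));	/* 0: still booting */
	system_state = SYSTEM_RUNNING;
	printf("%d\n", resched_legal(0));	/* 1: ok to cond_resched() */
	preempt_count = 1;			/* as if one spinlock were held */
	printf("%d\n", resched_legal(0));	/* 0: cond_resched() refuses */
	printf("%d\n", resched_legal(1));	/* 1: cond_resched_lock() path */
	return 0;
}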
| @@ -4101,7 +4547,8 @@ int cond_resched_lock(spinlock_t *lock) | |||
| 4101 | ret = 1; | 4547 | ret = 1; |
| 4102 | spin_lock(lock); | 4548 | spin_lock(lock); |
| 4103 | } | 4549 | } |
| 4104 | if (need_resched()) { | 4550 | if (need_resched() && __resched_legal(1)) { |
| 4551 | spin_release(&lock->dep_map, 1, _THIS_IP_); | ||
| 4105 | _raw_spin_unlock(lock); | 4552 | _raw_spin_unlock(lock); |
| 4106 | preempt_enable_no_resched(); | 4553 | preempt_enable_no_resched(); |
| 4107 | __cond_resched(); | 4554 | __cond_resched(); |
| @@ -4110,25 +4557,24 @@ int cond_resched_lock(spinlock_t *lock) | |||
| 4110 | } | 4557 | } |
| 4111 | return ret; | 4558 | return ret; |
| 4112 | } | 4559 | } |
| 4113 | |||
| 4114 | EXPORT_SYMBOL(cond_resched_lock); | 4560 | EXPORT_SYMBOL(cond_resched_lock); |
| 4115 | 4561 | ||
| 4116 | int __sched cond_resched_softirq(void) | 4562 | int __sched cond_resched_softirq(void) |
| 4117 | { | 4563 | { |
| 4118 | BUG_ON(!in_softirq()); | 4564 | BUG_ON(!in_softirq()); |
| 4119 | 4565 | ||
| 4120 | if (need_resched()) { | 4566 | if (need_resched() && __resched_legal(0)) { |
| 4121 | __local_bh_enable(); | 4567 | raw_local_irq_disable(); |
| 4568 | _local_bh_enable(); | ||
| 4569 | raw_local_irq_enable(); | ||
| 4122 | __cond_resched(); | 4570 | __cond_resched(); |
| 4123 | local_bh_disable(); | 4571 | local_bh_disable(); |
| 4124 | return 1; | 4572 | return 1; |
| 4125 | } | 4573 | } |
| 4126 | return 0; | 4574 | return 0; |
| 4127 | } | 4575 | } |
| 4128 | |||
| 4129 | EXPORT_SYMBOL(cond_resched_softirq); | 4576 | EXPORT_SYMBOL(cond_resched_softirq); |
| 4130 | 4577 | ||
| 4131 | |||
| 4132 | /** | 4578 | /** |
| 4133 | * yield - yield the current processor to other threads. | 4579 | * yield - yield the current processor to other threads. |
| 4134 | * | 4580 | * |
| @@ -4140,7 +4586,6 @@ void __sched yield(void) | |||
| 4140 | set_current_state(TASK_RUNNING); | 4586 | set_current_state(TASK_RUNNING); |
| 4141 | sys_sched_yield(); | 4587 | sys_sched_yield(); |
| 4142 | } | 4588 | } |
| 4143 | |||
| 4144 | EXPORT_SYMBOL(yield); | 4589 | EXPORT_SYMBOL(yield); |
| 4145 | 4590 | ||
| 4146 | /* | 4591 | /* |
| @@ -4152,23 +4597,26 @@ EXPORT_SYMBOL(yield); | |||
| 4152 | */ | 4597 | */ |
| 4153 | void __sched io_schedule(void) | 4598 | void __sched io_schedule(void) |
| 4154 | { | 4599 | { |
| 4155 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4600 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
| 4156 | 4601 | ||
| 4602 | delayacct_blkio_start(); | ||
| 4157 | atomic_inc(&rq->nr_iowait); | 4603 | atomic_inc(&rq->nr_iowait); |
| 4158 | schedule(); | 4604 | schedule(); |
| 4159 | atomic_dec(&rq->nr_iowait); | 4605 | atomic_dec(&rq->nr_iowait); |
| 4606 | delayacct_blkio_end(); | ||
| 4160 | } | 4607 | } |
| 4161 | |||
| 4162 | EXPORT_SYMBOL(io_schedule); | 4608 | EXPORT_SYMBOL(io_schedule); |
| 4163 | 4609 | ||
| 4164 | long __sched io_schedule_timeout(long timeout) | 4610 | long __sched io_schedule_timeout(long timeout) |
| 4165 | { | 4611 | { |
| 4166 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4612 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
| 4167 | long ret; | 4613 | long ret; |
| 4168 | 4614 | ||
| 4615 | delayacct_blkio_start(); | ||
| 4169 | atomic_inc(&rq->nr_iowait); | 4616 | atomic_inc(&rq->nr_iowait); |
| 4170 | ret = schedule_timeout(timeout); | 4617 | ret = schedule_timeout(timeout); |
| 4171 | atomic_dec(&rq->nr_iowait); | 4618 | atomic_dec(&rq->nr_iowait); |
| 4619 | delayacct_blkio_end(); | ||
| 4172 | return ret; | 4620 | return ret; |
| 4173 | } | 4621 | } |
| 4174 | 4622 | ||
| @@ -4230,9 +4678,9 @@ asmlinkage long sys_sched_get_priority_min(int policy) | |||
| 4230 | asmlinkage | 4678 | asmlinkage |
| 4231 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4679 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
| 4232 | { | 4680 | { |
| 4681 | struct task_struct *p; | ||
| 4233 | int retval = -EINVAL; | 4682 | int retval = -EINVAL; |
| 4234 | struct timespec t; | 4683 | struct timespec t; |
| 4235 | task_t *p; | ||
| 4236 | 4684 | ||
| 4237 | if (pid < 0) | 4685 | if (pid < 0) |
| 4238 | goto out_nounlock; | 4686 | goto out_nounlock; |
| @@ -4247,7 +4695,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
| 4247 | if (retval) | 4695 | if (retval) |
| 4248 | goto out_unlock; | 4696 | goto out_unlock; |
| 4249 | 4697 | ||
| 4250 | jiffies_to_timespec(p->policy & SCHED_FIFO ? | 4698 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
| 4251 | 0 : task_timeslice(p), &t); | 4699 | 0 : task_timeslice(p), &t); |
| 4252 | read_unlock(&tasklist_lock); | 4700 | read_unlock(&tasklist_lock); |
| 4253 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4701 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
| @@ -4260,35 +4708,36 @@ out_unlock: | |||
| 4260 | 4708 | ||
| 4261 | static inline struct task_struct *eldest_child(struct task_struct *p) | 4709 | static inline struct task_struct *eldest_child(struct task_struct *p) |
| 4262 | { | 4710 | { |
| 4263 | if (list_empty(&p->children)) return NULL; | 4711 | if (list_empty(&p->children)) |
| 4712 | return NULL; | ||
| 4264 | return list_entry(p->children.next,struct task_struct,sibling); | 4713 | return list_entry(p->children.next,struct task_struct,sibling); |
| 4265 | } | 4714 | } |
| 4266 | 4715 | ||
| 4267 | static inline struct task_struct *older_sibling(struct task_struct *p) | 4716 | static inline struct task_struct *older_sibling(struct task_struct *p) |
| 4268 | { | 4717 | { |
| 4269 | if (p->sibling.prev==&p->parent->children) return NULL; | 4718 | if (p->sibling.prev==&p->parent->children) |
| 4719 | return NULL; | ||
| 4270 | return list_entry(p->sibling.prev,struct task_struct,sibling); | 4720 | return list_entry(p->sibling.prev,struct task_struct,sibling); |
| 4271 | } | 4721 | } |
| 4272 | 4722 | ||
| 4273 | static inline struct task_struct *younger_sibling(struct task_struct *p) | 4723 | static inline struct task_struct *younger_sibling(struct task_struct *p) |
| 4274 | { | 4724 | { |
| 4275 | if (p->sibling.next==&p->parent->children) return NULL; | 4725 | if (p->sibling.next==&p->parent->children) |
| 4726 | return NULL; | ||
| 4276 | return list_entry(p->sibling.next,struct task_struct,sibling); | 4727 | return list_entry(p->sibling.next,struct task_struct,sibling); |
| 4277 | } | 4728 | } |
| 4278 | 4729 | ||
| 4279 | static void show_task(task_t *p) | 4730 | static const char stat_nam[] = "RSDTtZX"; |
| 4731 | |||
| 4732 | static void show_task(struct task_struct *p) | ||
| 4280 | { | 4733 | { |
| 4281 | task_t *relative; | 4734 | struct task_struct *relative; |
| 4282 | unsigned state; | ||
| 4283 | unsigned long free = 0; | 4735 | unsigned long free = 0; |
| 4284 | static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; | 4736 | unsigned state; |
| 4285 | 4737 | ||
| 4286 | printk("%-13.13s ", p->comm); | ||
| 4287 | state = p->state ? __ffs(p->state) + 1 : 0; | 4738 | state = p->state ? __ffs(p->state) + 1 : 0; |
| 4288 | if (state < ARRAY_SIZE(stat_nam)) | 4739 | printk("%-13.13s %c", p->comm, |
| 4289 | printk(stat_nam[state]); | 4740 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
| 4290 | else | ||
| 4291 | printk("?"); | ||
| 4292 | #if (BITS_PER_LONG == 32) | 4741 | #if (BITS_PER_LONG == 32) |
| 4293 | if (state == TASK_RUNNING) | 4742 | if (state == TASK_RUNNING) |
| 4294 | printk(" running "); | 4743 | printk(" running "); |
| @@ -4332,7 +4781,7 @@ static void show_task(task_t *p) | |||
| 4332 | 4781 | ||
| 4333 | void show_state(void) | 4782 | void show_state(void) |
| 4334 | { | 4783 | { |
| 4335 | task_t *g, *p; | 4784 | struct task_struct *g, *p; |
| 4336 | 4785 | ||
| 4337 | #if (BITS_PER_LONG == 32) | 4786 | #if (BITS_PER_LONG == 32) |
| 4338 | printk("\n" | 4787 | printk("\n" |
| @@ -4354,7 +4803,7 @@ void show_state(void) | |||
| 4354 | } while_each_thread(g, p); | 4803 | } while_each_thread(g, p); |
| 4355 | 4804 | ||
| 4356 | read_unlock(&tasklist_lock); | 4805 | read_unlock(&tasklist_lock); |
| 4357 | mutex_debug_show_all_locks(); | 4806 | debug_show_all_locks(); |
| 4358 | } | 4807 | } |
| 4359 | 4808 | ||
| 4360 | /** | 4809 | /** |
| @@ -4365,15 +4814,15 @@ void show_state(void) | |||
| 4365 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 4814 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
| 4366 | * flag, to make booting more robust. | 4815 | * flag, to make booting more robust. |
| 4367 | */ | 4816 | */ |
| 4368 | void __devinit init_idle(task_t *idle, int cpu) | 4817 | void __devinit init_idle(struct task_struct *idle, int cpu) |
| 4369 | { | 4818 | { |
| 4370 | runqueue_t *rq = cpu_rq(cpu); | 4819 | struct rq *rq = cpu_rq(cpu); |
| 4371 | unsigned long flags; | 4820 | unsigned long flags; |
| 4372 | 4821 | ||
| 4373 | idle->timestamp = sched_clock(); | 4822 | idle->timestamp = sched_clock(); |
| 4374 | idle->sleep_avg = 0; | 4823 | idle->sleep_avg = 0; |
| 4375 | idle->array = NULL; | 4824 | idle->array = NULL; |
| 4376 | idle->prio = MAX_PRIO; | 4825 | idle->prio = idle->normal_prio = MAX_PRIO; |
| 4377 | idle->state = TASK_RUNNING; | 4826 | idle->state = TASK_RUNNING; |
| 4378 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4827 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
| 4379 | set_task_cpu(idle, cpu); | 4828 | set_task_cpu(idle, cpu); |
| @@ -4406,7 +4855,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
| 4406 | /* | 4855 | /* |
| 4407 | * This is how migration works: | 4856 | * This is how migration works: |
| 4408 | * | 4857 | * |
| 4409 | * 1) we queue a migration_req_t structure in the source CPU's | 4858 | * 1) we queue a struct migration_req structure in the source CPU's |
| 4410 | * runqueue and wake up that CPU's migration thread. | 4859 | * runqueue and wake up that CPU's migration thread. |
| 4411 | * 2) we down() the locked semaphore => thread blocks. | 4860 | * 2) we down() the locked semaphore => thread blocks. |
| 4412 | * 3) migration thread wakes up (implicitly it forces the migrated | 4861 | * 3) migration thread wakes up (implicitly it forces the migrated |
| @@ -4428,12 +4877,12 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
| 4428 | * task must not exit() & deallocate itself prematurely. The | 4877 | * task must not exit() & deallocate itself prematurely. The |
| 4429 | * call is not atomic; no spinlocks may be held. | 4878 | * call is not atomic; no spinlocks may be held. |
| 4430 | */ | 4879 | */ |
| 4431 | int set_cpus_allowed(task_t *p, cpumask_t new_mask) | 4880 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
| 4432 | { | 4881 | { |
| 4882 | struct migration_req req; | ||
| 4433 | unsigned long flags; | 4883 | unsigned long flags; |
| 4884 | struct rq *rq; | ||
| 4434 | int ret = 0; | 4885 | int ret = 0; |
| 4435 | migration_req_t req; | ||
| 4436 | runqueue_t *rq; | ||
| 4437 | 4886 | ||
| 4438 | rq = task_rq_lock(p, &flags); | 4887 | rq = task_rq_lock(p, &flags); |
| 4439 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 4888 | if (!cpus_intersects(new_mask, cpu_online_map)) { |
| @@ -4456,9 +4905,9 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask) | |||
| 4456 | } | 4905 | } |
| 4457 | out: | 4906 | out: |
| 4458 | task_rq_unlock(rq, &flags); | 4907 | task_rq_unlock(rq, &flags); |
| 4908 | |||
| 4459 | return ret; | 4909 | return ret; |
| 4460 | } | 4910 | } |
| 4461 | |||
| 4462 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 4911 | EXPORT_SYMBOL_GPL(set_cpus_allowed); |
| 4463 | 4912 | ||
| 4464 | /* | 4913 | /* |
| @@ -4469,13 +4918,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); | |||
| 4469 | * | 4918 | * |
| 4470 | * So we race with normal scheduler movements, but that's OK, as long | 4919 | * So we race with normal scheduler movements, but that's OK, as long |
| 4471 | * as the task is no longer on this CPU. | 4920 | * as the task is no longer on this CPU. |
| 4921 | * | ||
| 4922 | * Returns non-zero if task was successfully migrated. | ||
| 4472 | */ | 4923 | */ |
| 4473 | static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4924 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
| 4474 | { | 4925 | { |
| 4475 | runqueue_t *rq_dest, *rq_src; | 4926 | struct rq *rq_dest, *rq_src; |
| 4927 | int ret = 0; | ||
| 4476 | 4928 | ||
| 4477 | if (unlikely(cpu_is_offline(dest_cpu))) | 4929 | if (unlikely(cpu_is_offline(dest_cpu))) |
| 4478 | return; | 4930 | return ret; |
| 4479 | 4931 | ||
| 4480 | rq_src = cpu_rq(src_cpu); | 4932 | rq_src = cpu_rq(src_cpu); |
| 4481 | rq_dest = cpu_rq(dest_cpu); | 4933 | rq_dest = cpu_rq(dest_cpu); |
| @@ -4499,13 +4951,14 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 4499 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick | 4951 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick |
| 4500 | + rq_dest->timestamp_last_tick; | 4952 | + rq_dest->timestamp_last_tick; |
| 4501 | deactivate_task(p, rq_src); | 4953 | deactivate_task(p, rq_src); |
| 4502 | activate_task(p, rq_dest, 0); | 4954 | __activate_task(p, rq_dest); |
| 4503 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 4955 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
| 4504 | resched_task(rq_dest->curr); | 4956 | resched_task(rq_dest->curr); |
| 4505 | } | 4957 | } |
| 4506 | 4958 | ret = 1; | |
| 4507 | out: | 4959 | out: |
| 4508 | double_rq_unlock(rq_src, rq_dest); | 4960 | double_rq_unlock(rq_src, rq_dest); |
| 4961 | return ret; | ||
| 4509 | } | 4962 | } |
| 4510 | 4963 | ||
| 4511 | /* | 4964 | /* |
| @@ -4515,16 +4968,16 @@ out: | |||
| 4515 | */ | 4968 | */ |
| 4516 | static int migration_thread(void *data) | 4969 | static int migration_thread(void *data) |
| 4517 | { | 4970 | { |
| 4518 | runqueue_t *rq; | ||
| 4519 | int cpu = (long)data; | 4971 | int cpu = (long)data; |
| 4972 | struct rq *rq; | ||
| 4520 | 4973 | ||
| 4521 | rq = cpu_rq(cpu); | 4974 | rq = cpu_rq(cpu); |
| 4522 | BUG_ON(rq->migration_thread != current); | 4975 | BUG_ON(rq->migration_thread != current); |
| 4523 | 4976 | ||
| 4524 | set_current_state(TASK_INTERRUPTIBLE); | 4977 | set_current_state(TASK_INTERRUPTIBLE); |
| 4525 | while (!kthread_should_stop()) { | 4978 | while (!kthread_should_stop()) { |
| 4979 | struct migration_req *req; | ||
| 4526 | struct list_head *head; | 4980 | struct list_head *head; |
| 4527 | migration_req_t *req; | ||
| 4528 | 4981 | ||
| 4529 | try_to_freeze(); | 4982 | try_to_freeze(); |
| 4530 | 4983 | ||
| @@ -4548,7 +5001,7 @@ static int migration_thread(void *data) | |||
| 4548 | set_current_state(TASK_INTERRUPTIBLE); | 5001 | set_current_state(TASK_INTERRUPTIBLE); |
| 4549 | continue; | 5002 | continue; |
| 4550 | } | 5003 | } |
| 4551 | req = list_entry(head->next, migration_req_t, list); | 5004 | req = list_entry(head->next, struct migration_req, list); |
| 4552 | list_del_init(head->next); | 5005 | list_del_init(head->next); |
| 4553 | 5006 | ||
| 4554 | spin_unlock(&rq->lock); | 5007 | spin_unlock(&rq->lock); |
| @@ -4573,36 +5026,42 @@ wait_to_die: | |||
| 4573 | 5026 | ||
| 4574 | #ifdef CONFIG_HOTPLUG_CPU | 5027 | #ifdef CONFIG_HOTPLUG_CPU |
| 4575 | /* Figure out where task on dead CPU should go, use force if neccessary. */ | 5028 | /* Figure out where task on dead CPU should go, use force if neccessary. */ |
| 4576 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | 5029 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
| 4577 | { | 5030 | { |
| 4578 | int dest_cpu; | 5031 | unsigned long flags; |
| 4579 | cpumask_t mask; | 5032 | cpumask_t mask; |
| 5033 | struct rq *rq; | ||
| 5034 | int dest_cpu; | ||
| 4580 | 5035 | ||
| 5036 | restart: | ||
| 4581 | /* On same node? */ | 5037 | /* On same node? */ |
| 4582 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 5038 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
| 4583 | cpus_and(mask, mask, tsk->cpus_allowed); | 5039 | cpus_and(mask, mask, p->cpus_allowed); |
| 4584 | dest_cpu = any_online_cpu(mask); | 5040 | dest_cpu = any_online_cpu(mask); |
| 4585 | 5041 | ||
| 4586 | /* On any allowed CPU? */ | 5042 | /* On any allowed CPU? */ |
| 4587 | if (dest_cpu == NR_CPUS) | 5043 | if (dest_cpu == NR_CPUS) |
| 4588 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 5044 | dest_cpu = any_online_cpu(p->cpus_allowed); |
| 4589 | 5045 | ||
| 4590 | /* No more Mr. Nice Guy. */ | 5046 | /* No more Mr. Nice Guy. */ |
| 4591 | if (dest_cpu == NR_CPUS) { | 5047 | if (dest_cpu == NR_CPUS) { |
| 4592 | cpus_setall(tsk->cpus_allowed); | 5048 | rq = task_rq_lock(p, &flags); |
| 4593 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 5049 | cpus_setall(p->cpus_allowed); |
| 5050 | dest_cpu = any_online_cpu(p->cpus_allowed); | ||
| 5051 | task_rq_unlock(rq, &flags); | ||
| 4594 | 5052 | ||
| 4595 | /* | 5053 | /* |
| 4596 | * Don't tell them about moving exiting tasks or | 5054 | * Don't tell them about moving exiting tasks or |
| 4597 | * kernel threads (both mm NULL), since they never | 5055 | * kernel threads (both mm NULL), since they never |
| 4598 | * leave kernel. | 5056 | * leave kernel. |
| 4599 | */ | 5057 | */ |
| 4600 | if (tsk->mm && printk_ratelimit()) | 5058 | if (p->mm && printk_ratelimit()) |
| 4601 | printk(KERN_INFO "process %d (%s) no " | 5059 | printk(KERN_INFO "process %d (%s) no " |
| 4602 | "longer affine to cpu%d\n", | 5060 | "longer affine to cpu%d\n", |
| 4603 | tsk->pid, tsk->comm, dead_cpu); | 5061 | p->pid, p->comm, dead_cpu); |
| 4604 | } | 5062 | } |
| 4605 | __migrate_task(tsk, dead_cpu, dest_cpu); | 5063 | if (!__migrate_task(p, dead_cpu, dest_cpu)) |
| 5064 | goto restart; | ||
| 4606 | } | 5065 | } |
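
move_task_off_dead_cpu() now widens its search in three stages -- CPUs on the same node, then any CPU in p->cpus_allowed, then every online CPU -- and jumps back to restart if __migrate_task() reports the chosen target vanished before the move completed. A simplified userspace sketch of that widening cascade using plain bitmasks; pick_first_online() and the unsigned-long "masks" are illustrative, not the kernel's cpumask API:

    #include <stdio.h>

    #define NO_CPU (-1)

    /* Lowest set bit that is also online, or NO_CPU. */
    static int pick_first_online(unsigned long mask, unsigned long online)
    {
        unsigned long usable = mask & online;
        int cpu;

        for (cpu = 0; cpu < (int)(8 * sizeof(usable)); cpu++)
            if (usable & (1UL << cpu))
                return cpu;
        return NO_CPU;
    }

    int main(void)
    {
        unsigned long online  = 0xc;   /* cpus 2 and 3 still up     */
        unsigned long node    = 0x3;   /* dead cpu's node: cpus 0,1 */
        unsigned long allowed = 0x1;   /* task was pinned to cpu0   */
        int dest;

        dest = pick_first_online(node & allowed, online);  /* same node?   */
        if (dest == NO_CPU)
            dest = pick_first_online(allowed, online);     /* any allowed? */
        if (dest == NO_CPU)
            dest = pick_first_online(~0UL, online);        /* last resort  */

        printf("fallback destination: cpu%d\n", dest);
        return 0;
    }
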
| 4607 | 5066 | ||
| 4608 | /* | 5067 | /* |
| @@ -4612,9 +5071,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | |||
| 4612 | * their home CPUs. So we just add the counter to another CPU's counter, | 5071 | * their home CPUs. So we just add the counter to another CPU's counter, |
| 4613 | * to keep the global sum constant after CPU-down: | 5072 | * to keep the global sum constant after CPU-down: |
| 4614 | */ | 5073 | */ |
| 4615 | static void migrate_nr_uninterruptible(runqueue_t *rq_src) | 5074 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
| 4616 | { | 5075 | { |
| 4617 | runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 5076 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); |
| 4618 | unsigned long flags; | 5077 | unsigned long flags; |
| 4619 | 5078 | ||
| 4620 | local_irq_save(flags); | 5079 | local_irq_save(flags); |
| @@ -4628,48 +5087,51 @@ static void migrate_nr_uninterruptible(runqueue_t *rq_src) | |||
| 4628 | /* Run through task list and migrate tasks from the dead cpu. */ | 5087 | /* Run through task list and migrate tasks from the dead cpu. */ |
| 4629 | static void migrate_live_tasks(int src_cpu) | 5088 | static void migrate_live_tasks(int src_cpu) |
| 4630 | { | 5089 | { |
| 4631 | struct task_struct *tsk, *t; | 5090 | struct task_struct *p, *t; |
| 4632 | 5091 | ||
| 4633 | write_lock_irq(&tasklist_lock); | 5092 | write_lock_irq(&tasklist_lock); |
| 4634 | 5093 | ||
| 4635 | do_each_thread(t, tsk) { | 5094 | do_each_thread(t, p) { |
| 4636 | if (tsk == current) | 5095 | if (p == current) |
| 4637 | continue; | 5096 | continue; |
| 4638 | 5097 | ||
| 4639 | if (task_cpu(tsk) == src_cpu) | 5098 | if (task_cpu(p) == src_cpu) |
| 4640 | move_task_off_dead_cpu(src_cpu, tsk); | 5099 | move_task_off_dead_cpu(src_cpu, p); |
| 4641 | } while_each_thread(t, tsk); | 5100 | } while_each_thread(t, p); |
| 4642 | 5101 | ||
| 4643 | write_unlock_irq(&tasklist_lock); | 5102 | write_unlock_irq(&tasklist_lock); |
| 4644 | } | 5103 | } |
| 4645 | 5104 | ||
| 4646 | /* Schedules idle task to be the next runnable task on current CPU. | 5105 | /* Schedules idle task to be the next runnable task on current CPU. |
| 4647 | * It does so by boosting its priority to highest possible and adding it to | 5106 | * It does so by boosting its priority to highest possible and adding it to |
| 4648 | * the _front_ of runqueue. Used by CPU offline code. | 5107 | * the _front_ of the runqueue. Used by CPU offline code. |
| 4649 | */ | 5108 | */ |
| 4650 | void sched_idle_next(void) | 5109 | void sched_idle_next(void) |
| 4651 | { | 5110 | { |
| 4652 | int cpu = smp_processor_id(); | 5111 | int this_cpu = smp_processor_id(); |
| 4653 | runqueue_t *rq = this_rq(); | 5112 | struct rq *rq = cpu_rq(this_cpu); |
| 4654 | struct task_struct *p = rq->idle; | 5113 | struct task_struct *p = rq->idle; |
| 4655 | unsigned long flags; | 5114 | unsigned long flags; |
| 4656 | 5115 | ||
| 4657 | /* cpu has to be offline */ | 5116 | /* cpu has to be offline */ |
| 4658 | BUG_ON(cpu_online(cpu)); | 5117 | BUG_ON(cpu_online(this_cpu)); |
| 4659 | 5118 | ||
| 4660 | /* Strictly not necessary since rest of the CPUs are stopped by now | 5119 | /* |
| 4661 | * and interrupts disabled on current cpu. | 5120 | * Strictly not necessary since rest of the CPUs are stopped by now |
| 5121 | * and interrupts disabled on the current cpu. | ||
| 4662 | */ | 5122 | */ |
| 4663 | spin_lock_irqsave(&rq->lock, flags); | 5123 | spin_lock_irqsave(&rq->lock, flags); |
| 4664 | 5124 | ||
| 4665 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5125 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); |
| 4666 | /* Add idle task to _front_ of it's priority queue */ | 5126 | |
| 5127 | /* Add idle task to the _front_ of its priority queue: */ | ||
| 4667 | __activate_idle_task(p, rq); | 5128 | __activate_idle_task(p, rq); |
| 4668 | 5129 | ||
| 4669 | spin_unlock_irqrestore(&rq->lock, flags); | 5130 | spin_unlock_irqrestore(&rq->lock, flags); |
| 4670 | } | 5131 | } |
| 4671 | 5132 | ||
| 4672 | /* Ensures that the idle task is using init_mm right before its cpu goes | 5133 | /* |
| 5134 | * Ensures that the idle task is using init_mm right before its cpu goes | ||
| 4673 | * offline. | 5135 | * offline. |
| 4674 | */ | 5136 | */ |
| 4675 | void idle_task_exit(void) | 5137 | void idle_task_exit(void) |
| @@ -4683,17 +5145,17 @@ void idle_task_exit(void) | |||
| 4683 | mmdrop(mm); | 5145 | mmdrop(mm); |
| 4684 | } | 5146 | } |
| 4685 | 5147 | ||
| 4686 | static void migrate_dead(unsigned int dead_cpu, task_t *tsk) | 5148 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
| 4687 | { | 5149 | { |
| 4688 | struct runqueue *rq = cpu_rq(dead_cpu); | 5150 | struct rq *rq = cpu_rq(dead_cpu); |
| 4689 | 5151 | ||
| 4690 | /* Must be exiting, otherwise would be on tasklist. */ | 5152 | /* Must be exiting, otherwise would be on tasklist. */ |
| 4691 | BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); | 5153 | BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); |
| 4692 | 5154 | ||
| 4693 | /* Cannot have done final schedule yet: would have vanished. */ | 5155 | /* Cannot have done final schedule yet: would have vanished. */ |
| 4694 | BUG_ON(tsk->flags & PF_DEAD); | 5156 | BUG_ON(p->state == TASK_DEAD); |
| 4695 | 5157 | ||
| 4696 | get_task_struct(tsk); | 5158 | get_task_struct(p); |
| 4697 | 5159 | ||
| 4698 | /* | 5160 | /* |
| 4699 | * Drop lock around migration; if someone else moves it, | 5161 | * Drop lock around migration; if someone else moves it, |
| @@ -4701,25 +5163,25 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk) | |||
| 4701 | * fine. | 5163 | * fine. |
| 4702 | */ | 5164 | */ |
| 4703 | spin_unlock_irq(&rq->lock); | 5165 | spin_unlock_irq(&rq->lock); |
| 4704 | move_task_off_dead_cpu(dead_cpu, tsk); | 5166 | move_task_off_dead_cpu(dead_cpu, p); |
| 4705 | spin_lock_irq(&rq->lock); | 5167 | spin_lock_irq(&rq->lock); |
| 4706 | 5168 | ||
| 4707 | put_task_struct(tsk); | 5169 | put_task_struct(p); |
| 4708 | } | 5170 | } |
| 4709 | 5171 | ||
| 4710 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | 5172 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ |
| 4711 | static void migrate_dead_tasks(unsigned int dead_cpu) | 5173 | static void migrate_dead_tasks(unsigned int dead_cpu) |
| 4712 | { | 5174 | { |
| 4713 | unsigned arr, i; | 5175 | struct rq *rq = cpu_rq(dead_cpu); |
| 4714 | struct runqueue *rq = cpu_rq(dead_cpu); | 5176 | unsigned int arr, i; |
| 4715 | 5177 | ||
| 4716 | for (arr = 0; arr < 2; arr++) { | 5178 | for (arr = 0; arr < 2; arr++) { |
| 4717 | for (i = 0; i < MAX_PRIO; i++) { | 5179 | for (i = 0; i < MAX_PRIO; i++) { |
| 4718 | struct list_head *list = &rq->arrays[arr].queue[i]; | 5180 | struct list_head *list = &rq->arrays[arr].queue[i]; |
| 5181 | |||
| 4719 | while (!list_empty(list)) | 5182 | while (!list_empty(list)) |
| 4720 | migrate_dead(dead_cpu, | 5183 | migrate_dead(dead_cpu, list_entry(list->next, |
| 4721 | list_entry(list->next, task_t, | 5184 | struct task_struct, run_list)); |
| 4722 | run_list)); | ||
| 4723 | } | 5185 | } |
| 4724 | } | 5186 | } |
| 4725 | } | 5187 | } |
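
migrate_dead_tasks() keeps draining every per-priority list in both runqueue arrays (active and expired) until each list is empty; the hunk itself mainly respells the task type as struct task_struct. A toy sketch of the same two-array drain pattern; the array layout below is illustrative, not the real runqueue:

    #include <stdio.h>

    #define NPRIO 4

    /* Each slot holds a count of queued "tasks" at that priority. */
    struct prio_array { int queue[NPRIO]; };

    int main(void)
    {
        struct prio_array arrays[2] = {
            { { 1, 0, 2, 0 } },   /* active  */
            { { 0, 3, 0, 1 } },   /* expired */
        };
        int arr, prio;

        for (arr = 0; arr < 2; arr++)
            for (prio = 0; prio < NPRIO; prio++)
                while (arrays[arr].queue[prio] > 0) {
                    arrays[arr].queue[prio]--;   /* "migrate" one task */
                    printf("migrated task from array %d prio %d\n", arr, prio);
                }
        return 0;
    }
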
| @@ -4729,13 +5191,13 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
| 4729 | * migration_call - callback that gets triggered when a CPU is added. | 5191 | * migration_call - callback that gets triggered when a CPU is added. |
| 4730 | * Here we can start up the necessary migration thread for the new CPU. | 5192 | * Here we can start up the necessary migration thread for the new CPU. |
| 4731 | */ | 5193 | */ |
| 4732 | static int migration_call(struct notifier_block *nfb, unsigned long action, | 5194 | static int __cpuinit |
| 4733 | void *hcpu) | 5195 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
| 4734 | { | 5196 | { |
| 4735 | int cpu = (long)hcpu; | ||
| 4736 | struct task_struct *p; | 5197 | struct task_struct *p; |
| 4737 | struct runqueue *rq; | 5198 | int cpu = (long)hcpu; |
| 4738 | unsigned long flags; | 5199 | unsigned long flags; |
| 5200 | struct rq *rq; | ||
| 4739 | 5201 | ||
| 4740 | switch (action) { | 5202 | switch (action) { |
| 4741 | case CPU_UP_PREPARE: | 5203 | case CPU_UP_PREPARE: |
| @@ -4750,18 +5212,23 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
| 4750 | task_rq_unlock(rq, &flags); | 5212 | task_rq_unlock(rq, &flags); |
| 4751 | cpu_rq(cpu)->migration_thread = p; | 5213 | cpu_rq(cpu)->migration_thread = p; |
| 4752 | break; | 5214 | break; |
| 5215 | |||
| 4753 | case CPU_ONLINE: | 5216 | case CPU_ONLINE: |
| 4754 | /* Strictly unneccessary, as first user will wake it. */ | 5217 | /* Strictly unneccessary, as first user will wake it. */ |
| 4755 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5218 | wake_up_process(cpu_rq(cpu)->migration_thread); |
| 4756 | break; | 5219 | break; |
| 5220 | |||
| 4757 | #ifdef CONFIG_HOTPLUG_CPU | 5221 | #ifdef CONFIG_HOTPLUG_CPU |
| 4758 | case CPU_UP_CANCELED: | 5222 | case CPU_UP_CANCELED: |
| 5223 | if (!cpu_rq(cpu)->migration_thread) | ||
| 5224 | break; | ||
| 4759 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 5225 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
| 4760 | kthread_bind(cpu_rq(cpu)->migration_thread, | 5226 | kthread_bind(cpu_rq(cpu)->migration_thread, |
| 4761 | any_online_cpu(cpu_online_map)); | 5227 | any_online_cpu(cpu_online_map)); |
| 4762 | kthread_stop(cpu_rq(cpu)->migration_thread); | 5228 | kthread_stop(cpu_rq(cpu)->migration_thread); |
| 4763 | cpu_rq(cpu)->migration_thread = NULL; | 5229 | cpu_rq(cpu)->migration_thread = NULL; |
| 4764 | break; | 5230 | break; |
| 5231 | |||
| 4765 | case CPU_DEAD: | 5232 | case CPU_DEAD: |
| 4766 | migrate_live_tasks(cpu); | 5233 | migrate_live_tasks(cpu); |
| 4767 | rq = cpu_rq(cpu); | 5234 | rq = cpu_rq(cpu); |
| @@ -4782,9 +5249,10 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
| 4782 | * the requestors. */ | 5249 | * the requestors. */ |
| 4783 | spin_lock_irq(&rq->lock); | 5250 | spin_lock_irq(&rq->lock); |
| 4784 | while (!list_empty(&rq->migration_queue)) { | 5251 | while (!list_empty(&rq->migration_queue)) { |
| 4785 | migration_req_t *req; | 5252 | struct migration_req *req; |
| 5253 | |||
| 4786 | req = list_entry(rq->migration_queue.next, | 5254 | req = list_entry(rq->migration_queue.next, |
| 4787 | migration_req_t, list); | 5255 | struct migration_req, list); |
| 4788 | list_del_init(&req->list); | 5256 | list_del_init(&req->list); |
| 4789 | complete(&req->done); | 5257 | complete(&req->done); |
| 4790 | } | 5258 | } |
| @@ -4798,7 +5266,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
| 4798 | /* Register at highest priority so that task migration (migrate_all_tasks) | 5266 | /* Register at highest priority so that task migration (migrate_all_tasks) |
| 4799 | * happens before everything else. | 5267 | * happens before everything else. |
| 4800 | */ | 5268 | */ |
| 4801 | static struct notifier_block migration_notifier = { | 5269 | static struct notifier_block __cpuinitdata migration_notifier = { |
| 4802 | .notifier_call = migration_call, | 5270 | .notifier_call = migration_call, |
| 4803 | .priority = 10 | 5271 | .priority = 10 |
| 4804 | }; | 5272 | }; |
| @@ -4806,10 +5274,14 @@ static struct notifier_block migration_notifier = { | |||
| 4806 | int __init migration_init(void) | 5274 | int __init migration_init(void) |
| 4807 | { | 5275 | { |
| 4808 | void *cpu = (void *)(long)smp_processor_id(); | 5276 | void *cpu = (void *)(long)smp_processor_id(); |
| 4809 | /* Start one for boot CPU. */ | 5277 | int err; |
| 4810 | migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 5278 | |
| 5279 | /* Start one for the boot CPU: */ | ||
| 5280 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | ||
| 5281 | BUG_ON(err == NOTIFY_BAD); | ||
| 4811 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 5282 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
| 4812 | register_cpu_notifier(&migration_notifier); | 5283 | register_cpu_notifier(&migration_notifier); |
| 5284 | |||
| 4813 | return 0; | 5285 | return 0; |
| 4814 | } | 5286 | } |
| 4815 | #endif | 5287 | #endif |
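
migration_init() now captures the notifier's return value for the boot CPU and BUG_ON()s NOTIFY_BAD instead of silently ignoring a failed CPU_UP_PREPARE, while the later CPU_ONLINE call stays best-effort. A hedged userspace sketch of that "fatal setup, tolerant follow-up" split; setup(), start() and the enum values are stand-ins, not the kernel's notifier API:

    #include <assert.h>
    #include <stdio.h>

    enum { NOTIFY_OK, NOTIFY_BAD };   /* illustrative values only */

    static int setup(int cpu)  { return cpu >= 0 ? NOTIFY_OK : NOTIFY_BAD; }
    static void start(int cpu) { printf("migration thread for cpu%d woken\n", cpu); }

    int main(void)
    {
        int boot_cpu = 0;
        int err = setup(boot_cpu);

        /* Boot-CPU setup must succeed; anything else is treated as fatal. */
        assert(err != NOTIFY_BAD);
        start(boot_cpu);              /* later steps remain best-effort */
        return 0;
    }
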
| @@ -4905,7 +5377,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 4905 | } while (sd); | 5377 | } while (sd); |
| 4906 | } | 5378 | } |
| 4907 | #else | 5379 | #else |
| 4908 | #define sched_domain_debug(sd, cpu) {} | 5380 | # define sched_domain_debug(sd, cpu) do { } while (0) |
| 4909 | #endif | 5381 | #endif |
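
Replacing the empty-braces stub with do { } while (0) is the usual trick for making a no-op macro behave like a single statement: the braces-only form breaks when the macro is the sole body of an if followed by an else, because the trailing semicolon becomes a second statement. A minimal standalone demonstration; sched_domain_debug here is just a local macro, not the scheduler's:

    #include <stdio.h>

    /* Statement-like no-op: safe anywhere a single statement is expected. */
    #define sched_domain_debug(sd, cpu) do { } while (0)

    /*
     * The older form, "#define sched_domain_debug(sd, cpu) {}", would expand
     * the if-body below to "{} ;", and the following "else" would no longer
     * attach to the "if" -- a compile error.
     */
    int main(void)
    {
        int debug = 0;

        if (debug)
            sched_domain_debug(NULL, 0);
        else
            printf("domain debugging disabled\n");
        return 0;
    }
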
| 4910 | 5382 | ||
| 4911 | static int sd_degenerate(struct sched_domain *sd) | 5383 | static int sd_degenerate(struct sched_domain *sd) |
| @@ -4931,8 +5403,8 @@ static int sd_degenerate(struct sched_domain *sd) | |||
| 4931 | return 1; | 5403 | return 1; |
| 4932 | } | 5404 | } |
| 4933 | 5405 | ||
| 4934 | static int sd_parent_degenerate(struct sched_domain *sd, | 5406 | static int |
| 4935 | struct sched_domain *parent) | 5407 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) |
| 4936 | { | 5408 | { |
| 4937 | unsigned long cflags = sd->flags, pflags = parent->flags; | 5409 | unsigned long cflags = sd->flags, pflags = parent->flags; |
| 4938 | 5410 | ||
| @@ -4965,7 +5437,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, | |||
| 4965 | */ | 5437 | */ |
| 4966 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 5438 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) |
| 4967 | { | 5439 | { |
| 4968 | runqueue_t *rq = cpu_rq(cpu); | 5440 | struct rq *rq = cpu_rq(cpu); |
| 4969 | struct sched_domain *tmp; | 5441 | struct sched_domain *tmp; |
| 4970 | 5442 | ||
| 4971 | /* Remove the sched domains which do not contribute to scheduling. */ | 5443 | /* Remove the sched domains which do not contribute to scheduling. */ |
| @@ -5227,8 +5699,8 @@ static void touch_cache(void *__cache, unsigned long __size) | |||
| 5227 | /* | 5699 | /* |
| 5228 | * Measure the cache-cost of one task migration. Returns in units of nsec. | 5700 | * Measure the cache-cost of one task migration. Returns in units of nsec. |
| 5229 | */ | 5701 | */ |
| 5230 | static unsigned long long measure_one(void *cache, unsigned long size, | 5702 | static unsigned long long |
| 5231 | int source, int target) | 5703 | measure_one(void *cache, unsigned long size, int source, int target) |
| 5232 | { | 5704 | { |
| 5233 | cpumask_t mask, saved_mask; | 5705 | cpumask_t mask, saved_mask; |
| 5234 | unsigned long long t0, t1, t2, t3, cost; | 5706 | unsigned long long t0, t1, t2, t3, cost; |
| @@ -5380,7 +5852,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
| 5380 | cache = vmalloc(max_size); | 5852 | cache = vmalloc(max_size); |
| 5381 | if (!cache) { | 5853 | if (!cache) { |
| 5382 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | 5854 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); |
| 5383 | return 1000000; // return 1 msec on very small boxen | 5855 | return 1000000; /* return 1 msec on very small boxen */ |
| 5384 | } | 5856 | } |
| 5385 | 5857 | ||
| 5386 | while (size <= max_size) { | 5858 | while (size <= max_size) { |
| @@ -5578,9 +6050,9 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
| 5578 | */ | 6050 | */ |
| 5579 | static cpumask_t sched_domain_node_span(int node) | 6051 | static cpumask_t sched_domain_node_span(int node) |
| 5580 | { | 6052 | { |
| 5581 | int i; | ||
| 5582 | cpumask_t span, nodemask; | ||
| 5583 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 6053 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); |
| 6054 | cpumask_t span, nodemask; | ||
| 6055 | int i; | ||
| 5584 | 6056 | ||
| 5585 | cpus_clear(span); | 6057 | cpus_clear(span); |
| 5586 | bitmap_zero(used_nodes, MAX_NUMNODES); | 6058 | bitmap_zero(used_nodes, MAX_NUMNODES); |
| @@ -5591,6 +6063,7 @@ static cpumask_t sched_domain_node_span(int node) | |||
| 5591 | 6063 | ||
| 5592 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6064 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
| 5593 | int next_node = find_next_best_node(node, used_nodes); | 6065 | int next_node = find_next_best_node(node, used_nodes); |
| 6066 | |||
| 5594 | nodemask = node_to_cpumask(next_node); | 6067 | nodemask = node_to_cpumask(next_node); |
| 5595 | cpus_or(span, span, nodemask); | 6068 | cpus_or(span, span, nodemask); |
| 5596 | } | 6069 | } |
| @@ -5599,22 +6072,27 @@ static cpumask_t sched_domain_node_span(int node) | |||
| 5599 | } | 6072 | } |
| 5600 | #endif | 6073 | #endif |
| 5601 | 6074 | ||
| 6075 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
| 6076 | |||
| 5602 | /* | 6077 | /* |
| 5603 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | 6078 | * SMT sched-domains: |
| 5604 | * can switch it on easily if needed. | ||
| 5605 | */ | 6079 | */ |
| 5606 | #ifdef CONFIG_SCHED_SMT | 6080 | #ifdef CONFIG_SCHED_SMT |
| 5607 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6081 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
| 5608 | static struct sched_group sched_group_cpus[NR_CPUS]; | 6082 | static struct sched_group sched_group_cpus[NR_CPUS]; |
| 6083 | |||
| 5609 | static int cpu_to_cpu_group(int cpu) | 6084 | static int cpu_to_cpu_group(int cpu) |
| 5610 | { | 6085 | { |
| 5611 | return cpu; | 6086 | return cpu; |
| 5612 | } | 6087 | } |
| 5613 | #endif | 6088 | #endif |
| 5614 | 6089 | ||
| 6090 | /* | ||
| 6091 | * multi-core sched-domains: | ||
| 6092 | */ | ||
| 5615 | #ifdef CONFIG_SCHED_MC | 6093 | #ifdef CONFIG_SCHED_MC |
| 5616 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6094 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
| 5617 | static struct sched_group sched_group_core[NR_CPUS]; | 6095 | static struct sched_group *sched_group_core_bycpu[NR_CPUS]; |
| 5618 | #endif | 6096 | #endif |
| 5619 | 6097 | ||
| 5620 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6098 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
| @@ -5630,10 +6108,11 @@ static int cpu_to_core_group(int cpu) | |||
| 5630 | #endif | 6108 | #endif |
| 5631 | 6109 | ||
| 5632 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6110 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
| 5633 | static struct sched_group sched_group_phys[NR_CPUS]; | 6111 | static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; |
| 6112 | |||
| 5634 | static int cpu_to_phys_group(int cpu) | 6113 | static int cpu_to_phys_group(int cpu) |
| 5635 | { | 6114 | { |
| 5636 | #if defined(CONFIG_SCHED_MC) | 6115 | #ifdef CONFIG_SCHED_MC |
| 5637 | cpumask_t mask = cpu_coregroup_map(cpu); | 6116 | cpumask_t mask = cpu_coregroup_map(cpu); |
| 5638 | return first_cpu(mask); | 6117 | return first_cpu(mask); |
| 5639 | #elif defined(CONFIG_SCHED_SMT) | 6118 | #elif defined(CONFIG_SCHED_SMT) |
| @@ -5687,13 +6166,74 @@ next_sg: | |||
| 5687 | } | 6166 | } |
| 5688 | #endif | 6167 | #endif |
| 5689 | 6168 | ||
| 6169 | /* Free memory allocated for various sched_group structures */ | ||
| 6170 | static void free_sched_groups(const cpumask_t *cpu_map) | ||
| 6171 | { | ||
| 6172 | int cpu; | ||
| 6173 | #ifdef CONFIG_NUMA | ||
| 6174 | int i; | ||
| 6175 | |||
| 6176 | for_each_cpu_mask(cpu, *cpu_map) { | ||
| 6177 | struct sched_group *sched_group_allnodes | ||
| 6178 | = sched_group_allnodes_bycpu[cpu]; | ||
| 6179 | struct sched_group **sched_group_nodes | ||
| 6180 | = sched_group_nodes_bycpu[cpu]; | ||
| 6181 | |||
| 6182 | if (sched_group_allnodes) { | ||
| 6183 | kfree(sched_group_allnodes); | ||
| 6184 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
| 6185 | } | ||
| 6186 | |||
| 6187 | if (!sched_group_nodes) | ||
| 6188 | continue; | ||
| 6189 | |||
| 6190 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
| 6191 | cpumask_t nodemask = node_to_cpumask(i); | ||
| 6192 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
| 6193 | |||
| 6194 | cpus_and(nodemask, nodemask, *cpu_map); | ||
| 6195 | if (cpus_empty(nodemask)) | ||
| 6196 | continue; | ||
| 6197 | |||
| 6198 | if (sg == NULL) | ||
| 6199 | continue; | ||
| 6200 | sg = sg->next; | ||
| 6201 | next_sg: | ||
| 6202 | oldsg = sg; | ||
| 6203 | sg = sg->next; | ||
| 6204 | kfree(oldsg); | ||
| 6205 | if (oldsg != sched_group_nodes[i]) | ||
| 6206 | goto next_sg; | ||
| 6207 | } | ||
| 6208 | kfree(sched_group_nodes); | ||
| 6209 | sched_group_nodes_bycpu[cpu] = NULL; | ||
| 6210 | } | ||
| 6211 | #endif | ||
| 6212 | for_each_cpu_mask(cpu, *cpu_map) { | ||
| 6213 | if (sched_group_phys_bycpu[cpu]) { | ||
| 6214 | kfree(sched_group_phys_bycpu[cpu]); | ||
| 6215 | sched_group_phys_bycpu[cpu] = NULL; | ||
| 6216 | } | ||
| 6217 | #ifdef CONFIG_SCHED_MC | ||
| 6218 | if (sched_group_core_bycpu[cpu]) { | ||
| 6219 | kfree(sched_group_core_bycpu[cpu]); | ||
| 6220 | sched_group_core_bycpu[cpu] = NULL; | ||
| 6221 | } | ||
| 6222 | #endif | ||
| 6223 | } | ||
| 6224 | } | ||
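
free_sched_groups() has to cope with per-node sched_group lists that are kept circular (each sg->next eventually leads back to the head), so it starts freeing at head->next and stops once the walk returns to the saved head. A self-contained sketch of freeing such a circular singly linked list; struct group and make_ring() are illustrative only:

    #include <stdlib.h>

    struct group { struct group *next; };

    /* Build a circular list of n nodes and return its head. */
    static struct group *make_ring(int n)
    {
        struct group *head = malloc(sizeof(*head)), *prev = head;
        int i;

        for (i = 1; i < n; i++) {
            prev->next = malloc(sizeof(*prev));
            prev = prev->next;
        }
        prev->next = head;              /* close the ring */
        return head;
    }

    /* Walk from head->next until the cursor comes back around,
     * then free the head itself last. */
    static void free_ring(struct group *head)
    {
        struct group *sg = head->next;

        while (sg != head) {
            struct group *old = sg;

            sg = sg->next;
            free(old);
        }
        free(head);
    }

    int main(void)
    {
        free_ring(make_ring(4));
        return 0;
    }
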
| 6225 | |||
| 5690 | /* | 6226 | /* |
| 5691 | * Build sched domains for a given set of cpus and attach the sched domains | 6227 | * Build sched domains for a given set of cpus and attach the sched domains |
| 5692 | * to the individual cpus | 6228 | * to the individual cpus |
| 5693 | */ | 6229 | */ |
| 5694 | void build_sched_domains(const cpumask_t *cpu_map) | 6230 | static int build_sched_domains(const cpumask_t *cpu_map) |
| 5695 | { | 6231 | { |
| 5696 | int i; | 6232 | int i; |
| 6233 | struct sched_group *sched_group_phys = NULL; | ||
| 6234 | #ifdef CONFIG_SCHED_MC | ||
| 6235 | struct sched_group *sched_group_core = NULL; | ||
| 6236 | #endif | ||
| 5697 | #ifdef CONFIG_NUMA | 6237 | #ifdef CONFIG_NUMA |
| 5698 | struct sched_group **sched_group_nodes = NULL; | 6238 | struct sched_group **sched_group_nodes = NULL; |
| 5699 | struct sched_group *sched_group_allnodes = NULL; | 6239 | struct sched_group *sched_group_allnodes = NULL; |
| @@ -5701,11 +6241,11 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5701 | /* | 6241 | /* |
| 5702 | * Allocate the per-node list of sched groups | 6242 | * Allocate the per-node list of sched groups |
| 5703 | */ | 6243 | */ |
| 5704 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 6244 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, |
| 5705 | GFP_ATOMIC); | 6245 | GFP_KERNEL); |
| 5706 | if (!sched_group_nodes) { | 6246 | if (!sched_group_nodes) { |
| 5707 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6247 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
| 5708 | return; | 6248 | return -ENOMEM; |
| 5709 | } | 6249 | } |
| 5710 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6250 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
| 5711 | #endif | 6251 | #endif |
| @@ -5731,7 +6271,7 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5731 | if (!sched_group_allnodes) { | 6271 | if (!sched_group_allnodes) { |
| 5732 | printk(KERN_WARNING | 6272 | printk(KERN_WARNING |
| 5733 | "Can not alloc allnodes sched group\n"); | 6273 | "Can not alloc allnodes sched group\n"); |
| 5734 | break; | 6274 | goto error; |
| 5735 | } | 6275 | } |
| 5736 | sched_group_allnodes_bycpu[i] | 6276 | sched_group_allnodes_bycpu[i] |
| 5737 | = sched_group_allnodes; | 6277 | = sched_group_allnodes; |
| @@ -5752,6 +6292,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5752 | cpus_and(sd->span, sd->span, *cpu_map); | 6292 | cpus_and(sd->span, sd->span, *cpu_map); |
| 5753 | #endif | 6293 | #endif |
| 5754 | 6294 | ||
| 6295 | if (!sched_group_phys) { | ||
| 6296 | sched_group_phys | ||
| 6297 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
| 6298 | GFP_KERNEL); | ||
| 6299 | if (!sched_group_phys) { | ||
| 6300 | printk (KERN_WARNING "Can not alloc phys sched" | ||
| 6301 | "group\n"); | ||
| 6302 | goto error; | ||
| 6303 | } | ||
| 6304 | sched_group_phys_bycpu[i] = sched_group_phys; | ||
| 6305 | } | ||
| 6306 | |||
| 5755 | p = sd; | 6307 | p = sd; |
| 5756 | sd = &per_cpu(phys_domains, i); | 6308 | sd = &per_cpu(phys_domains, i); |
| 5757 | group = cpu_to_phys_group(i); | 6309 | group = cpu_to_phys_group(i); |
| @@ -5761,6 +6313,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5761 | sd->groups = &sched_group_phys[group]; | 6313 | sd->groups = &sched_group_phys[group]; |
| 5762 | 6314 | ||
| 5763 | #ifdef CONFIG_SCHED_MC | 6315 | #ifdef CONFIG_SCHED_MC |
| 6316 | if (!sched_group_core) { | ||
| 6317 | sched_group_core | ||
| 6318 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
| 6319 | GFP_KERNEL); | ||
| 6320 | if (!sched_group_core) { | ||
| 6321 | printk (KERN_WARNING "Can not alloc core sched" | ||
| 6322 | "group\n"); | ||
| 6323 | goto error; | ||
| 6324 | } | ||
| 6325 | sched_group_core_bycpu[i] = sched_group_core; | ||
| 6326 | } | ||
| 6327 | |||
| 5764 | p = sd; | 6328 | p = sd; |
| 5765 | sd = &per_cpu(core_domains, i); | 6329 | sd = &per_cpu(core_domains, i); |
| 5766 | group = cpu_to_core_group(i); | 6330 | group = cpu_to_core_group(i); |
| @@ -5844,24 +6408,21 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5844 | domainspan = sched_domain_node_span(i); | 6408 | domainspan = sched_domain_node_span(i); |
| 5845 | cpus_and(domainspan, domainspan, *cpu_map); | 6409 | cpus_and(domainspan, domainspan, *cpu_map); |
| 5846 | 6410 | ||
| 5847 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6411 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
| 6412 | if (!sg) { | ||
| 6413 | printk(KERN_WARNING "Can not alloc domain group for " | ||
| 6414 | "node %d\n", i); | ||
| 6415 | goto error; | ||
| 6416 | } | ||
| 5848 | sched_group_nodes[i] = sg; | 6417 | sched_group_nodes[i] = sg; |
| 5849 | for_each_cpu_mask(j, nodemask) { | 6418 | for_each_cpu_mask(j, nodemask) { |
| 5850 | struct sched_domain *sd; | 6419 | struct sched_domain *sd; |
| 5851 | sd = &per_cpu(node_domains, j); | 6420 | sd = &per_cpu(node_domains, j); |
| 5852 | sd->groups = sg; | 6421 | sd->groups = sg; |
| 5853 | if (sd->groups == NULL) { | ||
| 5854 | /* Turn off balancing if we have no groups */ | ||
| 5855 | sd->flags = 0; | ||
| 5856 | } | ||
| 5857 | } | ||
| 5858 | if (!sg) { | ||
| 5859 | printk(KERN_WARNING | ||
| 5860 | "Can not alloc domain group for node %d\n", i); | ||
| 5861 | continue; | ||
| 5862 | } | 6422 | } |
| 5863 | sg->cpu_power = 0; | 6423 | sg->cpu_power = 0; |
| 5864 | sg->cpumask = nodemask; | 6424 | sg->cpumask = nodemask; |
| 6425 | sg->next = sg; | ||
| 5865 | cpus_or(covered, covered, nodemask); | 6426 | cpus_or(covered, covered, nodemask); |
| 5866 | prev = sg; | 6427 | prev = sg; |
| 5867 | 6428 | ||
| @@ -5880,54 +6441,90 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5880 | if (cpus_empty(tmp)) | 6441 | if (cpus_empty(tmp)) |
| 5881 | continue; | 6442 | continue; |
| 5882 | 6443 | ||
| 5883 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6444 | sg = kmalloc_node(sizeof(struct sched_group), |
| 6445 | GFP_KERNEL, i); | ||
| 5884 | if (!sg) { | 6446 | if (!sg) { |
| 5885 | printk(KERN_WARNING | 6447 | printk(KERN_WARNING |
| 5886 | "Can not alloc domain group for node %d\n", j); | 6448 | "Can not alloc domain group for node %d\n", j); |
| 5887 | break; | 6449 | goto error; |
| 5888 | } | 6450 | } |
| 5889 | sg->cpu_power = 0; | 6451 | sg->cpu_power = 0; |
| 5890 | sg->cpumask = tmp; | 6452 | sg->cpumask = tmp; |
| 6453 | sg->next = prev->next; | ||
| 5891 | cpus_or(covered, covered, tmp); | 6454 | cpus_or(covered, covered, tmp); |
| 5892 | prev->next = sg; | 6455 | prev->next = sg; |
| 5893 | prev = sg; | 6456 | prev = sg; |
| 5894 | } | 6457 | } |
| 5895 | prev->next = sched_group_nodes[i]; | ||
| 5896 | } | 6458 | } |
| 5897 | #endif | 6459 | #endif |
| 5898 | 6460 | ||
| 5899 | /* Calculate CPU power for physical packages and nodes */ | 6461 | /* Calculate CPU power for physical packages and nodes */ |
| 6462 | #ifdef CONFIG_SCHED_SMT | ||
| 5900 | for_each_cpu_mask(i, *cpu_map) { | 6463 | for_each_cpu_mask(i, *cpu_map) { |
| 5901 | int power; | ||
| 5902 | struct sched_domain *sd; | 6464 | struct sched_domain *sd; |
| 5903 | #ifdef CONFIG_SCHED_SMT | ||
| 5904 | sd = &per_cpu(cpu_domains, i); | 6465 | sd = &per_cpu(cpu_domains, i); |
| 5905 | power = SCHED_LOAD_SCALE; | 6466 | sd->groups->cpu_power = SCHED_LOAD_SCALE; |
| 5906 | sd->groups->cpu_power = power; | 6467 | } |
| 5907 | #endif | 6468 | #endif |
| 5908 | #ifdef CONFIG_SCHED_MC | 6469 | #ifdef CONFIG_SCHED_MC |
| 6470 | for_each_cpu_mask(i, *cpu_map) { | ||
| 6471 | int power; | ||
| 6472 | struct sched_domain *sd; | ||
| 5909 | sd = &per_cpu(core_domains, i); | 6473 | sd = &per_cpu(core_domains, i); |
| 5910 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | 6474 | if (sched_smt_power_savings) |
| 6475 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); | ||
| 6476 | else | ||
| 6477 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | ||
| 5911 | * SCHED_LOAD_SCALE / 10; | 6478 | * SCHED_LOAD_SCALE / 10; |
| 5912 | sd->groups->cpu_power = power; | 6479 | sd->groups->cpu_power = power; |
| 6480 | } | ||
| 6481 | #endif | ||
| 5913 | 6482 | ||
| 6483 | for_each_cpu_mask(i, *cpu_map) { | ||
| 6484 | struct sched_domain *sd; | ||
| 6485 | #ifdef CONFIG_SCHED_MC | ||
| 5914 | sd = &per_cpu(phys_domains, i); | 6486 | sd = &per_cpu(phys_domains, i); |
| 6487 | if (i != first_cpu(sd->groups->cpumask)) | ||
| 6488 | continue; | ||
| 5915 | 6489 | ||
| 5916 | /* | 6490 | sd->groups->cpu_power = 0; |
| 5917 | * This has to be < 2 * SCHED_LOAD_SCALE | 6491 | if (sched_mc_power_savings || sched_smt_power_savings) { |
| 5918 | * Lets keep it SCHED_LOAD_SCALE, so that | 6492 | int j; |
| 5919 | * while calculating NUMA group's cpu_power | 6493 | |
| 5920 | * we can simply do | 6494 | for_each_cpu_mask(j, sd->groups->cpumask) { |
| 5921 | * numa_group->cpu_power += phys_group->cpu_power; | 6495 | struct sched_domain *sd1; |
| 5922 | * | 6496 | sd1 = &per_cpu(core_domains, j); |
| 5923 | * See "only add power once for each physical pkg" | 6497 | /* |
| 5924 | * comment below | 6498 | * for each core we will add once |
| 5925 | */ | 6499 | * to the group in physical domain |
| 5926 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | 6500 | */ |
| 6501 | if (j != first_cpu(sd1->groups->cpumask)) | ||
| 6502 | continue; | ||
| 6503 | |||
| 6504 | if (sched_smt_power_savings) | ||
| 6505 | sd->groups->cpu_power += sd1->groups->cpu_power; | ||
| 6506 | else | ||
| 6507 | sd->groups->cpu_power += SCHED_LOAD_SCALE; | ||
| 6508 | } | ||
| 6509 | } else | ||
| 6510 | /* | ||
| 6511 | * This has to be < 2 * SCHED_LOAD_SCALE | ||
| 6512 | * Lets keep it SCHED_LOAD_SCALE, so that | ||
| 6513 | * while calculating NUMA group's cpu_power | ||
| 6514 | * we can simply do | ||
| 6515 | * numa_group->cpu_power += phys_group->cpu_power; | ||
| 6516 | * | ||
| 6517 | * See "only add power once for each physical pkg" | ||
| 6518 | * comment below | ||
| 6519 | */ | ||
| 6520 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | ||
| 5927 | #else | 6521 | #else |
| 6522 | int power; | ||
| 5928 | sd = &per_cpu(phys_domains, i); | 6523 | sd = &per_cpu(phys_domains, i); |
| 5929 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 6524 | if (sched_smt_power_savings) |
| 5930 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 6525 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); |
| 6526 | else | ||
| 6527 | power = SCHED_LOAD_SCALE; | ||
| 5931 | sd->groups->cpu_power = power; | 6528 | sd->groups->cpu_power = power; |
| 5932 | #endif | 6529 | #endif |
| 5933 | } | 6530 | } |
| @@ -5936,7 +6533,12 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5936 | for (i = 0; i < MAX_NUMNODES; i++) | 6533 | for (i = 0; i < MAX_NUMNODES; i++) |
| 5937 | init_numa_sched_groups_power(sched_group_nodes[i]); | 6534 | init_numa_sched_groups_power(sched_group_nodes[i]); |
| 5938 | 6535 | ||
| 5939 | init_numa_sched_groups_power(sched_group_allnodes); | 6536 | if (sched_group_allnodes) { |
| 6537 | int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); | ||
| 6538 | struct sched_group *sg = &sched_group_allnodes[group]; | ||
| 6539 | |||
| 6540 | init_numa_sched_groups_power(sg); | ||
| 6541 | } | ||
| 5940 | #endif | 6542 | #endif |
| 5941 | 6543 | ||
| 5942 | /* Attach the domains */ | 6544 | /* Attach the domains */ |
| @@ -5955,13 +6557,20 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
| 5955 | * Tune cache-hot values: | 6557 | * Tune cache-hot values: |
| 5956 | */ | 6558 | */ |
| 5957 | calibrate_migration_costs(cpu_map); | 6559 | calibrate_migration_costs(cpu_map); |
| 6560 | |||
| 6561 | return 0; | ||
| 6562 | |||
| 6563 | error: | ||
| 6564 | free_sched_groups(cpu_map); | ||
| 6565 | return -ENOMEM; | ||
| 5958 | } | 6566 | } |
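
build_sched_domains() now returns an error code and funnels every allocation failure through a single error: label that calls free_sched_groups() before returning -ENOMEM -- the standard kernel single-exit cleanup idiom. A compact userspace illustration of the same shape, with malloc() standing in for the various group allocations:

    #include <errno.h>
    #include <stdlib.h>

    static int build(void)
    {
        void *a = NULL, *b = NULL, *c = NULL;

        a = malloc(64);
        if (!a)
            goto error;
        b = malloc(64);
        if (!b)
            goto error;
        c = malloc(64);
        if (!c)
            goto error;

        /* ... use a, b, c ... */
        free(c); free(b); free(a);
        return 0;

    error:
        /* One cleanup path handles every partially-built state;
         * free(NULL) is harmless, so no per-label unwinding is needed. */
        free(c);
        free(b);
        free(a);
        return -ENOMEM;
    }

    int main(void) { return build() ? 1 : 0; }
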
| 5959 | /* | 6567 | /* |
| 5960 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6568 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
| 5961 | */ | 6569 | */ |
| 5962 | static void arch_init_sched_domains(const cpumask_t *cpu_map) | 6570 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
| 5963 | { | 6571 | { |
| 5964 | cpumask_t cpu_default_map; | 6572 | cpumask_t cpu_default_map; |
| 6573 | int err; | ||
| 5965 | 6574 | ||
| 5966 | /* | 6575 | /* |
| 5967 | * Setup mask for cpus without special case scheduling requirements. | 6576 | * Setup mask for cpus without special case scheduling requirements. |
| @@ -5970,51 +6579,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map) | |||
| 5970 | */ | 6579 | */ |
| 5971 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | 6580 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); |
| 5972 | 6581 | ||
| 5973 | build_sched_domains(&cpu_default_map); | 6582 | err = build_sched_domains(&cpu_default_map); |
| 6583 | |||
| 6584 | return err; | ||
| 5974 | } | 6585 | } |
| 5975 | 6586 | ||
| 5976 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6587 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
| 5977 | { | 6588 | { |
| 5978 | #ifdef CONFIG_NUMA | 6589 | free_sched_groups(cpu_map); |
| 5979 | int i; | ||
| 5980 | int cpu; | ||
| 5981 | |||
| 5982 | for_each_cpu_mask(cpu, *cpu_map) { | ||
| 5983 | struct sched_group *sched_group_allnodes | ||
| 5984 | = sched_group_allnodes_bycpu[cpu]; | ||
| 5985 | struct sched_group **sched_group_nodes | ||
| 5986 | = sched_group_nodes_bycpu[cpu]; | ||
| 5987 | |||
| 5988 | if (sched_group_allnodes) { | ||
| 5989 | kfree(sched_group_allnodes); | ||
| 5990 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
| 5991 | } | ||
| 5992 | |||
| 5993 | if (!sched_group_nodes) | ||
| 5994 | continue; | ||
| 5995 | |||
| 5996 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
| 5997 | cpumask_t nodemask = node_to_cpumask(i); | ||
| 5998 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
| 5999 | |||
| 6000 | cpus_and(nodemask, nodemask, *cpu_map); | ||
| 6001 | if (cpus_empty(nodemask)) | ||
| 6002 | continue; | ||
| 6003 | |||
| 6004 | if (sg == NULL) | ||
| 6005 | continue; | ||
| 6006 | sg = sg->next; | ||
| 6007 | next_sg: | ||
| 6008 | oldsg = sg; | ||
| 6009 | sg = sg->next; | ||
| 6010 | kfree(oldsg); | ||
| 6011 | if (oldsg != sched_group_nodes[i]) | ||
| 6012 | goto next_sg; | ||
| 6013 | } | ||
| 6014 | kfree(sched_group_nodes); | ||
| 6015 | sched_group_nodes_bycpu[cpu] = NULL; | ||
| 6016 | } | ||
| 6017 | #endif | ||
| 6018 | } | 6590 | } |
| 6019 | 6591 | ||
| 6020 | /* | 6592 | /* |
| @@ -6039,9 +6611,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
| 6039 | * correct sched domains | 6611 | * correct sched domains |
| 6040 | * Call with hotplug lock held | 6612 | * Call with hotplug lock held |
| 6041 | */ | 6613 | */ |
| 6042 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | 6614 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) |
| 6043 | { | 6615 | { |
| 6044 | cpumask_t change_map; | 6616 | cpumask_t change_map; |
| 6617 | int err = 0; | ||
| 6045 | 6618 | ||
| 6046 | cpus_and(*partition1, *partition1, cpu_online_map); | 6619 | cpus_and(*partition1, *partition1, cpu_online_map); |
| 6047 | cpus_and(*partition2, *partition2, cpu_online_map); | 6620 | cpus_and(*partition2, *partition2, cpu_online_map); |
| @@ -6050,10 +6623,89 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
| 6050 | /* Detach sched domains from all of the affected cpus */ | 6623 | /* Detach sched domains from all of the affected cpus */ |
| 6051 | detach_destroy_domains(&change_map); | 6624 | detach_destroy_domains(&change_map); |
| 6052 | if (!cpus_empty(*partition1)) | 6625 | if (!cpus_empty(*partition1)) |
| 6053 | build_sched_domains(partition1); | 6626 | err = build_sched_domains(partition1); |
| 6054 | if (!cpus_empty(*partition2)) | 6627 | if (!err && !cpus_empty(*partition2)) |
| 6055 | build_sched_domains(partition2); | 6628 | err = build_sched_domains(partition2); |
| 6629 | |||
| 6630 | return err; | ||
| 6631 | } | ||
| 6632 | |||
| 6633 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
| 6634 | int arch_reinit_sched_domains(void) | ||
| 6635 | { | ||
| 6636 | int err; | ||
| 6637 | |||
| 6638 | lock_cpu_hotplug(); | ||
| 6639 | detach_destroy_domains(&cpu_online_map); | ||
| 6640 | err = arch_init_sched_domains(&cpu_online_map); | ||
| 6641 | unlock_cpu_hotplug(); | ||
| 6642 | |||
| 6643 | return err; | ||
| 6644 | } | ||
| 6645 | |||
| 6646 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
| 6647 | { | ||
| 6648 | int ret; | ||
| 6649 | |||
| 6650 | if (buf[0] != '0' && buf[0] != '1') | ||
| 6651 | return -EINVAL; | ||
| 6652 | |||
| 6653 | if (smt) | ||
| 6654 | sched_smt_power_savings = (buf[0] == '1'); | ||
| 6655 | else | ||
| 6656 | sched_mc_power_savings = (buf[0] == '1'); | ||
| 6657 | |||
| 6658 | ret = arch_reinit_sched_domains(); | ||
| 6659 | |||
| 6660 | return ret ? ret : count; | ||
| 6661 | } | ||
| 6662 | |||
| 6663 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | ||
| 6664 | { | ||
| 6665 | int err = 0; | ||
| 6666 | |||
| 6667 | #ifdef CONFIG_SCHED_SMT | ||
| 6668 | if (smt_capable()) | ||
| 6669 | err = sysfs_create_file(&cls->kset.kobj, | ||
| 6670 | &attr_sched_smt_power_savings.attr); | ||
| 6671 | #endif | ||
| 6672 | #ifdef CONFIG_SCHED_MC | ||
| 6673 | if (!err && mc_capable()) | ||
| 6674 | err = sysfs_create_file(&cls->kset.kobj, | ||
| 6675 | &attr_sched_mc_power_savings.attr); | ||
| 6676 | #endif | ||
| 6677 | return err; | ||
| 6678 | } | ||
| 6679 | #endif | ||
| 6680 | |||
| 6681 | #ifdef CONFIG_SCHED_MC | ||
| 6682 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | ||
| 6683 | { | ||
| 6684 | return sprintf(page, "%u\n", sched_mc_power_savings); | ||
| 6685 | } | ||
| 6686 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, | ||
| 6687 | const char *buf, size_t count) | ||
| 6688 | { | ||
| 6689 | return sched_power_savings_store(buf, count, 0); | ||
| 6690 | } | ||
| 6691 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | ||
| 6692 | sched_mc_power_savings_store); | ||
| 6693 | #endif | ||
| 6694 | |||
| 6695 | #ifdef CONFIG_SCHED_SMT | ||
| 6696 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | ||
| 6697 | { | ||
| 6698 | return sprintf(page, "%u\n", sched_smt_power_savings); | ||
| 6699 | } | ||
| 6700 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, | ||
| 6701 | const char *buf, size_t count) | ||
| 6702 | { | ||
| 6703 | return sched_power_savings_store(buf, count, 1); | ||
| 6056 | } | 6704 | } |
| 6705 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | ||
| 6706 | sched_smt_power_savings_store); | ||
| 6707 | #endif | ||
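
The new sched_mc_power_savings / sched_smt_power_savings attributes are ordinary sysdev files whose store hook accepts only '0' or '1' and then rebuilds the sched domains via arch_reinit_sched_domains(). A small C sketch of toggling the multi-core knob from user space; the /sys path below is the usual location for cpu sysdev attributes but should be treated as an assumption and verified on the target system:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* Assumed path; present only with CONFIG_SCHED_MC on mc-capable hardware. */
        const char *path = "/sys/devices/system/cpu/sched_mc_power_savings";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* A single-character write flips the policy and rebuilds domains. */
        if (write(fd, "1", 1) != 1)
            perror("write");
        close(fd);
        return 0;
    }
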
| 6708 | |||
| 6057 | 6709 | ||
| 6058 | #ifdef CONFIG_HOTPLUG_CPU | 6710 | #ifdef CONFIG_HOTPLUG_CPU |
| 6059 | /* | 6711 | /* |
| @@ -6108,6 +6760,7 @@ int in_sched_functions(unsigned long addr) | |||
| 6108 | { | 6760 | { |
| 6109 | /* Linker adds these: start and end of __sched functions */ | 6761 | /* Linker adds these: start and end of __sched functions */ |
| 6110 | extern char __sched_text_start[], __sched_text_end[]; | 6762 | extern char __sched_text_start[], __sched_text_end[]; |
| 6763 | |||
| 6111 | return in_lock_functions(addr) || | 6764 | return in_lock_functions(addr) || |
| 6112 | (addr >= (unsigned long)__sched_text_start | 6765 | (addr >= (unsigned long)__sched_text_start |
| 6113 | && addr < (unsigned long)__sched_text_end); | 6766 | && addr < (unsigned long)__sched_text_end); |
| @@ -6115,14 +6768,15 @@ int in_sched_functions(unsigned long addr) | |||
| 6115 | 6768 | ||
| 6116 | void __init sched_init(void) | 6769 | void __init sched_init(void) |
| 6117 | { | 6770 | { |
| 6118 | runqueue_t *rq; | ||
| 6119 | int i, j, k; | 6771 | int i, j, k; |
| 6120 | 6772 | ||
| 6121 | for_each_possible_cpu(i) { | 6773 | for_each_possible_cpu(i) { |
| 6122 | prio_array_t *array; | 6774 | struct prio_array *array; |
| 6775 | struct rq *rq; | ||
| 6123 | 6776 | ||
| 6124 | rq = cpu_rq(i); | 6777 | rq = cpu_rq(i); |
| 6125 | spin_lock_init(&rq->lock); | 6778 | spin_lock_init(&rq->lock); |
| 6779 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | ||
| 6126 | rq->nr_running = 0; | 6780 | rq->nr_running = 0; |
| 6127 | rq->active = rq->arrays; | 6781 | rq->active = rq->arrays; |
| 6128 | rq->expired = rq->arrays + 1; | 6782 | rq->expired = rq->arrays + 1; |
| @@ -6134,9 +6788,9 @@ void __init sched_init(void) | |||
| 6134 | rq->cpu_load[j] = 0; | 6788 | rq->cpu_load[j] = 0; |
| 6135 | rq->active_balance = 0; | 6789 | rq->active_balance = 0; |
| 6136 | rq->push_cpu = 0; | 6790 | rq->push_cpu = 0; |
| 6791 | rq->cpu = i; | ||
| 6137 | rq->migration_thread = NULL; | 6792 | rq->migration_thread = NULL; |
| 6138 | INIT_LIST_HEAD(&rq->migration_queue); | 6793 | INIT_LIST_HEAD(&rq->migration_queue); |
| 6139 | rq->cpu = i; | ||
| 6140 | #endif | 6794 | #endif |
| 6141 | atomic_set(&rq->nr_iowait, 0); | 6795 | atomic_set(&rq->nr_iowait, 0); |
| 6142 | 6796 | ||
| @@ -6151,6 +6805,12 @@ void __init sched_init(void) | |||
| 6151 | } | 6805 | } |
| 6152 | } | 6806 | } |
| 6153 | 6807 | ||
| 6808 | set_load_weight(&init_task); | ||
| 6809 | |||
| 6810 | #ifdef CONFIG_RT_MUTEXES | ||
| 6811 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); | ||
| 6812 | #endif | ||
| 6813 | |||
| 6154 | /* | 6814 | /* |
| 6155 | * The boot idle thread does lazy MMU switching as well: | 6815 | * The boot idle thread does lazy MMU switching as well: |
| 6156 | */ | 6816 | */ |
| @@ -6169,7 +6829,7 @@ void __init sched_init(void) | |||
| 6169 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6829 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| 6170 | void __might_sleep(char *file, int line) | 6830 | void __might_sleep(char *file, int line) |
| 6171 | { | 6831 | { |
| 6172 | #if defined(in_atomic) | 6832 | #ifdef in_atomic |
| 6173 | static unsigned long prev_jiffy; /* ratelimiting */ | 6833 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 6174 | 6834 | ||
| 6175 | if ((in_atomic() || irqs_disabled()) && | 6835 | if ((in_atomic() || irqs_disabled()) && |
| @@ -6191,17 +6851,18 @@ EXPORT_SYMBOL(__might_sleep); | |||
| 6191 | #ifdef CONFIG_MAGIC_SYSRQ | 6851 | #ifdef CONFIG_MAGIC_SYSRQ |
| 6192 | void normalize_rt_tasks(void) | 6852 | void normalize_rt_tasks(void) |
| 6193 | { | 6853 | { |
| 6854 | struct prio_array *array; | ||
| 6194 | struct task_struct *p; | 6855 | struct task_struct *p; |
| 6195 | prio_array_t *array; | ||
| 6196 | unsigned long flags; | 6856 | unsigned long flags; |
| 6197 | runqueue_t *rq; | 6857 | struct rq *rq; |
| 6198 | 6858 | ||
| 6199 | read_lock_irq(&tasklist_lock); | 6859 | read_lock_irq(&tasklist_lock); |
| 6200 | for_each_process (p) { | 6860 | for_each_process(p) { |
| 6201 | if (!rt_task(p)) | 6861 | if (!rt_task(p)) |
| 6202 | continue; | 6862 | continue; |
| 6203 | 6863 | ||
| 6204 | rq = task_rq_lock(p, &flags); | 6864 | spin_lock_irqsave(&p->pi_lock, flags); |
| 6865 | rq = __task_rq_lock(p); | ||
| 6205 | 6866 | ||
| 6206 | array = p->array; | 6867 | array = p->array; |
| 6207 | if (array) | 6868 | if (array) |
| @@ -6212,7 +6873,8 @@ void normalize_rt_tasks(void) | |||
| 6212 | resched_task(rq->curr); | 6873 | resched_task(rq->curr); |
| 6213 | } | 6874 | } |
| 6214 | 6875 | ||
| 6215 | task_rq_unlock(rq, &flags); | 6876 | __task_rq_unlock(rq); |
| 6877 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 6216 | } | 6878 | } |
| 6217 | read_unlock_irq(&tasklist_lock); | 6879 | read_unlock_irq(&tasklist_lock); |
| 6218 | } | 6880 | } |
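
normalize_rt_tasks() now takes the task's pi_lock before the runqueue lock (via __task_rq_lock()) instead of using the combined task_rq_lock() helper, presumably to follow the p->pi_lock -> rq->lock ordering that the new rt-mutex priority-inheritance paths rely on. A tiny pthread sketch of why a fixed acquisition order matters; locks A and B are arbitrary stand-ins:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t A = PTHREAD_MUTEX_INITIALIZER;  /* "pi_lock"  */
    static pthread_mutex_t B = PTHREAD_MUTEX_INITIALIZER;  /* "rq->lock" */

    /* Every path takes A then B; mixing A->B with B->A risks deadlock. */
    static void *worker(void *arg)
    {
        pthread_mutex_lock(&A);
        pthread_mutex_lock(&B);
        printf("thread %ld holds A then B\n", (long)arg);
        pthread_mutex_unlock(&B);
        pthread_mutex_unlock(&A);
        return NULL;
    }

    int main(void)
    {
        pthread_t t1, t2;

        pthread_create(&t1, NULL, worker, (void *)1L);
        pthread_create(&t2, NULL, worker, (void *)2L);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
    }
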
| @@ -6236,7 +6898,7 @@ void normalize_rt_tasks(void) | |||
| 6236 | * | 6898 | * |
| 6237 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6899 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
| 6238 | */ | 6900 | */ |
| 6239 | task_t *curr_task(int cpu) | 6901 | struct task_struct *curr_task(int cpu) |
| 6240 | { | 6902 | { |
| 6241 | return cpu_curr(cpu); | 6903 | return cpu_curr(cpu); |
| 6242 | } | 6904 | } |
| @@ -6256,7 +6918,7 @@ task_t *curr_task(int cpu) | |||
| 6256 | * | 6918 | * |
| 6257 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6919 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
| 6258 | */ | 6920 | */ |
| 6259 | void set_curr_task(int cpu, task_t *p) | 6921 | void set_curr_task(int cpu, struct task_struct *p) |
| 6260 | { | 6922 | { |
| 6261 | cpu_curr(cpu) = p; | 6923 | cpu_curr(cpu) = p; |
| 6262 | } | 6924 | } |
diff --git a/kernel/signal.c b/kernel/signal.c index 1b3c921737e2..fb5da6d19f14 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -10,7 +10,6 @@ | |||
| 10 | * to allow signals to be sent reliably. | 10 | * to allow signals to be sent reliably. |
| 11 | */ | 11 | */ |
| 12 | 12 | ||
| 13 | #include <linux/config.h> | ||
| 14 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
| 15 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 16 | #include <linux/smp_lock.h> | 15 | #include <linux/smp_lock.h> |
| @@ -418,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) | |||
| 418 | static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | 417 | static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, |
| 419 | siginfo_t *info) | 418 | siginfo_t *info) |
| 420 | { | 419 | { |
| 421 | int sig = 0; | 420 | int sig = next_signal(pending, mask); |
| 422 | 421 | ||
| 423 | sig = next_signal(pending, mask); | ||
| 424 | if (sig) { | 422 | if (sig) { |
| 425 | if (current->notifier) { | 423 | if (current->notifier) { |
| 426 | if (sigismember(current->notifier_mask, sig)) { | 424 | if (sigismember(current->notifier_mask, sig)) { |
| @@ -433,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | |||
| 433 | 431 | ||
| 434 | if (!collect_signal(sig, pending, info)) | 432 | if (!collect_signal(sig, pending, info)) |
| 435 | sig = 0; | 433 | sig = 0; |
| 436 | |||
| 437 | } | 434 | } |
| 438 | recalc_sigpending(); | ||
| 439 | 435 | ||
| 440 | return sig; | 436 | return sig; |
| 441 | } | 437 | } |
| @@ -452,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 452 | if (!signr) | 448 | if (!signr) |
| 453 | signr = __dequeue_signal(&tsk->signal->shared_pending, | 449 | signr = __dequeue_signal(&tsk->signal->shared_pending, |
| 454 | mask, info); | 450 | mask, info); |
| 451 | recalc_sigpending_tsk(tsk); | ||
| 455 | if (signr && unlikely(sig_kernel_stop(signr))) { | 452 | if (signr && unlikely(sig_kernel_stop(signr))) { |
| 456 | /* | 453 | /* |
| 457 | * Set a marker that we have dequeued a stop signal. Our | 454 | * Set a marker that we have dequeued a stop signal. Our |
| @@ -584,7 +581,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
| 584 | && !capable(CAP_KILL)) | 581 | && !capable(CAP_KILL)) |
| 585 | return error; | 582 | return error; |
| 586 | 583 | ||
| 587 | error = security_task_kill(t, info, sig); | 584 | error = security_task_kill(t, info, sig, 0); |
| 588 | if (!error) | 585 | if (!error) |
| 589 | audit_signal_info(sig, t); /* Let audit system see the signal */ | 586 | audit_signal_info(sig, t); /* Let audit system see the signal */ |
| 590 | return error; | 587 | return error; |
| @@ -792,22 +789,31 @@ out: | |||
| 792 | /* | 789 | /* |
| 793 | * Force a signal that the process can't ignore: if necessary | 790 | * Force a signal that the process can't ignore: if necessary |
| 794 | * we unblock the signal and change any SIG_IGN to SIG_DFL. | 791 | * we unblock the signal and change any SIG_IGN to SIG_DFL. |
| 792 | * | ||
| 793 | * Note: If we unblock the signal, we always reset it to SIG_DFL, | ||
| 794 | * since we do not want to have a signal handler that was blocked | ||
| 795 | * be invoked when user space had explicitly blocked it. | ||
| 796 | * | ||
| 797 | * We don't want to have recursive SIGSEGV's etc, for example. | ||
| 795 | */ | 798 | */ |
| 796 | |||
| 797 | int | 799 | int |
| 798 | force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | 800 | force_sig_info(int sig, struct siginfo *info, struct task_struct *t) |
| 799 | { | 801 | { |
| 800 | unsigned long int flags; | 802 | unsigned long int flags; |
| 801 | int ret; | 803 | int ret, blocked, ignored; |
| 804 | struct k_sigaction *action; | ||
| 802 | 805 | ||
| 803 | spin_lock_irqsave(&t->sighand->siglock, flags); | 806 | spin_lock_irqsave(&t->sighand->siglock, flags); |
| 804 | if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { | 807 | action = &t->sighand->action[sig-1]; |
| 805 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | 808 | ignored = action->sa.sa_handler == SIG_IGN; |
| 806 | } | 809 | blocked = sigismember(&t->blocked, sig); |
| 807 | if (sigismember(&t->blocked, sig)) { | 810 | if (blocked || ignored) { |
| 808 | sigdelset(&t->blocked, sig); | 811 | action->sa.sa_handler = SIG_DFL; |
| 812 | if (blocked) { | ||
| 813 | sigdelset(&t->blocked, sig); | ||
| 814 | recalc_sigpending_tsk(t); | ||
| 815 | } | ||
| 809 | } | 816 | } |
| 810 | recalc_sigpending_tsk(t); | ||
| 811 | ret = specific_send_sig_info(sig, info, t); | 817 | ret = specific_send_sig_info(sig, info, t); |
| 812 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | 818 | spin_unlock_irqrestore(&t->sighand->siglock, flags); |
| 813 | 819 | ||
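
The reworked force_sig_info() resets the handler to SIG_DFL whenever the signal was either blocked or ignored, and recomputes the pending mask only when it actually unblocked something, so a handler that user space had deliberately blocked is never invoked behind its back. Roughly the same "force default disposition, then unblock" sequence can be expressed with the userspace signal API; this is an analogy for a single process, not the kernel path:

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        struct sigaction sa;
        sigset_t unblock;

        /* Force SIGTERM back to its default disposition... */
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = SIG_DFL;
        sigaction(SIGTERM, &sa, NULL);

        /* ...and make sure it is not blocked, so delivery is immediate. */
        sigemptyset(&unblock);
        sigaddset(&unblock, SIGTERM);
        sigprocmask(SIG_UNBLOCK, &unblock, NULL);

        printf("SIGTERM now defaulted and unblocked\n");
        return 0;
    }
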
| @@ -1107,7 +1113,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) | |||
| 1107 | 1113 | ||
| 1108 | /* like kill_proc_info(), but doesn't use uid/euid of "current" */ | 1114 | /* like kill_proc_info(), but doesn't use uid/euid of "current" */ |
| 1109 | int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | 1115 | int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, |
| 1110 | uid_t uid, uid_t euid) | 1116 | uid_t uid, uid_t euid, u32 secid) |
| 1111 | { | 1117 | { |
| 1112 | int ret = -EINVAL; | 1118 | int ret = -EINVAL; |
| 1113 | struct task_struct *p; | 1119 | struct task_struct *p; |
| @@ -1127,6 +1133,9 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | |||
| 1127 | ret = -EPERM; | 1133 | ret = -EPERM; |
| 1128 | goto out_unlock; | 1134 | goto out_unlock; |
| 1129 | } | 1135 | } |
| 1136 | ret = security_task_kill(p, info, sig, secid); | ||
| 1137 | if (ret) | ||
| 1138 | goto out_unlock; | ||
| 1130 | if (sig && p->sighand) { | 1139 | if (sig && p->sighand) { |
| 1131 | unsigned long flags; | 1140 | unsigned long flags; |
| 1132 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1141 | spin_lock_irqsave(&p->sighand->siglock, flags); |
| @@ -1531,6 +1540,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) | |||
| 1531 | spin_unlock_irqrestore(&sighand->siglock, flags); | 1540 | spin_unlock_irqrestore(&sighand->siglock, flags); |
| 1532 | } | 1541 | } |
| 1533 | 1542 | ||
| 1543 | static inline int may_ptrace_stop(void) | ||
| 1544 | { | ||
| 1545 | if (!likely(current->ptrace & PT_PTRACED)) | ||
| 1546 | return 0; | ||
| 1547 | |||
| 1548 | if (unlikely(current->parent == current->real_parent && | ||
| 1549 | (current->ptrace & PT_ATTACHED))) | ||
| 1550 | return 0; | ||
| 1551 | |||
| 1552 | if (unlikely(current->signal == current->parent->signal) && | ||
| 1553 | unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) | ||
| 1554 | return 0; | ||
| 1555 | |||
| 1556 | /* | ||
| 1557 | * Are we in the middle of do_coredump? | ||
| 1558 | * If so and our tracer is also part of the coredump stopping | ||
| 1559 | * is a deadlock situation, and pointless because our tracer | ||
| 1560 | * is dead so don't allow us to stop. | ||
| 1561 | * If SIGKILL was already sent before the caller unlocked | ||
| 1562 | * ->siglock we must see ->core_waiters != 0. Otherwise it | ||
| 1563 | * is safe to enter schedule(). | ||
| 1564 | */ | ||
| 1565 | if (unlikely(current->mm->core_waiters) && | ||
| 1566 | unlikely(current->mm == current->parent->mm)) | ||
| 1567 | return 0; | ||
| 1568 | |||
| 1569 | return 1; | ||
| 1570 | } | ||
| 1571 | |||
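
may_ptrace_stop() pulls a four-clause condition out of ptrace_stop() into a predicate built from early returns, and adds the coredump check that stops a tracee from sleeping forever when its tracer shares the same dying mm. The refactor itself is the familiar "guard clauses instead of one boolean expression" move; a generic sketch where the conditions are placeholders, not the real checks:

    #include <stdbool.h>
    #include <stdio.h>

    struct state { bool traced, self_attached, group_exiting, dumping_core; };

    /* Each disqualifying case gets its own early return, so adding a new
     * one (like the coredump test) is a two-line change. */
    static bool may_stop(const struct state *s)
    {
        if (!s->traced)
            return false;
        if (s->self_attached)
            return false;
        if (s->group_exiting)
            return false;
        if (s->dumping_core)
            return false;
        return true;
    }

    int main(void)
    {
        struct state s = { .traced = true };

        printf("may stop: %s\n", may_stop(&s) ? "yes" : "no");
        return 0;
    }
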
| 1534 | /* | 1572 | /* |
| 1535 | * This must be called with current->sighand->siglock held. | 1573 | * This must be called with current->sighand->siglock held. |
| 1536 | * | 1574 | * |
| @@ -1559,11 +1597,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) | |||
| 1559 | spin_unlock_irq(¤t->sighand->siglock); | 1597 | spin_unlock_irq(¤t->sighand->siglock); |
| 1560 | try_to_freeze(); | 1598 | try_to_freeze(); |
| 1561 | read_lock(&tasklist_lock); | 1599 | read_lock(&tasklist_lock); |
| 1562 | if (likely(current->ptrace & PT_PTRACED) && | 1600 | if (may_ptrace_stop()) { |
| 1563 | likely(current->parent != current->real_parent || | ||
| 1564 | !(current->ptrace & PT_ATTACHED)) && | ||
| 1565 | (likely(current->parent->signal != current->signal) || | ||
| 1566 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { | ||
| 1567 | do_notify_parent_cldstop(current, CLD_TRAPPED); | 1601 | do_notify_parent_cldstop(current, CLD_TRAPPED); |
| 1568 | read_unlock(&tasklist_lock); | 1602 | read_unlock(&tasklist_lock); |
| 1569 | schedule(); | 1603 | schedule(); |
| @@ -2541,6 +2575,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) | |||
| 2541 | } | 2575 | } |
| 2542 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ | 2576 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ |
| 2543 | 2577 | ||
| 2578 | __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) | ||
| 2579 | { | ||
| 2580 | return NULL; | ||
| 2581 | } | ||
| 2582 | |||
| 2544 | void __init signals_init(void) | 2583 | void __init signals_init(void) |
| 2545 | { | 2584 | { |
| 2546 | sigqueue_cachep = | 2585 | sigqueue_cachep = |
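The signal.c hunks above add a default arch_vma_name() as a weak symbol that returns NULL, so an architecture that wants named special mappings can simply provide its own non-weak definition. A minimal sketch of such an override follows; vdso_text_start and the "[vdso]" label are illustrative assumptions, not code from this series.

#include <linux/mm.h>

/* hypothetical per-arch variable marking the vDSO mapping */
extern unsigned long vdso_text_start;

const char *arch_vma_name(struct vm_area_struct *vma)
{
        /* a non-weak definition like this one overrides the weak default */
        if (vma->vm_start == vdso_text_start)
                return "[vdso]";
        return NULL;
}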
diff --git a/kernel/softirq.c b/kernel/softirq.c index 336f92d64e2e..bf25015dce16 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -62,6 +62,137 @@ static inline void wakeup_softirqd(void) | |||
| 62 | } | 62 | } |
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| 65 | * This one is for softirq.c-internal use, | ||
| 66 | * where hardirqs are disabled legitimately: | ||
| 67 | */ | ||
| 68 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 69 | static void __local_bh_disable(unsigned long ip) | ||
| 70 | { | ||
| 71 | unsigned long flags; | ||
| 72 | |||
| 73 | WARN_ON_ONCE(in_irq()); | ||
| 74 | |||
| 75 | raw_local_irq_save(flags); | ||
| 76 | add_preempt_count(SOFTIRQ_OFFSET); | ||
| 77 | /* | ||
| 78 | * Were softirqs turned off above: | ||
| 79 | */ | ||
| 80 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
| 81 | trace_softirqs_off(ip); | ||
| 82 | raw_local_irq_restore(flags); | ||
| 83 | } | ||
| 84 | #else /* !CONFIG_TRACE_IRQFLAGS */ | ||
| 85 | static inline void __local_bh_disable(unsigned long ip) | ||
| 86 | { | ||
| 87 | add_preempt_count(SOFTIRQ_OFFSET); | ||
| 88 | barrier(); | ||
| 89 | } | ||
| 90 | #endif /* CONFIG_TRACE_IRQFLAGS */ | ||
| 91 | |||
| 92 | void local_bh_disable(void) | ||
| 93 | { | ||
| 94 | __local_bh_disable((unsigned long)__builtin_return_address(0)); | ||
| 95 | } | ||
| 96 | |||
| 97 | EXPORT_SYMBOL(local_bh_disable); | ||
| 98 | |||
| 99 | void __local_bh_enable(void) | ||
| 100 | { | ||
| 101 | WARN_ON_ONCE(in_irq()); | ||
| 102 | |||
| 103 | /* | ||
| 104 | * softirqs should never be enabled by __local_bh_enable(), | ||
| 105 | * it always nests inside local_bh_enable() sections: | ||
| 106 | */ | ||
| 107 | WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); | ||
| 108 | |||
| 109 | sub_preempt_count(SOFTIRQ_OFFSET); | ||
| 110 | } | ||
| 111 | EXPORT_SYMBOL_GPL(__local_bh_enable); | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Special-case - softirqs can safely be enabled in | ||
| 115 | * cond_resched_softirq(), or by __do_softirq(), | ||
| 116 | * without processing still-pending softirqs: | ||
| 117 | */ | ||
| 118 | void _local_bh_enable(void) | ||
| 119 | { | ||
| 120 | WARN_ON_ONCE(in_irq()); | ||
| 121 | WARN_ON_ONCE(!irqs_disabled()); | ||
| 122 | |||
| 123 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
| 124 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | ||
| 125 | sub_preempt_count(SOFTIRQ_OFFSET); | ||
| 126 | } | ||
| 127 | |||
| 128 | EXPORT_SYMBOL(_local_bh_enable); | ||
| 129 | |||
| 130 | void local_bh_enable(void) | ||
| 131 | { | ||
| 132 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 133 | unsigned long flags; | ||
| 134 | |||
| 135 | WARN_ON_ONCE(in_irq()); | ||
| 136 | #endif | ||
| 137 | WARN_ON_ONCE(irqs_disabled()); | ||
| 138 | |||
| 139 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 140 | local_irq_save(flags); | ||
| 141 | #endif | ||
| 142 | /* | ||
| 143 | * Are softirqs going to be turned on now: | ||
| 144 | */ | ||
| 145 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
| 146 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | ||
| 147 | /* | ||
| 148 | * Keep preemption disabled until we are done with | ||
| 149 | * softirq processing: | ||
| 150 | */ | ||
| 151 | sub_preempt_count(SOFTIRQ_OFFSET - 1); | ||
| 152 | |||
| 153 | if (unlikely(!in_interrupt() && local_softirq_pending())) | ||
| 154 | do_softirq(); | ||
| 155 | |||
| 156 | dec_preempt_count(); | ||
| 157 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 158 | local_irq_restore(flags); | ||
| 159 | #endif | ||
| 160 | preempt_check_resched(); | ||
| 161 | } | ||
| 162 | EXPORT_SYMBOL(local_bh_enable); | ||
| 163 | |||
| 164 | void local_bh_enable_ip(unsigned long ip) | ||
| 165 | { | ||
| 166 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 167 | unsigned long flags; | ||
| 168 | |||
| 169 | WARN_ON_ONCE(in_irq()); | ||
| 170 | |||
| 171 | local_irq_save(flags); | ||
| 172 | #endif | ||
| 173 | /* | ||
| 174 | * Are softirqs going to be turned on now: | ||
| 175 | */ | ||
| 176 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
| 177 | trace_softirqs_on(ip); | ||
| 178 | /* | ||
| 179 | * Keep preemption disabled until we are done with | ||
| 180 | * softirq processing: | ||
| 181 | */ | ||
| 182 | sub_preempt_count(SOFTIRQ_OFFSET - 1); | ||
| 183 | |||
| 184 | if (unlikely(!in_interrupt() && local_softirq_pending())) | ||
| 185 | do_softirq(); | ||
| 186 | |||
| 187 | dec_preempt_count(); | ||
| 188 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
| 189 | local_irq_restore(flags); | ||
| 190 | #endif | ||
| 191 | preempt_check_resched(); | ||
| 192 | } | ||
| 193 | EXPORT_SYMBOL(local_bh_enable_ip); | ||
| 194 | |||
| 195 | /* | ||
| 65 | * We restart softirq processing MAX_SOFTIRQ_RESTART times, | 196 | * We restart softirq processing MAX_SOFTIRQ_RESTART times, |
| 66 | * and we fall back to softirqd after that. | 197 | * and we fall back to softirqd after that. |
| 67 | * | 198 | * |
| @@ -80,8 +211,11 @@ asmlinkage void __do_softirq(void) | |||
| 80 | int cpu; | 211 | int cpu; |
| 81 | 212 | ||
| 82 | pending = local_softirq_pending(); | 213 | pending = local_softirq_pending(); |
| 214 | account_system_vtime(current); | ||
| 215 | |||
| 216 | __local_bh_disable((unsigned long)__builtin_return_address(0)); | ||
| 217 | trace_softirq_enter(); | ||
| 83 | 218 | ||
| 84 | local_bh_disable(); | ||
| 85 | cpu = smp_processor_id(); | 219 | cpu = smp_processor_id(); |
| 86 | restart: | 220 | restart: |
| 87 | /* Reset the pending bitmask before enabling irqs */ | 221 | /* Reset the pending bitmask before enabling irqs */ |
| @@ -109,7 +243,10 @@ restart: | |||
| 109 | if (pending) | 243 | if (pending) |
| 110 | wakeup_softirqd(); | 244 | wakeup_softirqd(); |
| 111 | 245 | ||
| 112 | __local_bh_enable(); | 246 | trace_softirq_exit(); |
| 247 | |||
| 248 | account_system_vtime(current); | ||
| 249 | _local_bh_enable(); | ||
| 113 | } | 250 | } |
| 114 | 251 | ||
| 115 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 252 | #ifndef __ARCH_HAS_DO_SOFTIRQ |
| @@ -136,23 +273,6 @@ EXPORT_SYMBOL(do_softirq); | |||
| 136 | 273 | ||
| 137 | #endif | 274 | #endif |
| 138 | 275 | ||
| 139 | void local_bh_enable(void) | ||
| 140 | { | ||
| 141 | WARN_ON(irqs_disabled()); | ||
| 142 | /* | ||
| 143 | * Keep preemption disabled until we are done with | ||
| 144 | * softirq processing: | ||
| 145 | */ | ||
| 146 | sub_preempt_count(SOFTIRQ_OFFSET - 1); | ||
| 147 | |||
| 148 | if (unlikely(!in_interrupt() && local_softirq_pending())) | ||
| 149 | do_softirq(); | ||
| 150 | |||
| 151 | dec_preempt_count(); | ||
| 152 | preempt_check_resched(); | ||
| 153 | } | ||
| 154 | EXPORT_SYMBOL(local_bh_enable); | ||
| 155 | |||
| 156 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | 276 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED |
| 157 | # define invoke_softirq() __do_softirq() | 277 | # define invoke_softirq() __do_softirq() |
| 158 | #else | 278 | #else |
| @@ -165,6 +285,7 @@ EXPORT_SYMBOL(local_bh_enable); | |||
| 165 | void irq_exit(void) | 285 | void irq_exit(void) |
| 166 | { | 286 | { |
| 167 | account_system_vtime(current); | 287 | account_system_vtime(current); |
| 288 | trace_hardirq_exit(); | ||
| 168 | sub_preempt_count(IRQ_EXIT_OFFSET); | 289 | sub_preempt_count(IRQ_EXIT_OFFSET); |
| 169 | if (!in_interrupt() && local_softirq_pending()) | 290 | if (!in_interrupt() && local_softirq_pending()) |
| 170 | invoke_softirq(); | 291 | invoke_softirq(); |
| @@ -208,8 +329,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) | |||
| 208 | softirq_vec[nr].action = action; | 329 | softirq_vec[nr].action = action; |
| 209 | } | 330 | } |
| 210 | 331 | ||
| 211 | EXPORT_SYMBOL(open_softirq); | ||
| 212 | |||
| 213 | /* Tasklets */ | 332 | /* Tasklets */ |
| 214 | struct tasklet_head | 333 | struct tasklet_head |
| 215 | { | 334 | { |
| @@ -446,7 +565,7 @@ static void takeover_tasklets(unsigned int cpu) | |||
| 446 | } | 565 | } |
| 447 | #endif /* CONFIG_HOTPLUG_CPU */ | 566 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 448 | 567 | ||
| 449 | static int cpu_callback(struct notifier_block *nfb, | 568 | static int __cpuinit cpu_callback(struct notifier_block *nfb, |
| 450 | unsigned long action, | 569 | unsigned long action, |
| 451 | void *hcpu) | 570 | void *hcpu) |
| 452 | { | 571 | { |
| @@ -470,6 +589,8 @@ static int cpu_callback(struct notifier_block *nfb, | |||
| 470 | break; | 589 | break; |
| 471 | #ifdef CONFIG_HOTPLUG_CPU | 590 | #ifdef CONFIG_HOTPLUG_CPU |
| 472 | case CPU_UP_CANCELED: | 591 | case CPU_UP_CANCELED: |
| 592 | if (!per_cpu(ksoftirqd, hotcpu)) | ||
| 593 | break; | ||
| 473 | /* Unbind so it can run. Fall thru. */ | 594 | /* Unbind so it can run. Fall thru. */ |
| 474 | kthread_bind(per_cpu(ksoftirqd, hotcpu), | 595 | kthread_bind(per_cpu(ksoftirqd, hotcpu), |
| 475 | any_online_cpu(cpu_online_map)); | 596 | any_online_cpu(cpu_online_map)); |
| @@ -484,14 +605,16 @@ static int cpu_callback(struct notifier_block *nfb, | |||
| 484 | return NOTIFY_OK; | 605 | return NOTIFY_OK; |
| 485 | } | 606 | } |
| 486 | 607 | ||
| 487 | static struct notifier_block cpu_nfb = { | 608 | static struct notifier_block __cpuinitdata cpu_nfb = { |
| 488 | .notifier_call = cpu_callback | 609 | .notifier_call = cpu_callback |
| 489 | }; | 610 | }; |
| 490 | 611 | ||
| 491 | __init int spawn_ksoftirqd(void) | 612 | __init int spawn_ksoftirqd(void) |
| 492 | { | 613 | { |
| 493 | void *cpu = (void *)(long)smp_processor_id(); | 614 | void *cpu = (void *)(long)smp_processor_id(); |
| 494 | cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 615 | int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); |
| 616 | |||
| 617 | BUG_ON(err == NOTIFY_BAD); | ||
| 495 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 618 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
| 496 | register_cpu_notifier(&cpu_nfb); | 619 | register_cpu_notifier(&cpu_nfb); |
| 497 | return 0; | 620 | return 0; |
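With the softirq.c rework above, local_bh_disable()/local_bh_enable() track nesting through SOFTIRQ_OFFSET in the preempt count, report on/off transitions to the irq-flags tracer, and the final enable runs any pending softirqs. A hedged sketch of the usual pairing in driver-style code follows; struct my_dev and its lists are illustrative only.

#include <linux/list.h>
#include <linux/interrupt.h>

/* illustrative structure shared with a tasklet, not from this patch */
struct my_dev {
        struct list_head entries;       /* appended to from softirq context */
        struct list_head done;
};

static void my_dev_flush(struct my_dev *dev)
{
        local_bh_disable();     /* adds SOFTIRQ_OFFSET to the preempt count */
        list_splice_init(&dev->entries, &dev->done);
        local_bh_enable();      /* drops it and may run pending softirqs */
}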
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 14c7faf02909..50afeb813305 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
| @@ -36,7 +36,7 @@ static struct notifier_block panic_block = { | |||
| 36 | 36 | ||
| 37 | void touch_softlockup_watchdog(void) | 37 | void touch_softlockup_watchdog(void) |
| 38 | { | 38 | { |
| 39 | per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; | 39 | __raw_get_cpu_var(touch_timestamp) = jiffies; |
| 40 | } | 40 | } |
| 41 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 41 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
| 42 | 42 | ||
| @@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) | |||
| 104 | /* | 104 | /* |
| 105 | * Create/destroy watchdog threads as CPUs come and go: | 105 | * Create/destroy watchdog threads as CPUs come and go: |
| 106 | */ | 106 | */ |
| 107 | static int | 107 | static int __cpuinit |
| 108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
| 109 | { | 109 | { |
| 110 | int hotcpu = (unsigned long)hcpu; | 110 | int hotcpu = (unsigned long)hcpu; |
| @@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 127 | break; | 127 | break; |
| 128 | #ifdef CONFIG_HOTPLUG_CPU | 128 | #ifdef CONFIG_HOTPLUG_CPU |
| 129 | case CPU_UP_CANCELED: | 129 | case CPU_UP_CANCELED: |
| 130 | if (!per_cpu(watchdog_task, hotcpu)) | ||
| 131 | break; | ||
| 130 | /* Unbind so it can run. Fall thru. */ | 132 | /* Unbind so it can run. Fall thru. */ |
| 131 | kthread_bind(per_cpu(watchdog_task, hotcpu), | 133 | kthread_bind(per_cpu(watchdog_task, hotcpu), |
| 132 | any_online_cpu(cpu_online_map)); | 134 | any_online_cpu(cpu_online_map)); |
| @@ -140,15 +142,16 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 140 | return NOTIFY_OK; | 142 | return NOTIFY_OK; |
| 141 | } | 143 | } |
| 142 | 144 | ||
| 143 | static struct notifier_block cpu_nfb = { | 145 | static struct notifier_block __cpuinitdata cpu_nfb = { |
| 144 | .notifier_call = cpu_callback | 146 | .notifier_call = cpu_callback |
| 145 | }; | 147 | }; |
| 146 | 148 | ||
| 147 | __init void spawn_softlockup_task(void) | 149 | __init void spawn_softlockup_task(void) |
| 148 | { | 150 | { |
| 149 | void *cpu = (void *)(long)smp_processor_id(); | 151 | void *cpu = (void *)(long)smp_processor_id(); |
| 152 | int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
| 150 | 153 | ||
| 151 | cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 154 | BUG_ON(err == NOTIFY_BAD); |
| 152 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | 155 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); |
| 153 | register_cpu_notifier(&cpu_nfb); | 156 | register_cpu_notifier(&cpu_nfb); |
| 154 | 157 | ||
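The softlockup watchdog above reports a CPU whose touch_timestamp stops advancing, so code that intentionally keeps a CPU busy for a long stretch is expected to call touch_softlockup_watchdog() periodically. A small sketch, with the polling helpers as stand-ins for real work:

#include <linux/sched.h>        /* touch_softlockup_watchdog() is declared here */

/* poll_done() and poll_step() are illustrative placeholders */
static void slow_hardware_poll(void)
{
        while (!poll_done()) {
                poll_step();
                touch_softlockup_watchdog();    /* refresh this CPU's touch_timestamp */
                cpu_relax();
        }
}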
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index d1b810782bc4..d48143eafbfd 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
| @@ -7,31 +7,27 @@ | |||
| 7 | * | 7 | * |
| 8 | * This file contains the spinlock/rwlock implementations for the | 8 | * This file contains the spinlock/rwlock implementations for the |
| 9 | * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) | 9 | * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) |
| 10 | * | ||
| 11 | * Note that some architectures have special knowledge about the | ||
| 12 | * stack frames of these functions in their profile_pc. If you | ||
| 13 | * change anything significant here that could change the stack | ||
| 14 | * frame, contact the architecture maintainers. | ||
| 10 | */ | 15 | */ |
| 11 | 16 | ||
| 12 | #include <linux/config.h> | ||
| 13 | #include <linux/linkage.h> | 17 | #include <linux/linkage.h> |
| 14 | #include <linux/preempt.h> | 18 | #include <linux/preempt.h> |
| 15 | #include <linux/spinlock.h> | 19 | #include <linux/spinlock.h> |
| 16 | #include <linux/interrupt.h> | 20 | #include <linux/interrupt.h> |
| 21 | #include <linux/debug_locks.h> | ||
| 17 | #include <linux/module.h> | 22 | #include <linux/module.h> |
| 18 | 23 | ||
| 19 | /* | ||
| 20 | * Generic declaration of the raw read_trylock() function, | ||
| 21 | * architectures are supposed to optimize this: | ||
| 22 | */ | ||
| 23 | int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock) | ||
| 24 | { | ||
| 25 | __raw_read_lock(lock); | ||
| 26 | return 1; | ||
| 27 | } | ||
| 28 | EXPORT_SYMBOL(generic__raw_read_trylock); | ||
| 29 | |||
| 30 | int __lockfunc _spin_trylock(spinlock_t *lock) | 24 | int __lockfunc _spin_trylock(spinlock_t *lock) |
| 31 | { | 25 | { |
| 32 | preempt_disable(); | 26 | preempt_disable(); |
| 33 | if (_raw_spin_trylock(lock)) | 27 | if (_raw_spin_trylock(lock)) { |
| 28 | spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); | ||
| 34 | return 1; | 29 | return 1; |
| 30 | } | ||
| 35 | 31 | ||
| 36 | preempt_enable(); | 32 | preempt_enable(); |
| 37 | return 0; | 33 | return 0; |
| @@ -41,8 +37,10 @@ EXPORT_SYMBOL(_spin_trylock); | |||
| 41 | int __lockfunc _read_trylock(rwlock_t *lock) | 37 | int __lockfunc _read_trylock(rwlock_t *lock) |
| 42 | { | 38 | { |
| 43 | preempt_disable(); | 39 | preempt_disable(); |
| 44 | if (_raw_read_trylock(lock)) | 40 | if (_raw_read_trylock(lock)) { |
| 41 | rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); | ||
| 45 | return 1; | 42 | return 1; |
| 43 | } | ||
| 46 | 44 | ||
| 47 | preempt_enable(); | 45 | preempt_enable(); |
| 48 | return 0; | 46 | return 0; |
| @@ -52,19 +50,28 @@ EXPORT_SYMBOL(_read_trylock); | |||
| 52 | int __lockfunc _write_trylock(rwlock_t *lock) | 50 | int __lockfunc _write_trylock(rwlock_t *lock) |
| 53 | { | 51 | { |
| 54 | preempt_disable(); | 52 | preempt_disable(); |
| 55 | if (_raw_write_trylock(lock)) | 53 | if (_raw_write_trylock(lock)) { |
| 54 | rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); | ||
| 56 | return 1; | 55 | return 1; |
| 56 | } | ||
| 57 | 57 | ||
| 58 | preempt_enable(); | 58 | preempt_enable(); |
| 59 | return 0; | 59 | return 0; |
| 60 | } | 60 | } |
| 61 | EXPORT_SYMBOL(_write_trylock); | 61 | EXPORT_SYMBOL(_write_trylock); |
| 62 | 62 | ||
| 63 | #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) | 63 | /* |
| 64 | * If lockdep is enabled then we use the non-preemption spin-ops | ||
| 65 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are | ||
| 66 | * not re-enabled during lock-acquire (which the preempt-spin-ops do): | ||
| 67 | */ | ||
| 68 | #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ | ||
| 69 | defined(CONFIG_DEBUG_LOCK_ALLOC) | ||
| 64 | 70 | ||
| 65 | void __lockfunc _read_lock(rwlock_t *lock) | 71 | void __lockfunc _read_lock(rwlock_t *lock) |
| 66 | { | 72 | { |
| 67 | preempt_disable(); | 73 | preempt_disable(); |
| 74 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 68 | _raw_read_lock(lock); | 75 | _raw_read_lock(lock); |
| 69 | } | 76 | } |
| 70 | EXPORT_SYMBOL(_read_lock); | 77 | EXPORT_SYMBOL(_read_lock); |
| @@ -75,7 +82,17 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) | |||
| 75 | 82 | ||
| 76 | local_irq_save(flags); | 83 | local_irq_save(flags); |
| 77 | preempt_disable(); | 84 | preempt_disable(); |
| 85 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 86 | /* | ||
| 87 | * On lockdep we don't want the hand-coded irq-enable of | ||
| 88 | * _raw_spin_lock_flags() code, because lockdep assumes | ||
| 89 | * that interrupts are not re-enabled during lock-acquire: | ||
| 90 | */ | ||
| 91 | #ifdef CONFIG_PROVE_LOCKING | ||
| 92 | _raw_spin_lock(lock); | ||
| 93 | #else | ||
| 78 | _raw_spin_lock_flags(lock, &flags); | 94 | _raw_spin_lock_flags(lock, &flags); |
| 95 | #endif | ||
| 79 | return flags; | 96 | return flags; |
| 80 | } | 97 | } |
| 81 | EXPORT_SYMBOL(_spin_lock_irqsave); | 98 | EXPORT_SYMBOL(_spin_lock_irqsave); |
| @@ -84,6 +101,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock) | |||
| 84 | { | 101 | { |
| 85 | local_irq_disable(); | 102 | local_irq_disable(); |
| 86 | preempt_disable(); | 103 | preempt_disable(); |
| 104 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 87 | _raw_spin_lock(lock); | 105 | _raw_spin_lock(lock); |
| 88 | } | 106 | } |
| 89 | EXPORT_SYMBOL(_spin_lock_irq); | 107 | EXPORT_SYMBOL(_spin_lock_irq); |
| @@ -92,6 +110,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock) | |||
| 92 | { | 110 | { |
| 93 | local_bh_disable(); | 111 | local_bh_disable(); |
| 94 | preempt_disable(); | 112 | preempt_disable(); |
| 113 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 95 | _raw_spin_lock(lock); | 114 | _raw_spin_lock(lock); |
| 96 | } | 115 | } |
| 97 | EXPORT_SYMBOL(_spin_lock_bh); | 116 | EXPORT_SYMBOL(_spin_lock_bh); |
| @@ -102,6 +121,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) | |||
| 102 | 121 | ||
| 103 | local_irq_save(flags); | 122 | local_irq_save(flags); |
| 104 | preempt_disable(); | 123 | preempt_disable(); |
| 124 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 105 | _raw_read_lock(lock); | 125 | _raw_read_lock(lock); |
| 106 | return flags; | 126 | return flags; |
| 107 | } | 127 | } |
| @@ -111,6 +131,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock) | |||
| 111 | { | 131 | { |
| 112 | local_irq_disable(); | 132 | local_irq_disable(); |
| 113 | preempt_disable(); | 133 | preempt_disable(); |
| 134 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 114 | _raw_read_lock(lock); | 135 | _raw_read_lock(lock); |
| 115 | } | 136 | } |
| 116 | EXPORT_SYMBOL(_read_lock_irq); | 137 | EXPORT_SYMBOL(_read_lock_irq); |
| @@ -119,6 +140,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock) | |||
| 119 | { | 140 | { |
| 120 | local_bh_disable(); | 141 | local_bh_disable(); |
| 121 | preempt_disable(); | 142 | preempt_disable(); |
| 143 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 122 | _raw_read_lock(lock); | 144 | _raw_read_lock(lock); |
| 123 | } | 145 | } |
| 124 | EXPORT_SYMBOL(_read_lock_bh); | 146 | EXPORT_SYMBOL(_read_lock_bh); |
| @@ -129,6 +151,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) | |||
| 129 | 151 | ||
| 130 | local_irq_save(flags); | 152 | local_irq_save(flags); |
| 131 | preempt_disable(); | 153 | preempt_disable(); |
| 154 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 132 | _raw_write_lock(lock); | 155 | _raw_write_lock(lock); |
| 133 | return flags; | 156 | return flags; |
| 134 | } | 157 | } |
| @@ -138,6 +161,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock) | |||
| 138 | { | 161 | { |
| 139 | local_irq_disable(); | 162 | local_irq_disable(); |
| 140 | preempt_disable(); | 163 | preempt_disable(); |
| 164 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 141 | _raw_write_lock(lock); | 165 | _raw_write_lock(lock); |
| 142 | } | 166 | } |
| 143 | EXPORT_SYMBOL(_write_lock_irq); | 167 | EXPORT_SYMBOL(_write_lock_irq); |
| @@ -146,6 +170,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock) | |||
| 146 | { | 170 | { |
| 147 | local_bh_disable(); | 171 | local_bh_disable(); |
| 148 | preempt_disable(); | 172 | preempt_disable(); |
| 173 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 149 | _raw_write_lock(lock); | 174 | _raw_write_lock(lock); |
| 150 | } | 175 | } |
| 151 | EXPORT_SYMBOL(_write_lock_bh); | 176 | EXPORT_SYMBOL(_write_lock_bh); |
| @@ -153,6 +178,7 @@ EXPORT_SYMBOL(_write_lock_bh); | |||
| 153 | void __lockfunc _spin_lock(spinlock_t *lock) | 178 | void __lockfunc _spin_lock(spinlock_t *lock) |
| 154 | { | 179 | { |
| 155 | preempt_disable(); | 180 | preempt_disable(); |
| 181 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 156 | _raw_spin_lock(lock); | 182 | _raw_spin_lock(lock); |
| 157 | } | 183 | } |
| 158 | 184 | ||
| @@ -161,6 +187,7 @@ EXPORT_SYMBOL(_spin_lock); | |||
| 161 | void __lockfunc _write_lock(rwlock_t *lock) | 187 | void __lockfunc _write_lock(rwlock_t *lock) |
| 162 | { | 188 | { |
| 163 | preempt_disable(); | 189 | preempt_disable(); |
| 190 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
| 164 | _raw_write_lock(lock); | 191 | _raw_write_lock(lock); |
| 165 | } | 192 | } |
| 166 | 193 | ||
| @@ -256,8 +283,22 @@ BUILD_LOCK_OPS(write, rwlock); | |||
| 256 | 283 | ||
| 257 | #endif /* CONFIG_PREEMPT */ | 284 | #endif /* CONFIG_PREEMPT */ |
| 258 | 285 | ||
| 286 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 287 | |||
| 288 | void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) | ||
| 289 | { | ||
| 290 | preempt_disable(); | ||
| 291 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | ||
| 292 | _raw_spin_lock(lock); | ||
| 293 | } | ||
| 294 | |||
| 295 | EXPORT_SYMBOL(_spin_lock_nested); | ||
| 296 | |||
| 297 | #endif | ||
| 298 | |||
| 259 | void __lockfunc _spin_unlock(spinlock_t *lock) | 299 | void __lockfunc _spin_unlock(spinlock_t *lock) |
| 260 | { | 300 | { |
| 301 | spin_release(&lock->dep_map, 1, _RET_IP_); | ||
| 261 | _raw_spin_unlock(lock); | 302 | _raw_spin_unlock(lock); |
| 262 | preempt_enable(); | 303 | preempt_enable(); |
| 263 | } | 304 | } |
| @@ -265,6 +306,7 @@ EXPORT_SYMBOL(_spin_unlock); | |||
| 265 | 306 | ||
| 266 | void __lockfunc _write_unlock(rwlock_t *lock) | 307 | void __lockfunc _write_unlock(rwlock_t *lock) |
| 267 | { | 308 | { |
| 309 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
| 268 | _raw_write_unlock(lock); | 310 | _raw_write_unlock(lock); |
| 269 | preempt_enable(); | 311 | preempt_enable(); |
| 270 | } | 312 | } |
| @@ -272,6 +314,7 @@ EXPORT_SYMBOL(_write_unlock); | |||
| 272 | 314 | ||
| 273 | void __lockfunc _read_unlock(rwlock_t *lock) | 315 | void __lockfunc _read_unlock(rwlock_t *lock) |
| 274 | { | 316 | { |
| 317 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
| 275 | _raw_read_unlock(lock); | 318 | _raw_read_unlock(lock); |
| 276 | preempt_enable(); | 319 | preempt_enable(); |
| 277 | } | 320 | } |
| @@ -279,6 +322,7 @@ EXPORT_SYMBOL(_read_unlock); | |||
| 279 | 322 | ||
| 280 | void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) | 323 | void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) |
| 281 | { | 324 | { |
| 325 | spin_release(&lock->dep_map, 1, _RET_IP_); | ||
| 282 | _raw_spin_unlock(lock); | 326 | _raw_spin_unlock(lock); |
| 283 | local_irq_restore(flags); | 327 | local_irq_restore(flags); |
| 284 | preempt_enable(); | 328 | preempt_enable(); |
| @@ -287,6 +331,7 @@ EXPORT_SYMBOL(_spin_unlock_irqrestore); | |||
| 287 | 331 | ||
| 288 | void __lockfunc _spin_unlock_irq(spinlock_t *lock) | 332 | void __lockfunc _spin_unlock_irq(spinlock_t *lock) |
| 289 | { | 333 | { |
| 334 | spin_release(&lock->dep_map, 1, _RET_IP_); | ||
| 290 | _raw_spin_unlock(lock); | 335 | _raw_spin_unlock(lock); |
| 291 | local_irq_enable(); | 336 | local_irq_enable(); |
| 292 | preempt_enable(); | 337 | preempt_enable(); |
| @@ -295,14 +340,16 @@ EXPORT_SYMBOL(_spin_unlock_irq); | |||
| 295 | 340 | ||
| 296 | void __lockfunc _spin_unlock_bh(spinlock_t *lock) | 341 | void __lockfunc _spin_unlock_bh(spinlock_t *lock) |
| 297 | { | 342 | { |
| 343 | spin_release(&lock->dep_map, 1, _RET_IP_); | ||
| 298 | _raw_spin_unlock(lock); | 344 | _raw_spin_unlock(lock); |
| 299 | preempt_enable_no_resched(); | 345 | preempt_enable_no_resched(); |
| 300 | local_bh_enable(); | 346 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
| 301 | } | 347 | } |
| 302 | EXPORT_SYMBOL(_spin_unlock_bh); | 348 | EXPORT_SYMBOL(_spin_unlock_bh); |
| 303 | 349 | ||
| 304 | void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) | 350 | void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) |
| 305 | { | 351 | { |
| 352 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
| 306 | _raw_read_unlock(lock); | 353 | _raw_read_unlock(lock); |
| 307 | local_irq_restore(flags); | 354 | local_irq_restore(flags); |
| 308 | preempt_enable(); | 355 | preempt_enable(); |
| @@ -311,6 +358,7 @@ EXPORT_SYMBOL(_read_unlock_irqrestore); | |||
| 311 | 358 | ||
| 312 | void __lockfunc _read_unlock_irq(rwlock_t *lock) | 359 | void __lockfunc _read_unlock_irq(rwlock_t *lock) |
| 313 | { | 360 | { |
| 361 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
| 314 | _raw_read_unlock(lock); | 362 | _raw_read_unlock(lock); |
| 315 | local_irq_enable(); | 363 | local_irq_enable(); |
| 316 | preempt_enable(); | 364 | preempt_enable(); |
| @@ -319,14 +367,16 @@ EXPORT_SYMBOL(_read_unlock_irq); | |||
| 319 | 367 | ||
| 320 | void __lockfunc _read_unlock_bh(rwlock_t *lock) | 368 | void __lockfunc _read_unlock_bh(rwlock_t *lock) |
| 321 | { | 369 | { |
| 370 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
| 322 | _raw_read_unlock(lock); | 371 | _raw_read_unlock(lock); |
| 323 | preempt_enable_no_resched(); | 372 | preempt_enable_no_resched(); |
| 324 | local_bh_enable(); | 373 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
| 325 | } | 374 | } |
| 326 | EXPORT_SYMBOL(_read_unlock_bh); | 375 | EXPORT_SYMBOL(_read_unlock_bh); |
| 327 | 376 | ||
| 328 | void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) | 377 | void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) |
| 329 | { | 378 | { |
| 379 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
| 330 | _raw_write_unlock(lock); | 380 | _raw_write_unlock(lock); |
| 331 | local_irq_restore(flags); | 381 | local_irq_restore(flags); |
| 332 | preempt_enable(); | 382 | preempt_enable(); |
| @@ -335,6 +385,7 @@ EXPORT_SYMBOL(_write_unlock_irqrestore); | |||
| 335 | 385 | ||
| 336 | void __lockfunc _write_unlock_irq(rwlock_t *lock) | 386 | void __lockfunc _write_unlock_irq(rwlock_t *lock) |
| 337 | { | 387 | { |
| 388 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
| 338 | _raw_write_unlock(lock); | 389 | _raw_write_unlock(lock); |
| 339 | local_irq_enable(); | 390 | local_irq_enable(); |
| 340 | preempt_enable(); | 391 | preempt_enable(); |
| @@ -343,9 +394,10 @@ EXPORT_SYMBOL(_write_unlock_irq); | |||
| 343 | 394 | ||
| 344 | void __lockfunc _write_unlock_bh(rwlock_t *lock) | 395 | void __lockfunc _write_unlock_bh(rwlock_t *lock) |
| 345 | { | 396 | { |
| 397 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
| 346 | _raw_write_unlock(lock); | 398 | _raw_write_unlock(lock); |
| 347 | preempt_enable_no_resched(); | 399 | preempt_enable_no_resched(); |
| 348 | local_bh_enable(); | 400 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
| 349 | } | 401 | } |
| 350 | EXPORT_SYMBOL(_write_unlock_bh); | 402 | EXPORT_SYMBOL(_write_unlock_bh); |
| 351 | 403 | ||
| @@ -353,11 +405,13 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) | |||
| 353 | { | 405 | { |
| 354 | local_bh_disable(); | 406 | local_bh_disable(); |
| 355 | preempt_disable(); | 407 | preempt_disable(); |
| 356 | if (_raw_spin_trylock(lock)) | 408 | if (_raw_spin_trylock(lock)) { |
| 409 | spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); | ||
| 357 | return 1; | 410 | return 1; |
| 411 | } | ||
| 358 | 412 | ||
| 359 | preempt_enable_no_resched(); | 413 | preempt_enable_no_resched(); |
| 360 | local_bh_enable(); | 414 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
| 361 | return 0; | 415 | return 0; |
| 362 | } | 416 | } |
| 363 | EXPORT_SYMBOL(_spin_trylock_bh); | 417 | EXPORT_SYMBOL(_spin_trylock_bh); |
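_spin_lock_nested(), added above under CONFIG_DEBUG_LOCK_ALLOC, lets a caller tell lockdep that a second lock of the same class is taken at a deeper subclass instead of triggering a false recursive-locking report. A hedged usage sketch; struct node and the re-linking operation are illustrative.

#include <linux/spinlock.h>
#include <linux/lockdep.h>      /* SINGLE_DEPTH_NESTING */

/* illustrative objects whose spinlocks share one lock class */
struct node {
        spinlock_t lock;
        struct node *parent;
};

static void relink_under_parent(struct node *n)
{
        spin_lock(&n->parent->lock);
        /* same lock class, one level deeper: annotate with a subclass */
        spin_lock_nested(&n->lock, SINGLE_DEPTH_NESTING);
        /* ... re-link n under its parent ... */
        spin_unlock(&n->lock);
        spin_unlock(&n->parent->lock);
}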
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c new file mode 100644 index 000000000000..b71816e47a30 --- /dev/null +++ b/kernel/stacktrace.c | |||
| @@ -0,0 +1,24 @@ | |||
| 1 | /* | ||
| 2 | * kernel/stacktrace.c | ||
| 3 | * | ||
| 4 | * Stack trace management functions | ||
| 5 | * | ||
| 6 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 7 | */ | ||
| 8 | #include <linux/sched.h> | ||
| 9 | #include <linux/kallsyms.h> | ||
| 10 | #include <linux/stacktrace.h> | ||
| 11 | |||
| 12 | void print_stack_trace(struct stack_trace *trace, int spaces) | ||
| 13 | { | ||
| 14 | int i, j; | ||
| 15 | |||
| 16 | for (i = 0; i < trace->nr_entries; i++) { | ||
| 17 | unsigned long ip = trace->entries[i]; | ||
| 18 | |||
| 19 | for (j = 0; j < spaces + 1; j++) | ||
| 20 | printk(" "); | ||
| 21 | print_ip_sym(ip); | ||
| 22 | } | ||
| 23 | } | ||
| 24 | |||
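print_stack_trace() above simply walks trace->entries and prints each address with print_ip_sym(), indenting every line by spaces + 1 blanks; the caller owns the entries array. A minimal sketch of feeding it, leaving out the architecture-specific capture call whose parameter list is not shown in this hunk:

#include <linux/stacktrace.h>

static void dump_saved_trace(void)
{
        unsigned long entries[16];
        struct stack_trace trace = {
                .nr_entries     = 0,
                .max_entries    = 16,   /* assumed field of struct stack_trace */
                .entries        = entries,
        };

        /* ... fill the trace via the arch's save_stack_trace() helper ... */

        print_stack_trace(&trace, 4);   /* each line indented by five blanks */
}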
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index dcfb5d731466..12458040e665 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -1,3 +1,6 @@ | |||
| 1 | /* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. | ||
| 2 | * GPL v2 and any later version. | ||
| 3 | */ | ||
| 1 | #include <linux/stop_machine.h> | 4 | #include <linux/stop_machine.h> |
| 2 | #include <linux/kthread.h> | 5 | #include <linux/kthread.h> |
| 3 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
| @@ -111,7 +114,6 @@ static int stop_machine(void) | |||
| 111 | /* If some failed, kill them all. */ | 114 | /* If some failed, kill them all. */ |
| 112 | if (ret < 0) { | 115 | if (ret < 0) { |
| 113 | stopmachine_set_state(STOPMACHINE_EXIT); | 116 | stopmachine_set_state(STOPMACHINE_EXIT); |
| 114 | up(&stopmachine_mutex); | ||
| 115 | return ret; | 117 | return ret; |
| 116 | } | 118 | } |
| 117 | 119 | ||
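The stop_machine.c hunk above adds the copyright header and removes an up(&stopmachine_mutex) call from the error path of stop_machine(). For context, this path is reached through the stop_machine_run() interface; a hedged sketch of a typical caller follows, with the callback body as an assumption.

#include <linux/stop_machine.h>

/* runs while every other CPU spins with interrupts disabled */
static int toggle_feature(void *data)
{
        int *enable = data;     /* illustrative payload */

        /* ... flip global state atomically with respect to all CPUs ... */
        return *enable ? 0 : -EINVAL;
}

static int do_toggle(int enable)
{
        /* NR_CPUS as the cpu argument means "run on any CPU" */
        return stop_machine_run(toggle_feature, &enable, NR_CPUS);
}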
diff --git a/kernel/sys.c b/kernel/sys.c index 90930b28d2ca..b88806c66244 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -4,7 +4,6 @@ | |||
| 4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
| 5 | */ | 5 | */ |
| 6 | 6 | ||
| 7 | #include <linux/config.h> | ||
| 8 | #include <linux/module.h> | 7 | #include <linux/module.h> |
| 9 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
| 10 | #include <linux/utsname.h> | 9 | #include <linux/utsname.h> |
| @@ -29,6 +28,7 @@ | |||
| 29 | #include <linux/tty.h> | 28 | #include <linux/tty.h> |
| 30 | #include <linux/signal.h> | 29 | #include <linux/signal.h> |
| 31 | #include <linux/cn_proc.h> | 30 | #include <linux/cn_proc.h> |
| 31 | #include <linux/getcpu.h> | ||
| 32 | 32 | ||
| 33 | #include <linux/compat.h> | 33 | #include <linux/compat.h> |
| 34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
| @@ -137,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, | |||
| 137 | unsigned long val, void *v) | 137 | unsigned long val, void *v) |
| 138 | { | 138 | { |
| 139 | int ret = NOTIFY_DONE; | 139 | int ret = NOTIFY_DONE; |
| 140 | struct notifier_block *nb; | 140 | struct notifier_block *nb, *next_nb; |
| 141 | 141 | ||
| 142 | nb = rcu_dereference(*nl); | 142 | nb = rcu_dereference(*nl); |
| 143 | while (nb) { | 143 | while (nb) { |
| 144 | next_nb = rcu_dereference(nb->next); | ||
| 144 | ret = nb->notifier_call(nb, val, v); | 145 | ret = nb->notifier_call(nb, val, v); |
| 145 | if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) | 146 | if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) |
| 146 | break; | 147 | break; |
| 147 | nb = rcu_dereference(nb->next); | 148 | nb = next_nb; |
| 148 | } | 149 | } |
| 149 | return ret; | 150 | return ret; |
| 150 | } | 151 | } |
| @@ -588,7 +589,7 @@ void emergency_restart(void) | |||
| 588 | } | 589 | } |
| 589 | EXPORT_SYMBOL_GPL(emergency_restart); | 590 | EXPORT_SYMBOL_GPL(emergency_restart); |
| 590 | 591 | ||
| 591 | void kernel_restart_prepare(char *cmd) | 592 | static void kernel_restart_prepare(char *cmd) |
| 592 | { | 593 | { |
| 593 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 594 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
| 594 | system_state = SYSTEM_RESTART; | 595 | system_state = SYSTEM_RESTART; |
| @@ -611,7 +612,6 @@ void kernel_restart(char *cmd) | |||
| 611 | } else { | 612 | } else { |
| 612 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); | 613 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); |
| 613 | } | 614 | } |
| 614 | printk(".\n"); | ||
| 615 | machine_restart(cmd); | 615 | machine_restart(cmd); |
| 616 | } | 616 | } |
| 617 | EXPORT_SYMBOL_GPL(kernel_restart); | 617 | EXPORT_SYMBOL_GPL(kernel_restart); |
| @@ -622,7 +622,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); | |||
| 622 | * Move into place and start executing a preloaded standalone | 622 | * Move into place and start executing a preloaded standalone |
| 623 | * executable. If nothing was preloaded return an error. | 623 | * executable. If nothing was preloaded return an error. |
| 624 | */ | 624 | */ |
| 625 | void kernel_kexec(void) | 625 | static void kernel_kexec(void) |
| 626 | { | 626 | { |
| 627 | #ifdef CONFIG_KEXEC | 627 | #ifdef CONFIG_KEXEC |
| 628 | struct kimage *image; | 628 | struct kimage *image; |
| @@ -636,7 +636,6 @@ void kernel_kexec(void) | |||
| 636 | machine_kexec(image); | 636 | machine_kexec(image); |
| 637 | #endif | 637 | #endif |
| 638 | } | 638 | } |
| 639 | EXPORT_SYMBOL_GPL(kernel_kexec); | ||
| 640 | 639 | ||
| 641 | void kernel_shutdown_prepare(enum system_states state) | 640 | void kernel_shutdown_prepare(enum system_states state) |
| 642 | { | 641 | { |
| @@ -1984,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
| 1984 | error = current->mm->dumpable; | 1983 | error = current->mm->dumpable; |
| 1985 | break; | 1984 | break; |
| 1986 | case PR_SET_DUMPABLE: | 1985 | case PR_SET_DUMPABLE: |
| 1987 | if (arg2 < 0 || arg2 > 2) { | 1986 | if (arg2 < 0 || arg2 > 1) { |
| 1988 | error = -EINVAL; | 1987 | error = -EINVAL; |
| 1989 | break; | 1988 | break; |
| 1990 | } | 1989 | } |
| @@ -2063,3 +2062,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
| 2063 | } | 2062 | } |
| 2064 | return error; | 2063 | return error; |
| 2065 | } | 2064 | } |
| 2065 | |||
| 2066 | asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, | ||
| 2067 | struct getcpu_cache __user *cache) | ||
| 2068 | { | ||
| 2069 | int err = 0; | ||
| 2070 | int cpu = raw_smp_processor_id(); | ||
| 2071 | if (cpup) | ||
| 2072 | err |= put_user(cpu, cpup); | ||
| 2073 | if (nodep) | ||
| 2074 | err |= put_user(cpu_to_node(cpu), nodep); | ||
| 2075 | if (cache) { | ||
| 2076 | /* | ||
| 2077 | * The cache is not needed for this implementation, | ||
| 2078 | * but make sure user programs pass something | ||
| 2079 | * valid. vsyscall implementations can instead make | ||
| 2080 | * good use of the cache. Only use t0 and t1 because | ||
| 2081 | * these are available in both 32bit and 64bit ABI (no | ||
| 2082 | * need for a compat_getcpu). The 32-bit layout has enough | ||
| 2083 | * padding for this. | ||
| 2084 | */ | ||
| 2085 | unsigned long t0, t1; | ||
| 2086 | get_user(t0, &cache->blob[0]); | ||
| 2087 | get_user(t1, &cache->blob[1]); | ||
| 2088 | t0++; | ||
| 2089 | t1++; | ||
| 2090 | put_user(t0, &cache->blob[0]); | ||
| 2091 | put_user(t1, &cache->blob[1]); | ||
| 2092 | } | ||
| 2093 | return err ? -EFAULT : 0; | ||
| 2094 | } | ||
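sys_getcpu() above reports the calling task's current CPU and node, and only touches the optional cache argument so that vsyscall implementations can make use of it later. A hedged userspace sketch; no glibc wrapper is assumed, so the raw syscall is used and __NR_getcpu must be defined for the target architecture.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
        unsigned cpu, node;

        /* third argument is the optional struct getcpu_cache, unused here */
        if (syscall(__NR_getcpu, &cpu, &node, NULL) != 0) {
                perror("getcpu");
                return 1;
        }
        printf("running on cpu %u, node %u\n", cpu, node);
        return 0;
}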
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index eb8bd214e7d7..c57c4532e296 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -18,7 +18,6 @@ | |||
| 18 | * Removed it and replaced it with older style, 03/23/00, Bill Wendling | 18 | * Removed it and replaced it with older style, 03/23/00, Bill Wendling |
| 19 | */ | 19 | */ |
| 20 | 20 | ||
| 21 | #include <linux/config.h> | ||
| 22 | #include <linux/module.h> | 21 | #include <linux/module.h> |
| 23 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
| 24 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
| @@ -53,6 +52,10 @@ | |||
| 53 | extern int proc_nr_files(ctl_table *table, int write, struct file *filp, | 52 | extern int proc_nr_files(ctl_table *table, int write, struct file *filp, |
| 54 | void __user *buffer, size_t *lenp, loff_t *ppos); | 53 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 55 | 54 | ||
| 55 | #ifdef CONFIG_X86 | ||
| 56 | #include <asm/nmi.h> | ||
| 57 | #endif | ||
| 58 | |||
| 56 | #if defined(CONFIG_SYSCTL) | 59 | #if defined(CONFIG_SYSCTL) |
| 57 | 60 | ||
| 58 | /* External variables not in a header file. */ | 61 | /* External variables not in a header file. */ |
| @@ -73,12 +76,7 @@ extern int printk_ratelimit_burst; | |||
| 73 | extern int pid_max_min, pid_max_max; | 76 | extern int pid_max_min, pid_max_max; |
| 74 | extern int sysctl_drop_caches; | 77 | extern int sysctl_drop_caches; |
| 75 | extern int percpu_pagelist_fraction; | 78 | extern int percpu_pagelist_fraction; |
| 76 | 79 | extern int compat_log; | |
| 77 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | ||
| 78 | int unknown_nmi_panic; | ||
| 79 | extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, | ||
| 80 | void __user *, size_t *, loff_t *); | ||
| 81 | #endif | ||
| 82 | 80 | ||
| 83 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 81 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
| 84 | static int maxolduid = 65535; | 82 | static int maxolduid = 65535; |
| @@ -132,8 +130,15 @@ extern int acct_parm[]; | |||
| 132 | extern int no_unaligned_warning; | 130 | extern int no_unaligned_warning; |
| 133 | #endif | 131 | #endif |
| 134 | 132 | ||
| 135 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, | 133 | #ifdef CONFIG_RT_MUTEXES |
| 136 | ctl_table *, void **); | 134 | extern int max_lock_depth; |
| 135 | #endif | ||
| 136 | |||
| 137 | #ifdef CONFIG_SYSCTL_SYSCALL | ||
| 138 | static int parse_table(int __user *, int, void __user *, size_t __user *, | ||
| 139 | void __user *, size_t, ctl_table *, void **); | ||
| 140 | #endif | ||
| 141 | |||
| 137 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, | 142 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, |
| 138 | void __user *buffer, size_t *lenp, loff_t *ppos); | 143 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 139 | 144 | ||
| @@ -143,7 +148,6 @@ static struct ctl_table_header root_table_header = | |||
| 143 | 148 | ||
| 144 | static ctl_table kern_table[]; | 149 | static ctl_table kern_table[]; |
| 145 | static ctl_table vm_table[]; | 150 | static ctl_table vm_table[]; |
| 146 | static ctl_table proc_table[]; | ||
| 147 | static ctl_table fs_table[]; | 151 | static ctl_table fs_table[]; |
| 148 | static ctl_table debug_table[]; | 152 | static ctl_table debug_table[]; |
| 149 | static ctl_table dev_table[]; | 153 | static ctl_table dev_table[]; |
| @@ -161,7 +165,7 @@ int sysctl_legacy_va_layout; | |||
| 161 | 165 | ||
| 162 | /* /proc declarations: */ | 166 | /* /proc declarations: */ |
| 163 | 167 | ||
| 164 | #ifdef CONFIG_PROC_FS | 168 | #ifdef CONFIG_PROC_SYSCTL |
| 165 | 169 | ||
| 166 | static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); | 170 | static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); |
| 167 | static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); | 171 | static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); |
| @@ -203,12 +207,6 @@ static ctl_table root_table[] = { | |||
| 203 | }, | 207 | }, |
| 204 | #endif | 208 | #endif |
| 205 | { | 209 | { |
| 206 | .ctl_name = CTL_PROC, | ||
| 207 | .procname = "proc", | ||
| 208 | .mode = 0555, | ||
| 209 | .child = proc_table, | ||
| 210 | }, | ||
| 211 | { | ||
| 212 | .ctl_name = CTL_FS, | 210 | .ctl_name = CTL_FS, |
| 213 | .procname = "fs", | 211 | .procname = "fs", |
| 214 | .mode = 0555, | 212 | .mode = 0555, |
| @@ -631,11 +629,27 @@ static ctl_table kern_table[] = { | |||
| 631 | .data = &unknown_nmi_panic, | 629 | .data = &unknown_nmi_panic, |
| 632 | .maxlen = sizeof (int), | 630 | .maxlen = sizeof (int), |
| 633 | .mode = 0644, | 631 | .mode = 0644, |
| 634 | .proc_handler = &proc_unknown_nmi_panic, | 632 | .proc_handler = &proc_dointvec, |
| 633 | }, | ||
| 634 | { | ||
| 635 | .ctl_name = KERN_NMI_WATCHDOG, | ||
| 636 | .procname = "nmi_watchdog", | ||
| 637 | .data = &nmi_watchdog_enabled, | ||
| 638 | .maxlen = sizeof (int), | ||
| 639 | .mode = 0644, | ||
| 640 | .proc_handler = &proc_nmi_enabled, | ||
| 635 | }, | 641 | }, |
| 636 | #endif | 642 | #endif |
| 637 | #if defined(CONFIG_X86) | 643 | #if defined(CONFIG_X86) |
| 638 | { | 644 | { |
| 645 | .ctl_name = KERN_PANIC_ON_NMI, | ||
| 646 | .procname = "panic_on_unrecovered_nmi", | ||
| 647 | .data = &panic_on_unrecovered_nmi, | ||
| 648 | .maxlen = sizeof(int), | ||
| 649 | .mode = 0644, | ||
| 650 | .proc_handler = &proc_dointvec, | ||
| 651 | }, | ||
| 652 | { | ||
| 639 | .ctl_name = KERN_BOOTLOADER_TYPE, | 653 | .ctl_name = KERN_BOOTLOADER_TYPE, |
| 640 | .procname = "bootloader_type", | 654 | .procname = "bootloader_type", |
| 641 | .data = &bootloader_type, | 655 | .data = &bootloader_type, |
| @@ -684,6 +698,27 @@ static ctl_table kern_table[] = { | |||
| 684 | .proc_handler = &proc_dointvec, | 698 | .proc_handler = &proc_dointvec, |
| 685 | }, | 699 | }, |
| 686 | #endif | 700 | #endif |
| 701 | #ifdef CONFIG_COMPAT | ||
| 702 | { | ||
| 703 | .ctl_name = KERN_COMPAT_LOG, | ||
| 704 | .procname = "compat-log", | ||
| 705 | .data = &compat_log, | ||
| 706 | .maxlen = sizeof (int), | ||
| 707 | .mode = 0644, | ||
| 708 | .proc_handler = &proc_dointvec, | ||
| 709 | }, | ||
| 710 | #endif | ||
| 711 | #ifdef CONFIG_RT_MUTEXES | ||
| 712 | { | ||
| 713 | .ctl_name = KERN_MAX_LOCK_DEPTH, | ||
| 714 | .procname = "max_lock_depth", | ||
| 715 | .data = &max_lock_depth, | ||
| 716 | .maxlen = sizeof(int), | ||
| 717 | .mode = 0644, | ||
| 718 | .proc_handler = &proc_dointvec, | ||
| 719 | }, | ||
| 720 | #endif | ||
| 721 | |||
| 687 | { .ctl_name = 0 } | 722 | { .ctl_name = 0 } |
| 688 | }; | 723 | }; |
| 689 | 724 | ||
| @@ -915,19 +950,40 @@ static ctl_table vm_table[] = { | |||
| 915 | .extra1 = &zero, | 950 | .extra1 = &zero, |
| 916 | }, | 951 | }, |
| 917 | { | 952 | { |
| 918 | .ctl_name = VM_ZONE_RECLAIM_INTERVAL, | 953 | .ctl_name = VM_MIN_UNMAPPED, |
| 919 | .procname = "zone_reclaim_interval", | 954 | .procname = "min_unmapped_ratio", |
| 920 | .data = &zone_reclaim_interval, | 955 | .data = &sysctl_min_unmapped_ratio, |
| 921 | .maxlen = sizeof(zone_reclaim_interval), | 956 | .maxlen = sizeof(sysctl_min_unmapped_ratio), |
| 922 | .mode = 0644, | 957 | .mode = 0644, |
| 923 | .proc_handler = &proc_dointvec_jiffies, | 958 | .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, |
| 924 | .strategy = &sysctl_jiffies, | 959 | .strategy = &sysctl_intvec, |
| 960 | .extra1 = &zero, | ||
| 961 | .extra2 = &one_hundred, | ||
| 962 | }, | ||
| 963 | { | ||
| 964 | .ctl_name = VM_MIN_SLAB, | ||
| 965 | .procname = "min_slab_ratio", | ||
| 966 | .data = &sysctl_min_slab_ratio, | ||
| 967 | .maxlen = sizeof(sysctl_min_slab_ratio), | ||
| 968 | .mode = 0644, | ||
| 969 | .proc_handler = &sysctl_min_slab_ratio_sysctl_handler, | ||
| 970 | .strategy = &sysctl_intvec, | ||
| 971 | .extra1 = &zero, | ||
| 972 | .extra2 = &one_hundred, | ||
| 973 | }, | ||
| 974 | #endif | ||
| 975 | #ifdef CONFIG_X86_32 | ||
| 976 | { | ||
| 977 | .ctl_name = VM_VDSO_ENABLED, | ||
| 978 | .procname = "vdso_enabled", | ||
| 979 | .data = &vdso_enabled, | ||
| 980 | .maxlen = sizeof(vdso_enabled), | ||
| 981 | .mode = 0644, | ||
| 982 | .proc_handler = &proc_dointvec, | ||
| 983 | .strategy = &sysctl_intvec, | ||
| 984 | .extra1 = &zero, | ||
| 925 | }, | 985 | }, |
| 926 | #endif | 986 | #endif |
| 927 | { .ctl_name = 0 } | ||
| 928 | }; | ||
| 929 | |||
| 930 | static ctl_table proc_table[] = { | ||
| 931 | { .ctl_name = 0 } | 987 | { .ctl_name = 0 } |
| 932 | }; | 988 | }; |
| 933 | 989 | ||
| @@ -1110,12 +1166,13 @@ static void start_unregistering(struct ctl_table_header *p) | |||
| 1110 | 1166 | ||
| 1111 | void __init sysctl_init(void) | 1167 | void __init sysctl_init(void) |
| 1112 | { | 1168 | { |
| 1113 | #ifdef CONFIG_PROC_FS | 1169 | #ifdef CONFIG_PROC_SYSCTL |
| 1114 | register_proc_table(root_table, proc_sys_root, &root_table_header); | 1170 | register_proc_table(root_table, proc_sys_root, &root_table_header); |
| 1115 | init_irq_proc(); | 1171 | init_irq_proc(); |
| 1116 | #endif | 1172 | #endif |
| 1117 | } | 1173 | } |
| 1118 | 1174 | ||
| 1175 | #ifdef CONFIG_SYSCTL_SYSCALL | ||
| 1119 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, | 1176 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, |
| 1120 | void __user *newval, size_t newlen) | 1177 | void __user *newval, size_t newlen) |
| 1121 | { | 1178 | { |
| @@ -1169,6 +1226,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) | |||
| 1169 | unlock_kernel(); | 1226 | unlock_kernel(); |
| 1170 | return error; | 1227 | return error; |
| 1171 | } | 1228 | } |
| 1229 | #endif /* CONFIG_SYSCTL_SYSCALL */ | ||
| 1172 | 1230 | ||
| 1173 | /* | 1231 | /* |
| 1174 | * ctl_perm does NOT grant the superuser all rights automatically, because | 1232 | * ctl_perm does NOT grant the superuser all rights automatically, because |
| @@ -1195,6 +1253,7 @@ static inline int ctl_perm(ctl_table *table, int op) | |||
| 1195 | return test_perm(table->mode, op); | 1253 | return test_perm(table->mode, op); |
| 1196 | } | 1254 | } |
| 1197 | 1255 | ||
| 1256 | #ifdef CONFIG_SYSCTL_SYSCALL | ||
| 1198 | static int parse_table(int __user *name, int nlen, | 1257 | static int parse_table(int __user *name, int nlen, |
| 1199 | void __user *oldval, size_t __user *oldlenp, | 1258 | void __user *oldval, size_t __user *oldlenp, |
| 1200 | void __user *newval, size_t newlen, | 1259 | void __user *newval, size_t newlen, |
| @@ -1284,6 +1343,7 @@ int do_sysctl_strategy (ctl_table *table, | |||
| 1284 | } | 1343 | } |
| 1285 | return 0; | 1344 | return 0; |
| 1286 | } | 1345 | } |
| 1346 | #endif /* CONFIG_SYSCTL_SYSCALL */ | ||
| 1287 | 1347 | ||
| 1288 | /** | 1348 | /** |
| 1289 | * register_sysctl_table - register a sysctl hierarchy | 1349 | * register_sysctl_table - register a sysctl hierarchy |
| @@ -1371,7 +1431,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, | |||
| 1371 | else | 1431 | else |
| 1372 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); | 1432 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); |
| 1373 | spin_unlock(&sysctl_lock); | 1433 | spin_unlock(&sysctl_lock); |
| 1374 | #ifdef CONFIG_PROC_FS | 1434 | #ifdef CONFIG_PROC_SYSCTL |
| 1375 | register_proc_table(table, proc_sys_root, tmp); | 1435 | register_proc_table(table, proc_sys_root, tmp); |
| 1376 | #endif | 1436 | #endif |
| 1377 | return tmp; | 1437 | return tmp; |
| @@ -1389,18 +1449,31 @@ void unregister_sysctl_table(struct ctl_table_header * header) | |||
| 1389 | might_sleep(); | 1449 | might_sleep(); |
| 1390 | spin_lock(&sysctl_lock); | 1450 | spin_lock(&sysctl_lock); |
| 1391 | start_unregistering(header); | 1451 | start_unregistering(header); |
| 1392 | #ifdef CONFIG_PROC_FS | 1452 | #ifdef CONFIG_PROC_SYSCTL |
| 1393 | unregister_proc_table(header->ctl_table, proc_sys_root); | 1453 | unregister_proc_table(header->ctl_table, proc_sys_root); |
| 1394 | #endif | 1454 | #endif |
| 1395 | spin_unlock(&sysctl_lock); | 1455 | spin_unlock(&sysctl_lock); |
| 1396 | kfree(header); | 1456 | kfree(header); |
| 1397 | } | 1457 | } |
| 1398 | 1458 | ||
| 1459 | #else /* !CONFIG_SYSCTL */ | ||
| 1460 | struct ctl_table_header * register_sysctl_table(ctl_table * table, | ||
| 1461 | int insert_at_head) | ||
| 1462 | { | ||
| 1463 | return NULL; | ||
| 1464 | } | ||
| 1465 | |||
| 1466 | void unregister_sysctl_table(struct ctl_table_header * table) | ||
| 1467 | { | ||
| 1468 | } | ||
| 1469 | |||
| 1470 | #endif /* CONFIG_SYSCTL */ | ||
| 1471 | |||
| 1399 | /* | 1472 | /* |
| 1400 | * /proc/sys support | 1473 | * /proc/sys support |
| 1401 | */ | 1474 | */ |
| 1402 | 1475 | ||
| 1403 | #ifdef CONFIG_PROC_FS | 1476 | #ifdef CONFIG_PROC_SYSCTL |
| 1404 | 1477 | ||
| 1405 | /* Scan the sysctl entries in table and add them all into /proc */ | 1478 | /* Scan the sysctl entries in table and add them all into /proc */ |
| 1406 | static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) | 1479 | static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) |
| @@ -1839,7 +1912,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, | |||
| 1839 | return -EPERM; | 1912 | return -EPERM; |
| 1840 | } | 1913 | } |
| 1841 | 1914 | ||
| 1842 | op = (current->pid == 1) ? OP_SET : OP_AND; | 1915 | op = is_init(current) ? OP_SET : OP_AND; |
| 1843 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, | 1916 | return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, |
| 1844 | do_proc_dointvec_bset_conv,&op); | 1917 | do_proc_dointvec_bset_conv,&op); |
| 1845 | } | 1918 | } |
| @@ -2262,6 +2335,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, | |||
| 2262 | #endif /* CONFIG_PROC_FS */ | 2335 | #endif /* CONFIG_PROC_FS */ |
| 2263 | 2336 | ||
| 2264 | 2337 | ||
| 2338 | #ifdef CONFIG_SYSCTL_SYSCALL | ||
| 2265 | /* | 2339 | /* |
| 2266 | * General sysctl support routines | 2340 | * General sysctl support routines |
| 2267 | */ | 2341 | */ |
| @@ -2404,11 +2478,19 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
| 2404 | return 1; | 2478 | return 1; |
| 2405 | } | 2479 | } |
| 2406 | 2480 | ||
| 2407 | #else /* CONFIG_SYSCTL */ | 2481 | #else /* CONFIG_SYSCTL_SYSCALL */ |
| 2408 | 2482 | ||
| 2409 | 2483 | ||
| 2410 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) | 2484 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) |
| 2411 | { | 2485 | { |
| 2486 | static int msg_count; | ||
| 2487 | |||
| 2488 | if (msg_count < 5) { | ||
| 2489 | msg_count++; | ||
| 2490 | printk(KERN_INFO | ||
| 2491 | "warning: process `%s' used the removed sysctl " | ||
| 2492 | "system call\n", current->comm); | ||
| 2493 | } | ||
| 2412 | return -ENOSYS; | 2494 | return -ENOSYS; |
| 2413 | } | 2495 | } |
| 2414 | 2496 | ||
| @@ -2440,73 +2522,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
| 2440 | return -ENOSYS; | 2522 | return -ENOSYS; |
| 2441 | } | 2523 | } |
| 2442 | 2524 | ||
| 2443 | int proc_dostring(ctl_table *table, int write, struct file *filp, | 2525 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
| 2444 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2445 | { | ||
| 2446 | return -ENOSYS; | ||
| 2447 | } | ||
| 2448 | |||
| 2449 | int proc_dointvec(ctl_table *table, int write, struct file *filp, | ||
| 2450 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2451 | { | ||
| 2452 | return -ENOSYS; | ||
| 2453 | } | ||
| 2454 | |||
| 2455 | int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, | ||
| 2456 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2457 | { | ||
| 2458 | return -ENOSYS; | ||
| 2459 | } | ||
| 2460 | |||
| 2461 | int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, | ||
| 2462 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2463 | { | ||
| 2464 | return -ENOSYS; | ||
| 2465 | } | ||
| 2466 | |||
| 2467 | int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, | ||
| 2468 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2469 | { | ||
| 2470 | return -ENOSYS; | ||
| 2471 | } | ||
| 2472 | |||
| 2473 | int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp, | ||
| 2474 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2475 | { | ||
| 2476 | return -ENOSYS; | ||
| 2477 | } | ||
| 2478 | |||
| 2479 | int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, | ||
| 2480 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2481 | { | ||
| 2482 | return -ENOSYS; | ||
| 2483 | } | ||
| 2484 | |||
| 2485 | int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, | ||
| 2486 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2487 | { | ||
| 2488 | return -ENOSYS; | ||
| 2489 | } | ||
| 2490 | |||
| 2491 | int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, | ||
| 2492 | struct file *filp, | ||
| 2493 | void __user *buffer, | ||
| 2494 | size_t *lenp, loff_t *ppos) | ||
| 2495 | { | ||
| 2496 | return -ENOSYS; | ||
| 2497 | } | ||
| 2498 | |||
| 2499 | struct ctl_table_header * register_sysctl_table(ctl_table * table, | ||
| 2500 | int insert_at_head) | ||
| 2501 | { | ||
| 2502 | return NULL; | ||
| 2503 | } | ||
| 2504 | |||
| 2505 | void unregister_sysctl_table(struct ctl_table_header * table) | ||
| 2506 | { | ||
| 2507 | } | ||
| 2508 | |||
| 2509 | #endif /* CONFIG_SYSCTL */ | ||
| 2510 | 2526 | ||
| 2511 | /* | 2527 | /* |
| 2512 | * No sense putting this after each symbol definition, twice, | 2528 | * No sense putting this after each symbol definition, twice, |
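With the sysctl.c changes above, register_sysctl_table() and unregister_sysctl_table() gain stubs when CONFIG_SYSCTL is off, so callers no longer need #ifdef guards and must simply tolerate a NULL return. A hedged sketch of a caller; the binary ctl_name value and the knob itself are illustrative only.

#include <linux/init.h>
#include <linux/sysctl.h>

static int example_timeout = 30;                /* illustrative tunable */

static ctl_table example_table[] = {
        {
                .ctl_name       = 9999,         /* hypothetical binary id */
                .procname       = "example_timeout",
                .data           = &example_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
        /* NULL can mean "sysctl compiled out" as well as an error */
        example_header = register_sysctl_table(example_table, 0);
        return 0;
}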
diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 000000000000..2ed4040d0dc5 --- /dev/null +++ b/kernel/taskstats.c | |||
| @@ -0,0 +1,564 @@ | |||
| 1 | /* | ||
| 2 | * taskstats.c - Export per-task statistics to userland | ||
| 3 | * | ||
| 4 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | ||
| 5 | * (C) Balbir Singh, IBM Corp. 2006 | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License as published by | ||
| 9 | * the Free Software Foundation; either version 2 of the License, or | ||
| 10 | * (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | * | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include <linux/kernel.h> | ||
| 20 | #include <linux/taskstats_kern.h> | ||
| 21 | #include <linux/delayacct.h> | ||
| 22 | #include <linux/cpumask.h> | ||
| 23 | #include <linux/percpu.h> | ||
| 24 | #include <net/genetlink.h> | ||
| 25 | #include <asm/atomic.h> | ||
| 26 | |||
| 27 | /* | ||
| 28 | * Maximum length of a cpumask that can be specified in | ||
| 29 | * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute | ||
| 30 | */ | ||
| 31 | #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) | ||
| 32 | |||
| 33 | static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; | ||
| 34 | static int family_registered; | ||
| 35 | kmem_cache_t *taskstats_cache; | ||
| 36 | |||
| 37 | static struct genl_family family = { | ||
| 38 | .id = GENL_ID_GENERATE, | ||
| 39 | .name = TASKSTATS_GENL_NAME, | ||
| 40 | .version = TASKSTATS_GENL_VERSION, | ||
| 41 | .maxattr = TASKSTATS_CMD_ATTR_MAX, | ||
| 42 | }; | ||
| 43 | |||
| 44 | static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] | ||
| 45 | __read_mostly = { | ||
| 46 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, | ||
| 47 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, | ||
| 48 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, | ||
| 49 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; | ||
| 50 | |||
| 51 | struct listener { | ||
| 52 | struct list_head list; | ||
| 53 | pid_t pid; | ||
| 54 | char valid; | ||
| 55 | }; | ||
| 56 | |||
| 57 | struct listener_list { | ||
| 58 | struct rw_semaphore sem; | ||
| 59 | struct list_head list; | ||
| 60 | }; | ||
| 61 | static DEFINE_PER_CPU(struct listener_list, listener_array); | ||
| 62 | |||
| 63 | enum actions { | ||
| 64 | REGISTER, | ||
| 65 | DEREGISTER, | ||
| 66 | CPU_DONT_CARE | ||
| 67 | }; | ||
| 68 | |||
| 69 | static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | ||
| 70 | void **replyp, size_t size) | ||
| 71 | { | ||
| 72 | struct sk_buff *skb; | ||
| 73 | void *reply; | ||
| 74 | |||
| 75 | /* | ||
| 76 | * If new attributes are added, please revisit this allocation | ||
| 77 | */ | ||
| 78 | skb = nlmsg_new(size, GFP_KERNEL); | ||
| 79 | if (!skb) | ||
| 80 | return -ENOMEM; | ||
| 81 | |||
| 82 | if (!info) { | ||
| 83 | int seq = get_cpu_var(taskstats_seqnum)++; | ||
| 84 | put_cpu_var(taskstats_seqnum); | ||
| 85 | |||
| 86 | reply = genlmsg_put(skb, 0, seq, | ||
| 87 | family.id, 0, 0, | ||
| 88 | cmd, family.version); | ||
| 89 | } else | ||
| 90 | reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, | ||
| 91 | family.id, 0, 0, | ||
| 92 | cmd, family.version); | ||
| 93 | if (reply == NULL) { | ||
| 94 | nlmsg_free(skb); | ||
| 95 | return -EINVAL; | ||
| 96 | } | ||
| 97 | |||
| 98 | *skbp = skb; | ||
| 99 | *replyp = reply; | ||
| 100 | return 0; | ||
| 101 | } | ||
| 102 | |||
| 103 | /* | ||
| 104 | * Send taskstats data in @skb to listener with nl_pid @pid | ||
| 105 | */ | ||
| 106 | static int send_reply(struct sk_buff *skb, pid_t pid) | ||
| 107 | { | ||
| 108 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | ||
| 109 | void *reply = genlmsg_data(genlhdr); | ||
| 110 | int rc; | ||
| 111 | |||
| 112 | rc = genlmsg_end(skb, reply); | ||
| 113 | if (rc < 0) { | ||
| 114 | nlmsg_free(skb); | ||
| 115 | return rc; | ||
| 116 | } | ||
| 117 | |||
| 118 | return genlmsg_unicast(skb, pid); | ||
| 119 | } | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Send taskstats data in @skb to listeners registered for @cpu's exit data | ||
| 123 | */ | ||
| 124 | static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) | ||
| 125 | { | ||
| 126 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | ||
| 127 | struct listener_list *listeners; | ||
| 128 | struct listener *s, *tmp; | ||
| 129 | struct sk_buff *skb_next, *skb_cur = skb; | ||
| 130 | void *reply = genlmsg_data(genlhdr); | ||
| 131 | int rc, delcount = 0; | ||
| 132 | |||
| 133 | rc = genlmsg_end(skb, reply); | ||
| 134 | if (rc < 0) { | ||
| 135 | nlmsg_free(skb); | ||
| 136 | return; | ||
| 137 | } | ||
| 138 | |||
| 139 | rc = 0; | ||
| 140 | listeners = &per_cpu(listener_array, cpu); | ||
| 141 | down_read(&listeners->sem); | ||
| 142 | list_for_each_entry(s, &listeners->list, list) { | ||
| 143 | skb_next = NULL; | ||
| 144 | if (!list_is_last(&s->list, &listeners->list)) { | ||
| 145 | skb_next = skb_clone(skb_cur, GFP_KERNEL); | ||
| 146 | if (!skb_next) | ||
| 147 | break; | ||
| 148 | } | ||
| 149 | rc = genlmsg_unicast(skb_cur, s->pid); | ||
| 150 | if (rc == -ECONNREFUSED) { | ||
| 151 | s->valid = 0; | ||
| 152 | delcount++; | ||
| 153 | } | ||
| 154 | skb_cur = skb_next; | ||
| 155 | } | ||
| 156 | up_read(&listeners->sem); | ||
| 157 | |||
| 158 | if (skb_cur) | ||
| 159 | nlmsg_free(skb_cur); | ||
| 160 | |||
| 161 | if (!delcount) | ||
| 162 | return; | ||
| 163 | |||
| 164 | /* Delete invalidated entries */ | ||
| 165 | down_write(&listeners->sem); | ||
| 166 | list_for_each_entry_safe(s, tmp, &listeners->list, list) { | ||
| 167 | if (!s->valid) { | ||
| 168 | list_del(&s->list); | ||
| 169 | kfree(s); | ||
| 170 | } | ||
| 171 | } | ||
| 172 | up_write(&listeners->sem); | ||
| 173 | } | ||
| 174 | |||
| 175 | static int fill_pid(pid_t pid, struct task_struct *pidtsk, | ||
| 176 | struct taskstats *stats) | ||
| 177 | { | ||
| 178 | int rc = 0; | ||
| 179 | struct task_struct *tsk = pidtsk; | ||
| 180 | |||
| 181 | if (!pidtsk) { | ||
| 182 | read_lock(&tasklist_lock); | ||
| 183 | tsk = find_task_by_pid(pid); | ||
| 184 | if (!tsk) { | ||
| 185 | read_unlock(&tasklist_lock); | ||
| 186 | return -ESRCH; | ||
| 187 | } | ||
| 188 | get_task_struct(tsk); | ||
| 189 | read_unlock(&tasklist_lock); | ||
| 190 | } else | ||
| 191 | get_task_struct(tsk); | ||
| 192 | |||
| 193 | /* | ||
| 194 | * Each accounting subsystem adds calls to its functions to | ||
| 195 | * fill in relevant parts of struct taskstats as follows | ||
| 196 | * | ||
| 197 | * per-task-foo(stats, tsk); | ||
| 198 | */ | ||
| 199 | |||
| 200 | delayacct_add_tsk(stats, tsk); | ||
| 201 | stats->version = TASKSTATS_VERSION; | ||
| 202 | |||
| 203 | /* Define err: label here if needed */ | ||
| 204 | put_task_struct(tsk); | ||
| 205 | return rc; | ||
| 206 | |||
| 207 | } | ||
| 208 | |||
| 209 | static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, | ||
| 210 | struct taskstats *stats) | ||
| 211 | { | ||
| 212 | struct task_struct *tsk, *first; | ||
| 213 | unsigned long flags; | ||
| 214 | |||
| 215 | /* | ||
| 216 | * Add additional stats from live tasks except zombie thread group | ||
| 217 | * leaders who are already counted with the dead tasks | ||
| 218 | */ | ||
| 219 | first = tgidtsk; | ||
| 220 | if (!first) { | ||
| 221 | read_lock(&tasklist_lock); | ||
| 222 | first = find_task_by_pid(tgid); | ||
| 223 | if (!first) { | ||
| 224 | read_unlock(&tasklist_lock); | ||
| 225 | return -ESRCH; | ||
| 226 | } | ||
| 227 | get_task_struct(first); | ||
| 228 | read_unlock(&tasklist_lock); | ||
| 229 | } else | ||
| 230 | get_task_struct(first); | ||
| 231 | |||
| 232 | /* Start with stats from dead tasks */ | ||
| 233 | spin_lock_irqsave(&first->signal->stats_lock, flags); | ||
| 234 | if (first->signal->stats) | ||
| 235 | memcpy(stats, first->signal->stats, sizeof(*stats)); | ||
| 236 | spin_unlock_irqrestore(&first->signal->stats_lock, flags); | ||
| 237 | |||
| 238 | tsk = first; | ||
| 239 | read_lock(&tasklist_lock); | ||
| 240 | do { | ||
| 241 | if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) | ||
| 242 | continue; | ||
| 243 | /* | ||
| 244 | * Accounting subsystem can call its functions here to | ||
| 245 | * fill in relevant parts of struct taskstats as follows | ||
| 246 | * | ||
| 247 | * per-task-foo(stats, tsk); | ||
| 248 | */ | ||
| 249 | delayacct_add_tsk(stats, tsk); | ||
| 250 | |||
| 251 | } while_each_thread(first, tsk); | ||
| 252 | read_unlock(&tasklist_lock); | ||
| 253 | stats->version = TASKSTATS_VERSION; | ||
| 254 | |||
| 255 | /* | ||
| 256 | * Accounting subsystems can also add calls here to modify | ||
| 257 | * fields of taskstats. | ||
| 258 | */ | ||
| 259 | |||
| 260 | return 0; | ||
| 261 | } | ||
| 262 | |||
| 263 | |||
| 264 | static void fill_tgid_exit(struct task_struct *tsk) | ||
| 265 | { | ||
| 266 | unsigned long flags; | ||
| 267 | |||
| 268 | spin_lock_irqsave(&tsk->signal->stats_lock, flags); | ||
| 269 | if (!tsk->signal->stats) | ||
| 270 | goto ret; | ||
| 271 | |||
| 272 | /* | ||
| 273 | * Each accounting subsystem calls its functions here to | ||
| 274 | * accumulate its per-task stats for tsk into the per-tgid structure | ||
| 275 | * | ||
| 276 | * per-task-foo(tsk->signal->stats, tsk); | ||
| 277 | */ | ||
| 278 | delayacct_add_tsk(tsk->signal->stats, tsk); | ||
| 279 | ret: | ||
| 280 | spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); | ||
| 281 | return; | ||
| 282 | } | ||
| 283 | |||
| 284 | static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) | ||
| 285 | { | ||
| 286 | struct listener_list *listeners; | ||
| 287 | struct listener *s, *tmp; | ||
| 288 | unsigned int cpu; | ||
| 289 | cpumask_t mask = *maskp; | ||
| 290 | |||
| 291 | if (!cpus_subset(mask, cpu_possible_map)) | ||
| 292 | return -EINVAL; | ||
| 293 | |||
| 294 | if (isadd == REGISTER) { | ||
| 295 | for_each_cpu_mask(cpu, mask) { | ||
| 296 | s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, | ||
| 297 | cpu_to_node(cpu)); | ||
| 298 | if (!s) | ||
| 299 | goto cleanup; | ||
| 300 | s->pid = pid; | ||
| 301 | INIT_LIST_HEAD(&s->list); | ||
| 302 | s->valid = 1; | ||
| 303 | |||
| 304 | listeners = &per_cpu(listener_array, cpu); | ||
| 305 | down_write(&listeners->sem); | ||
| 306 | list_add(&s->list, &listeners->list); | ||
| 307 | up_write(&listeners->sem); | ||
| 308 | } | ||
| 309 | return 0; | ||
| 310 | } | ||
| 311 | |||
| 312 | /* Deregister or cleanup */ | ||
| 313 | cleanup: | ||
| 314 | for_each_cpu_mask(cpu, mask) { | ||
| 315 | listeners = &per_cpu(listener_array, cpu); | ||
| 316 | down_write(&listeners->sem); | ||
| 317 | list_for_each_entry_safe(s, tmp, &listeners->list, list) { | ||
| 318 | if (s->pid == pid) { | ||
| 319 | list_del(&s->list); | ||
| 320 | kfree(s); | ||
| 321 | break; | ||
| 322 | } | ||
| 323 | } | ||
| 324 | up_write(&listeners->sem); | ||
| 325 | } | ||
| 326 | return 0; | ||
| 327 | } | ||
| 328 | |||
| 329 | static int parse(struct nlattr *na, cpumask_t *mask) | ||
| 330 | { | ||
| 331 | char *data; | ||
| 332 | int len; | ||
| 333 | int ret; | ||
| 334 | |||
| 335 | if (na == NULL) | ||
| 336 | return 1; | ||
| 337 | len = nla_len(na); | ||
| 338 | if (len > TASKSTATS_CPUMASK_MAXLEN) | ||
| 339 | return -E2BIG; | ||
| 340 | if (len < 1) | ||
| 341 | return -EINVAL; | ||
| 342 | data = kmalloc(len, GFP_KERNEL); | ||
| 343 | if (!data) | ||
| 344 | return -ENOMEM; | ||
| 345 | nla_strlcpy(data, na, len); | ||
| 346 | ret = cpulist_parse(data, *mask); | ||
| 347 | kfree(data); | ||
| 348 | return ret; | ||
| 349 | } | ||
| 350 | |||
| 351 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | ||
| 352 | { | ||
| 353 | int rc = 0; | ||
| 354 | struct sk_buff *rep_skb; | ||
| 355 | struct taskstats stats; | ||
| 356 | void *reply; | ||
| 357 | size_t size; | ||
| 358 | struct nlattr *na; | ||
| 359 | cpumask_t mask; | ||
| 360 | |||
| 361 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); | ||
| 362 | if (rc < 0) | ||
| 363 | return rc; | ||
| 364 | if (rc == 0) | ||
| 365 | return add_del_listener(info->snd_pid, &mask, REGISTER); | ||
| 366 | |||
| 367 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); | ||
| 368 | if (rc < 0) | ||
| 369 | return rc; | ||
| 370 | if (rc == 0) | ||
| 371 | return add_del_listener(info->snd_pid, &mask, DEREGISTER); | ||
| 372 | |||
| 373 | /* | ||
| 374 | * Size includes space for nested attributes | ||
| 375 | */ | ||
| 376 | size = nla_total_size(sizeof(u32)) + | ||
| 377 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
| 378 | |||
| 379 | memset(&stats, 0, sizeof(stats)); | ||
| 380 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | ||
| 381 | if (rc < 0) | ||
| 382 | return rc; | ||
| 383 | |||
| 384 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { | ||
| 385 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); | ||
| 386 | rc = fill_pid(pid, NULL, &stats); | ||
| 387 | if (rc < 0) | ||
| 388 | goto err; | ||
| 389 | |||
| 390 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | ||
| 391 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); | ||
| 392 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
| 393 | stats); | ||
| 394 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { | ||
| 395 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | ||
| 396 | rc = fill_tgid(tgid, NULL, &stats); | ||
| 397 | if (rc < 0) | ||
| 398 | goto err; | ||
| 399 | |||
| 400 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | ||
| 401 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); | ||
| 402 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
| 403 | stats); | ||
| 404 | } else { | ||
| 405 | rc = -EINVAL; | ||
| 406 | goto err; | ||
| 407 | } | ||
| 408 | |||
| 409 | nla_nest_end(rep_skb, na); | ||
| 410 | |||
| 411 | return send_reply(rep_skb, info->snd_pid); | ||
| 412 | |||
| 413 | nla_put_failure: | ||
| 414 | return genlmsg_cancel(rep_skb, reply); | ||
| 415 | err: | ||
| 416 | nlmsg_free(rep_skb); | ||
| 417 | return rc; | ||
| 418 | } | ||
| 419 | |||
| 420 | void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) | ||
| 421 | { | ||
| 422 | struct listener_list *listeners; | ||
| 423 | struct taskstats *tmp; | ||
| 424 | /* | ||
| 425 | * This is the cpu on which the task is exiting currently and will | ||
| 426 | * be the one for which the exit event is sent, even if the cpu | ||
| 427 | * on which this function is running changes later. | ||
| 428 | */ | ||
| 429 | *mycpu = raw_smp_processor_id(); | ||
| 430 | |||
| 431 | *ptidstats = NULL; | ||
| 432 | tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); | ||
| 433 | if (!tmp) | ||
| 434 | return; | ||
| 435 | |||
| 436 | listeners = &per_cpu(listener_array, *mycpu); | ||
| 437 | down_read(&listeners->sem); | ||
| 438 | if (!list_empty(&listeners->list)) { | ||
| 439 | *ptidstats = tmp; | ||
| 440 | tmp = NULL; | ||
| 441 | } | ||
| 442 | up_read(&listeners->sem); | ||
| 443 | kfree(tmp); | ||
| 444 | } | ||
| 445 | |||
| 446 | /* Send pid data out on exit */ | ||
| 447 | void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, | ||
| 448 | int group_dead, unsigned int mycpu) | ||
| 449 | { | ||
| 450 | int rc; | ||
| 451 | struct sk_buff *rep_skb; | ||
| 452 | void *reply; | ||
| 453 | size_t size; | ||
| 454 | int is_thread_group; | ||
| 455 | struct nlattr *na; | ||
| 456 | unsigned long flags; | ||
| 457 | |||
| 458 | if (!family_registered || !tidstats) | ||
| 459 | return; | ||
| 460 | |||
| 461 | spin_lock_irqsave(&tsk->signal->stats_lock, flags); | ||
| 462 | is_thread_group = tsk->signal->stats ? 1 : 0; | ||
| 463 | spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); | ||
| 464 | |||
| 465 | rc = 0; | ||
| 466 | /* | ||
| 467 | * Size includes space for nested attributes | ||
| 468 | */ | ||
| 469 | size = nla_total_size(sizeof(u32)) + | ||
| 470 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
| 471 | |||
| 472 | if (is_thread_group) | ||
| 473 | size = 2 * size; /* PID + STATS + TGID + STATS */ | ||
| 474 | |||
| 475 | rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | ||
| 476 | if (rc < 0) | ||
| 477 | goto ret; | ||
| 478 | |||
| 479 | rc = fill_pid(tsk->pid, tsk, tidstats); | ||
| 480 | if (rc < 0) | ||
| 481 | goto err_skb; | ||
| 482 | |||
| 483 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | ||
| 484 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); | ||
| 485 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
| 486 | *tidstats); | ||
| 487 | nla_nest_end(rep_skb, na); | ||
| 488 | |||
| 489 | if (!is_thread_group) | ||
| 490 | goto send; | ||
| 491 | |||
| 492 | /* | ||
| 493 | * tsk has/had a thread group so fill the tsk->signal->stats structure | ||
| 494 | * Doesn't matter if tsk is the leader or the last group member leaving | ||
| 495 | */ | ||
| 496 | |||
| 497 | fill_tgid_exit(tsk); | ||
| 498 | if (!group_dead) | ||
| 499 | goto send; | ||
| 500 | |||
| 501 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | ||
| 502 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); | ||
| 503 | /* No locking needed for tsk->signal->stats since group is dead */ | ||
| 504 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
| 505 | *tsk->signal->stats); | ||
| 506 | nla_nest_end(rep_skb, na); | ||
| 507 | |||
| 508 | send: | ||
| 509 | send_cpu_listeners(rep_skb, mycpu); | ||
| 510 | return; | ||
| 511 | |||
| 512 | nla_put_failure: | ||
| 513 | genlmsg_cancel(rep_skb, reply); | ||
| 514 | goto ret; | ||
| 515 | err_skb: | ||
| 516 | nlmsg_free(rep_skb); | ||
| 517 | ret: | ||
| 518 | return; | ||
| 519 | } | ||
| 520 | |||
| 521 | static struct genl_ops taskstats_ops = { | ||
| 522 | .cmd = TASKSTATS_CMD_GET, | ||
| 523 | .doit = taskstats_user_cmd, | ||
| 524 | .policy = taskstats_cmd_get_policy, | ||
| 525 | }; | ||
| 526 | |||
| 527 | /* Needed early in initialization */ | ||
| 528 | void __init taskstats_init_early(void) | ||
| 529 | { | ||
| 530 | unsigned int i; | ||
| 531 | |||
| 532 | taskstats_cache = kmem_cache_create("taskstats_cache", | ||
| 533 | sizeof(struct taskstats), | ||
| 534 | 0, SLAB_PANIC, NULL, NULL); | ||
| 535 | for_each_possible_cpu(i) { | ||
| 536 | INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); | ||
| 537 | init_rwsem(&(per_cpu(listener_array, i).sem)); | ||
| 538 | } | ||
| 539 | } | ||
| 540 | |||
| 541 | static int __init taskstats_init(void) | ||
| 542 | { | ||
| 543 | int rc; | ||
| 544 | |||
| 545 | rc = genl_register_family(&family); | ||
| 546 | if (rc) | ||
| 547 | return rc; | ||
| 548 | |||
| 549 | rc = genl_register_ops(&family, &taskstats_ops); | ||
| 550 | if (rc < 0) | ||
| 551 | goto err; | ||
| 552 | |||
| 553 | family_registered = 1; | ||
| 554 | return 0; | ||
| 555 | err: | ||
| 556 | genl_unregister_family(&family); | ||
| 557 | return rc; | ||
| 558 | } | ||
| 559 | |||
| 560 | /* | ||
| 561 | * late initcall ensures initialization of statistics collection | ||
| 562 | * mechanisms precedes initialization of the taskstats interface | ||
| 563 | */ | ||
| 564 | late_initcall(taskstats_init); | ||
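The interface above is driven from userspace over generic netlink. As a rough illustration only (not part of this patch), the sketch below sends a TASKSTATS_CMD_GET request carrying a TASKSTATS_CMD_ATTR_PID attribute. It assumes the TASKSTATS generic netlink family id has already been resolved with a CTRL_CMD_GETFAMILY lookup of TASKSTATS_GENL_NAME; the placeholder family id, the function name and the shortcuts in error handling are invented for the example.

/*
 * Illustrative sketch only -- not part of this patch.  Sends
 * TASKSTATS_CMD_GET with a TASKSTATS_CMD_ATTR_PID attribute over an
 * AF_NETLINK/NETLINK_GENERIC socket.  'family_id' must be the id a
 * CTRL_CMD_GETFAMILY lookup of TASKSTATS_GENL_NAME returned.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>

struct query_msg {
	struct nlmsghdr n;
	struct genlmsghdr g;
	char buf[64];
};

static int query_pid_stats(int sd, __u16 family_id, __u32 pid)
{
	struct query_msg msg;
	struct nlattr *na;
	struct sockaddr_nl dst;

	memset(&msg, 0, sizeof(msg));
	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
	msg.n.nlmsg_type = family_id;
	msg.n.nlmsg_flags = NLM_F_REQUEST;
	msg.n.nlmsg_pid = getpid();
	msg.g.cmd = TASKSTATS_CMD_GET;
	msg.g.version = TASKSTATS_GENL_VERSION;

	/* one attribute: the pid whose taskstats we want */
	na = (struct nlattr *)((char *)NLMSG_DATA(&msg.n) + GENL_HDRLEN);
	na->nla_type = TASKSTATS_CMD_ATTR_PID;
	na->nla_len = sizeof(struct nlattr) + sizeof(pid);
	memcpy((char *)na + sizeof(struct nlattr), &pid, sizeof(pid));
	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);

	memset(&dst, 0, sizeof(dst));
	dst.nl_family = AF_NETLINK;	/* nl_pid == 0: unicast to the kernel */

	return sendto(sd, &msg, msg.n.nlmsg_len, 0,
		      (struct sockaddr *)&dst, sizeof(dst));
}

int main(void)
{
	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
	int sd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);

	if (sd < 0 || bind(sd, (struct sockaddr *)&local, sizeof(local)) < 0)
		return 1;
	/* 0x1234 stands in for the resolved TASKSTATS family id */
	return query_pid_stats(sd, 0x1234, getpid()) < 0;
}

A real client would follow this with recv() and walk the nested TASKSTATS_TYPE_AGGR_PID attribute in the reply to reach the struct taskstats payload.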
diff --git a/kernel/time.c b/kernel/time.c index b00ddc71cedb..5bd489747643 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
| @@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
| 523 | 523 | ||
| 524 | 524 | ||
| 525 | #else | 525 | #else |
| 526 | #ifndef CONFIG_GENERIC_TIME | ||
| 526 | /* | 527 | /* |
| 527 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval | 528 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval |
| 528 | * and therefore only yields usec accuracy | 529 | * and therefore only yields usec accuracy |
| @@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv) | |||
| 537 | } | 538 | } |
| 538 | EXPORT_SYMBOL_GPL(getnstimeofday); | 539 | EXPORT_SYMBOL_GPL(getnstimeofday); |
| 539 | #endif | 540 | #endif |
| 541 | #endif | ||
| 540 | 542 | ||
| 541 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 543 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
| 542 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 544 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 000000000000..e1dfd8e86cce --- /dev/null +++ b/kernel/time/Makefile | |||
| @@ -0,0 +1 @@ | |||
| obj-y += clocksource.o jiffies.o | |||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 000000000000..74eca5939bd9 --- /dev/null +++ b/kernel/time/clocksource.c | |||
| @@ -0,0 +1,349 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/time/clocksource.c | ||
| 3 | * | ||
| 4 | * This file contains the functions which manage clocksource drivers. | ||
| 5 | * | ||
| 6 | * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) | ||
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or modify | ||
| 9 | * it under the terms of the GNU General Public License as published by | ||
| 10 | * the Free Software Foundation; either version 2 of the License, or | ||
| 11 | * (at your option) any later version. | ||
| 12 | * | ||
| 13 | * This program is distributed in the hope that it will be useful, | ||
| 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 16 | * GNU General Public License for more details. | ||
| 17 | * | ||
| 18 | * You should have received a copy of the GNU General Public License | ||
| 19 | * along with this program; if not, write to the Free Software | ||
| 20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 21 | * | ||
| 22 | * TODO WishList: | ||
| 23 | * o Allow clocksource drivers to be unregistered | ||
| 24 | * o get rid of clocksource_jiffies extern | ||
| 25 | */ | ||
| 26 | |||
| 27 | #include <linux/clocksource.h> | ||
| 28 | #include <linux/sysdev.h> | ||
| 29 | #include <linux/init.h> | ||
| 30 | #include <linux/module.h> | ||
| 31 | |||
| 32 | /* XXX - Would like a better way for initializing curr_clocksource */ | ||
| 33 | extern struct clocksource clocksource_jiffies; | ||
| 34 | |||
| 35 | /*[Clocksource internal variables]--------- | ||
| 36 | * curr_clocksource: | ||
| 37 | * currently selected clocksource. Initialized to clocksource_jiffies. | ||
| 38 | * next_clocksource: | ||
| 39 | * pending next selected clocksource. | ||
| 40 | * clocksource_list: | ||
| 41 | * linked list with the registered clocksources | ||
| 42 | * clocksource_lock: | ||
| 43 | * protects manipulations to curr_clocksource and next_clocksource | ||
| 44 | * and the clocksource_list | ||
| 45 | * override_name: | ||
| 46 | * Name of the user-specified clocksource. | ||
| 47 | */ | ||
| 48 | static struct clocksource *curr_clocksource = &clocksource_jiffies; | ||
| 49 | static struct clocksource *next_clocksource; | ||
| 50 | static LIST_HEAD(clocksource_list); | ||
| 51 | static DEFINE_SPINLOCK(clocksource_lock); | ||
| 52 | static char override_name[32]; | ||
| 53 | static int finished_booting; | ||
| 54 | |||
| 55 | /* clocksource_done_booting - Called near the end of bootup | ||
| 56 | * | ||
| 57 | * Hack to avoid lots of clocksource churn at boot time | ||
| 58 | */ | ||
| 59 | static int __init clocksource_done_booting(void) | ||
| 60 | { | ||
| 61 | finished_booting = 1; | ||
| 62 | return 0; | ||
| 63 | } | ||
| 64 | |||
| 65 | late_initcall(clocksource_done_booting); | ||
| 66 | |||
| 67 | /** | ||
| 68 | * clocksource_get_next - Returns the selected clocksource | ||
| 69 | * | ||
| 70 | */ | ||
| 71 | struct clocksource *clocksource_get_next(void) | ||
| 72 | { | ||
| 73 | unsigned long flags; | ||
| 74 | |||
| 75 | spin_lock_irqsave(&clocksource_lock, flags); | ||
| 76 | if (next_clocksource && finished_booting) { | ||
| 77 | curr_clocksource = next_clocksource; | ||
| 78 | next_clocksource = NULL; | ||
| 79 | } | ||
| 80 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
| 81 | |||
| 82 | return curr_clocksource; | ||
| 83 | } | ||
| 84 | |||
| 85 | /** | ||
| 86 | * select_clocksource - Finds the best registered clocksource. | ||
| 87 | * | ||
| 88 | * Private function. Must hold clocksource_lock when called. | ||
| 89 | * | ||
| 90 | * Looks through the list of registered clocksources, returning | ||
| 91 | * the one with the highest rating value. If there is a clocksource | ||
| 92 | * name that matches the override string, it returns that clocksource. | ||
| 93 | */ | ||
| 94 | static struct clocksource *select_clocksource(void) | ||
| 95 | { | ||
| 96 | struct clocksource *best = NULL; | ||
| 97 | struct list_head *tmp; | ||
| 98 | |||
| 99 | list_for_each(tmp, &clocksource_list) { | ||
| 100 | struct clocksource *src; | ||
| 101 | |||
| 102 | src = list_entry(tmp, struct clocksource, list); | ||
| 103 | if (!best) | ||
| 104 | best = src; | ||
| 105 | |||
| 106 | /* check for override: */ | ||
| 107 | if (strlen(src->name) == strlen(override_name) && | ||
| 108 | !strcmp(src->name, override_name)) { | ||
| 109 | best = src; | ||
| 110 | break; | ||
| 111 | } | ||
| 112 | /* pick the highest rating: */ | ||
| 113 | if (src->rating > best->rating) | ||
| 114 | best = src; | ||
| 115 | } | ||
| 116 | |||
| 117 | return best; | ||
| 118 | } | ||
| 119 | |||
| 120 | /** | ||
| 121 | * is_registered_source - Checks if clocksource is registered | ||
| 122 | * @c: pointer to a clocksource | ||
| 123 | * | ||
| 124 | * Private helper function. Must hold clocksource_lock when called. | ||
| 125 | * | ||
| 126 | * Returns one if the clocksource is already registered, zero otherwise. | ||
| 127 | */ | ||
| 128 | static int is_registered_source(struct clocksource *c) | ||
| 129 | { | ||
| 130 | int len = strlen(c->name); | ||
| 131 | struct list_head *tmp; | ||
| 132 | |||
| 133 | list_for_each(tmp, &clocksource_list) { | ||
| 134 | struct clocksource *src; | ||
| 135 | |||
| 136 | src = list_entry(tmp, struct clocksource, list); | ||
| 137 | if (strlen(src->name) == len && !strcmp(src->name, c->name)) | ||
| 138 | return 1; | ||
| 139 | } | ||
| 140 | |||
| 141 | return 0; | ||
| 142 | } | ||
| 143 | |||
| 144 | /** | ||
| 145 | * clocksource_register - Used to install new clocksources | ||
| 146 | * @t: clocksource to be registered | ||
| 147 | * | ||
| 148 | * Returns -EBUSY if registration fails, zero otherwise. | ||
| 149 | */ | ||
| 150 | int clocksource_register(struct clocksource *c) | ||
| 151 | { | ||
| 152 | int ret = 0; | ||
| 153 | unsigned long flags; | ||
| 154 | |||
| 155 | spin_lock_irqsave(&clocksource_lock, flags); | ||
| 156 | /* check if clocksource is already registered */ | ||
| 157 | if (is_registered_source(c)) { | ||
| 158 | printk("register_clocksource: Cannot register %s. " | ||
| 159 | "Already registered!", c->name); | ||
| 160 | ret = -EBUSY; | ||
| 161 | } else { | ||
| 162 | /* register it */ | ||
| 163 | list_add(&c->list, &clocksource_list); | ||
| 164 | /* scan the registered clocksources, and pick the best one */ | ||
| 165 | next_clocksource = select_clocksource(); | ||
| 166 | } | ||
| 167 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
| 168 | return ret; | ||
| 169 | } | ||
| 170 | EXPORT_SYMBOL(clocksource_register); | ||
| 171 | |||
| 172 | /** | ||
| 173 | * clocksource_reselect - Rescan list for next clocksource | ||
| 174 | * | ||
| 175 | * A quick helper function to be used if a clocksource changes its | ||
| 176 | * rating. Forces the clocksource list to be re-scanned for the best | ||
| 177 | * clocksource. | ||
| 178 | */ | ||
| 179 | void clocksource_reselect(void) | ||
| 180 | { | ||
| 181 | unsigned long flags; | ||
| 182 | |||
| 183 | spin_lock_irqsave(&clocksource_lock, flags); | ||
| 184 | next_clocksource = select_clocksource(); | ||
| 185 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
| 186 | } | ||
| 187 | EXPORT_SYMBOL(clocksource_reselect); | ||
| 188 | |||
| 189 | /** | ||
| 190 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | ||
| 191 | * @dev: unused | ||
| 192 | * @buf: char buffer to be filled with the name of the current clocksource | ||
| 193 | * | ||
| 194 | * Provides the sysfs interface for showing the current clocksource. | ||
| 195 | */ | ||
| 196 | static ssize_t | ||
| 197 | sysfs_show_current_clocksources(struct sys_device *dev, char *buf) | ||
| 198 | { | ||
| 199 | char *curr = buf; | ||
| 200 | |||
| 201 | spin_lock_irq(&clocksource_lock); | ||
| 202 | curr += sprintf(curr, "%s ", curr_clocksource->name); | ||
| 203 | spin_unlock_irq(&clocksource_lock); | ||
| 204 | |||
| 205 | curr += sprintf(curr, "\n"); | ||
| 206 | |||
| 207 | return curr - buf; | ||
| 208 | } | ||
| 209 | |||
| 210 | /** | ||
| 211 | * sysfs_override_clocksource - interface for manually overriding clocksource | ||
| 212 | * @dev: unused | ||
| 213 | * @buf: name of override clocksource | ||
| 214 | * @count: length of buffer | ||
| 215 | * | ||
| 216 | * Takes input from sysfs interface for manually overriding the default | ||
| 217 | * clocksource selection. | ||
| 218 | */ | ||
| 219 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, | ||
| 220 | const char *buf, size_t count) | ||
| 221 | { | ||
| 222 | size_t ret = count; | ||
| 223 | /* strings from sysfs write are not 0 terminated! */ | ||
| 224 | if (count >= sizeof(override_name)) | ||
| 225 | return -EINVAL; | ||
| 226 | |||
| 227 | /* strip off \n: */ | ||
| 228 | if (buf[count-1] == '\n') | ||
| 229 | count--; | ||
| 230 | if (count < 1) | ||
| 231 | return -EINVAL; | ||
| 232 | |||
| 233 | spin_lock_irq(&clocksource_lock); | ||
| 234 | |||
| 235 | /* copy the name given: */ | ||
| 236 | memcpy(override_name, buf, count); | ||
| 237 | override_name[count] = 0; | ||
| 238 | |||
| 239 | /* try to select it: */ | ||
| 240 | next_clocksource = select_clocksource(); | ||
| 241 | |||
| 242 | spin_unlock_irq(&clocksource_lock); | ||
| 243 | |||
| 244 | return ret; | ||
| 245 | } | ||
| 246 | |||
| 247 | /** | ||
| 248 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource | ||
| 249 | * @dev: unused | ||
| 250 | * @buf: char buffer to be filled with clocksource list | ||
| 251 | * | ||
| 252 | * Provides sysfs interface for listing registered clocksources | ||
| 253 | */ | ||
| 254 | static ssize_t | ||
| 255 | sysfs_show_available_clocksources(struct sys_device *dev, char *buf) | ||
| 256 | { | ||
| 257 | struct list_head *tmp; | ||
| 258 | char *curr = buf; | ||
| 259 | |||
| 260 | spin_lock_irq(&clocksource_lock); | ||
| 261 | list_for_each(tmp, &clocksource_list) { | ||
| 262 | struct clocksource *src; | ||
| 263 | |||
| 264 | src = list_entry(tmp, struct clocksource, list); | ||
| 265 | curr += sprintf(curr, "%s ", src->name); | ||
| 266 | } | ||
| 267 | spin_unlock_irq(&clocksource_lock); | ||
| 268 | |||
| 269 | curr += sprintf(curr, "\n"); | ||
| 270 | |||
| 271 | return curr - buf; | ||
| 272 | } | ||
| 273 | |||
| 274 | /* | ||
| 275 | * Sysfs setup bits: | ||
| 276 | */ | ||
| 277 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, | ||
| 278 | sysfs_override_clocksource); | ||
| 279 | |||
| 280 | static SYSDEV_ATTR(available_clocksource, 0600, | ||
| 281 | sysfs_show_available_clocksources, NULL); | ||
| 282 | |||
| 283 | static struct sysdev_class clocksource_sysclass = { | ||
| 284 | set_kset_name("clocksource"), | ||
| 285 | }; | ||
| 286 | |||
| 287 | static struct sys_device device_clocksource = { | ||
| 288 | .id = 0, | ||
| 289 | .cls = &clocksource_sysclass, | ||
| 290 | }; | ||
| 291 | |||
| 292 | static int __init init_clocksource_sysfs(void) | ||
| 293 | { | ||
| 294 | int error = sysdev_class_register(&clocksource_sysclass); | ||
| 295 | |||
| 296 | if (!error) | ||
| 297 | error = sysdev_register(&device_clocksource); | ||
| 298 | if (!error) | ||
| 299 | error = sysdev_create_file( | ||
| 300 | &device_clocksource, | ||
| 301 | &attr_current_clocksource); | ||
| 302 | if (!error) | ||
| 303 | error = sysdev_create_file( | ||
| 304 | &device_clocksource, | ||
| 305 | &attr_available_clocksource); | ||
| 306 | return error; | ||
| 307 | } | ||
| 308 | |||
| 309 | device_initcall(init_clocksource_sysfs); | ||
| 310 | |||
| 311 | /** | ||
| 312 | * boot_override_clocksource - boot clock override | ||
| 313 | * @str: override name | ||
| 314 | * | ||
| 315 | * Takes a clocksource= boot argument and uses it | ||
| 316 | * as the clocksource override name. | ||
| 317 | */ | ||
| 318 | static int __init boot_override_clocksource(char* str) | ||
| 319 | { | ||
| 320 | unsigned long flags; | ||
| 321 | spin_lock_irqsave(&clocksource_lock, flags); | ||
| 322 | if (str) | ||
| 323 | strlcpy(override_name, str, sizeof(override_name)); | ||
| 324 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
| 325 | return 1; | ||
| 326 | } | ||
| 327 | |||
| 328 | __setup("clocksource=", boot_override_clocksource); | ||
| 329 | |||
| 330 | /** | ||
| 331 | * boot_override_clock - Compatibility layer for deprecated boot option | ||
| 332 | * @str: override name | ||
| 333 | * | ||
| 334 | * DEPRECATED! Takes a clock= boot argument and uses it | ||
| 335 | * as the clocksource override name | ||
| 336 | */ | ||
| 337 | static int __init boot_override_clock(char* str) | ||
| 338 | { | ||
| 339 | if (!strcmp(str, "pmtmr")) { | ||
| 340 | printk("Warning: clock=pmtmr is deprecated. " | ||
| 341 | "Use clocksource=acpi_pm.\n"); | ||
| 342 | return boot_override_clocksource("acpi_pm"); | ||
| 343 | } | ||
| 344 | printk("Warning! clock= boot option is deprecated. " | ||
| 345 | "Use clocksource=xyz\n"); | ||
| 346 | return boot_override_clocksource(str); | ||
| 347 | } | ||
| 348 | |||
| 349 | __setup("clock=", boot_override_clock); | ||
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 000000000000..126bb30c4afe --- /dev/null +++ b/kernel/time/jiffies.c | |||
| @@ -0,0 +1,73 @@ | |||
| 1 | /*********************************************************************** | ||
| 2 | * linux/kernel/time/jiffies.c | ||
| 3 | * | ||
| 4 | * This file contains the jiffies based clocksource. | ||
| 5 | * | ||
| 6 | * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) | ||
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or modify | ||
| 9 | * it under the terms of the GNU General Public License as published by | ||
| 10 | * the Free Software Foundation; either version 2 of the License, or | ||
| 11 | * (at your option) any later version. | ||
| 12 | * | ||
| 13 | * This program is distributed in the hope that it will be useful, | ||
| 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 16 | * GNU General Public License for more details. | ||
| 17 | * | ||
| 18 | * You should have received a copy of the GNU General Public License | ||
| 19 | * along with this program; if not, write to the Free Software | ||
| 20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 21 | * | ||
| 22 | ************************************************************************/ | ||
| 23 | #include <linux/clocksource.h> | ||
| 24 | #include <linux/jiffies.h> | ||
| 25 | #include <linux/init.h> | ||
| 26 | |||
| 27 | /* The Jiffies based clocksource is the lowest common | ||
| 28 | * denominator clock source which should function on | ||
| 29 | * all systems. It has the same coarse resolution as | ||
| 30 | * the timer interrupt frequency HZ and it suffers | ||
| 31 | * inaccuracies caused by missed or lost timer | ||
| 32 | * interrupts and the inability of the timer | ||
| 33 | * interrupt hardware to accurately tick at the | ||
| 34 | * requested HZ value. It is also not recommended | ||
| 35 | * for "tick-less" systems. | ||
| 36 | */ | ||
| 37 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) | ||
| 38 | |||
| 39 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier | ||
| 40 | * conversion, the .shift value could be zero. However | ||
| 41 | * this would make NTP adjustments impossible as they are | ||
| 42 | * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to | ||
| 43 | * shift both the numerator and denominator the same | ||
| 44 | * amount, and give ntp adjustments in units of 1/2^8 | ||
| 45 | * | ||
| 46 | * The value 8 is somewhat carefully chosen, as anything | ||
| 47 | * larger can result in overflows. NSEC_PER_JIFFY grows as | ||
| 48 | * HZ shrinks, so values greater than 8 overflow 32 bits when | ||
| 49 | * HZ=100. | ||
| 50 | */ | ||
| 51 | #define JIFFIES_SHIFT 8 | ||
| 52 | |||
| 53 | static cycle_t jiffies_read(void) | ||
| 54 | { | ||
| 55 | return (cycle_t) jiffies; | ||
| 56 | } | ||
| 57 | |||
| 58 | struct clocksource clocksource_jiffies = { | ||
| 59 | .name = "jiffies", | ||
| 60 | .rating = 0, /* lowest rating*/ | ||
| 61 | .read = jiffies_read, | ||
| 62 | .mask = 0xffffffff, /*32bits*/ | ||
| 63 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | ||
| 64 | .shift = JIFFIES_SHIFT, | ||
| 65 | .is_continuous = 0, /* tick based, not free running */ | ||
| 66 | }; | ||
| 67 | |||
| 68 | static int __init init_jiffies_clocksource(void) | ||
| 69 | { | ||
| 70 | return clocksource_register(&clocksource_jiffies); | ||
| 71 | } | ||
| 72 | |||
| 73 | module_init(init_jiffies_clocksource); | ||
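The mult/shift pair registered here is what the timekeeping core later feeds into its (cycles * mult) >> shift conversion (see the cyc2ns() call in the timer.c changes below). The sketch replays that arithmetic in userspace, with NSEC_PER_JIFFY approximated as NSEC_PER_SEC/HZ (the kernel derives it from ACTHZ to account for tick rounding); it shows how JIFFIES_SHIFT cancels out for this clocksource while still leaving NTP room to nudge mult in steps of 1/2^8 ns per cycle:

/* Sketch of the (cycles * mult) >> shift conversion used by the
 * timekeeping core.  NSEC_PER_JIFFY is approximated as NSEC_PER_SEC/HZ
 * here; the kernel computes it from ACTHZ. */
#include <stdio.h>
#include <stdint.h>

#define HZ		1000
#define NSEC_PER_SEC	1000000000ULL
#define JIFFIES_SHIFT	8

static uint64_t cyc2ns_demo(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

int main(void)
{
	uint32_t nsec_per_jiffy = NSEC_PER_SEC / HZ;	/* ~1,000,000 ns */
	uint32_t mult = nsec_per_jiffy << JIFFIES_SHIFT;
	uint64_t delta = 3;				/* three ticks elapsed */

	/* the shift cancels the scaling, leaving delta * nsec_per_jiffy */
	printf("%llu ns\n",
	       (unsigned long long)cyc2ns_demo(delta, mult, JIFFIES_SHIFT));
	return 0;
}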
diff --git a/kernel/timer.c b/kernel/timer.c index f35b3939e937..4f55622b0d38 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t; | |||
| 84 | 84 | ||
| 85 | tvec_base_t boot_tvec_bases; | 85 | tvec_base_t boot_tvec_bases; |
| 86 | EXPORT_SYMBOL(boot_tvec_bases); | 86 | EXPORT_SYMBOL(boot_tvec_bases); |
| 87 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; | 87 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; |
| 88 | 88 | ||
| 89 | static inline void set_running_timer(tvec_base_t *base, | 89 | static inline void set_running_timer(tvec_base_t *base, |
| 90 | struct timer_list *timer) | 90 | struct timer_list *timer) |
| @@ -136,7 +136,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | |||
| 136 | list_add_tail(&timer->entry, vec); | 136 | list_add_tail(&timer->entry, vec); |
| 137 | } | 137 | } |
| 138 | 138 | ||
| 139 | /*** | 139 | /** |
| 140 | * init_timer - initialize a timer. | 140 | * init_timer - initialize a timer. |
| 141 | * @timer: the timer to be initialized | 141 | * @timer: the timer to be initialized |
| 142 | * | 142 | * |
| @@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | |||
| 146 | void fastcall init_timer(struct timer_list *timer) | 146 | void fastcall init_timer(struct timer_list *timer) |
| 147 | { | 147 | { |
| 148 | timer->entry.next = NULL; | 148 | timer->entry.next = NULL; |
| 149 | timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); | 149 | timer->base = __raw_get_cpu_var(tvec_bases); |
| 150 | } | 150 | } |
| 151 | EXPORT_SYMBOL(init_timer); | 151 | EXPORT_SYMBOL(init_timer); |
| 152 | 152 | ||
| @@ -175,6 +175,7 @@ static inline void detach_timer(struct timer_list *timer, | |||
| 175 | */ | 175 | */ |
| 176 | static tvec_base_t *lock_timer_base(struct timer_list *timer, | 176 | static tvec_base_t *lock_timer_base(struct timer_list *timer, |
| 177 | unsigned long *flags) | 177 | unsigned long *flags) |
| 178 | __acquires(timer->base->lock) | ||
| 178 | { | 179 | { |
| 179 | tvec_base_t *base; | 180 | tvec_base_t *base; |
| 180 | 181 | ||
| @@ -235,7 +236,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
| 235 | 236 | ||
| 236 | EXPORT_SYMBOL(__mod_timer); | 237 | EXPORT_SYMBOL(__mod_timer); |
| 237 | 238 | ||
| 238 | /*** | 239 | /** |
| 239 | * add_timer_on - start a timer on a particular CPU | 240 | * add_timer_on - start a timer on a particular CPU |
| 240 | * @timer: the timer to be added | 241 | * @timer: the timer to be added |
| 241 | * @cpu: the CPU to start it on | 242 | * @cpu: the CPU to start it on |
| @@ -255,9 +256,10 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
| 255 | } | 256 | } |
| 256 | 257 | ||
| 257 | 258 | ||
| 258 | /*** | 259 | /** |
| 259 | * mod_timer - modify a timer's timeout | 260 | * mod_timer - modify a timer's timeout |
| 260 | * @timer: the timer to be modified | 261 | * @timer: the timer to be modified |
| 262 | * @expires: new timeout in jiffies | ||
| 261 | * | 263 | * |
| 262 | * mod_timer is a more efficient way to update the expire field of an | 264 | * mod_timer is a more efficient way to update the expire field of an |
| 263 | * active timer (if the timer is inactive it will be activated) | 265 | * active timer (if the timer is inactive it will be activated) |
| @@ -291,7 +293,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires) | |||
| 291 | 293 | ||
| 292 | EXPORT_SYMBOL(mod_timer); | 294 | EXPORT_SYMBOL(mod_timer); |
| 293 | 295 | ||
| 294 | /*** | 296 | /** |
| 295 | * del_timer - deactivate a timer. | 297 | * del_timer - deactivate a timer. |
| 296 | * @timer: the timer to be deactivated | 298 | * @timer: the timer to be deactivated |
| 297 | * | 299 | * |
| @@ -323,7 +325,10 @@ int del_timer(struct timer_list *timer) | |||
| 323 | EXPORT_SYMBOL(del_timer); | 325 | EXPORT_SYMBOL(del_timer); |
| 324 | 326 | ||
| 325 | #ifdef CONFIG_SMP | 327 | #ifdef CONFIG_SMP |
| 326 | /* | 328 | /** |
| 329 | * try_to_del_timer_sync - Try to deactivate a timer | ||
| 330 | * @timer: the timer to deactivate | ||
| 331 | * | ||
| 327 | * This function tries to deactivate a timer. Upon successful (ret >= 0) | 332 | * This function tries to deactivate a timer. Upon successful (ret >= 0) |
| 328 | * exit the timer is not queued and the handler is not running on any CPU. | 333 | * exit the timer is not queued and the handler is not running on any CPU. |
| 329 | * | 334 | * |
| @@ -351,7 +356,7 @@ out: | |||
| 351 | return ret; | 356 | return ret; |
| 352 | } | 357 | } |
| 353 | 358 | ||
| 354 | /*** | 359 | /** |
| 355 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 360 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
| 356 | * @timer: the timer to be deactivated | 361 | * @timer: the timer to be deactivated |
| 357 | * | 362 | * |
| @@ -374,6 +379,7 @@ int del_timer_sync(struct timer_list *timer) | |||
| 374 | int ret = try_to_del_timer_sync(timer); | 379 | int ret = try_to_del_timer_sync(timer); |
| 375 | if (ret >= 0) | 380 | if (ret >= 0) |
| 376 | return ret; | 381 | return ret; |
| 382 | cpu_relax(); | ||
| 377 | } | 383 | } |
| 378 | } | 384 | } |
| 379 | 385 | ||
| @@ -400,15 +406,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) | |||
| 400 | return index; | 406 | return index; |
| 401 | } | 407 | } |
| 402 | 408 | ||
| 403 | /*** | 409 | #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) |
| 410 | |||
| 411 | /** | ||
| 404 | * __run_timers - run all expired timers (if any) on this CPU. | 412 | * __run_timers - run all expired timers (if any) on this CPU. |
| 405 | * @base: the timer vector to be processed. | 413 | * @base: the timer vector to be processed. |
| 406 | * | 414 | * |
| 407 | * This function cascades all vectors and executes all expired timer | 415 | * This function cascades all vectors and executes all expired timer |
| 408 | * vectors. | 416 | * vectors. |
| 409 | */ | 417 | */ |
| 410 | #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK | ||
| 411 | |||
| 412 | static inline void __run_timers(tvec_base_t *base) | 418 | static inline void __run_timers(tvec_base_t *base) |
| 413 | { | 419 | { |
| 414 | struct timer_list *timer; | 420 | struct timer_list *timer; |
| @@ -597,7 +603,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ | |||
| 597 | long time_precision = 1; /* clock precision (us) */ | 603 | long time_precision = 1; /* clock precision (us) */ |
| 598 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ | 604 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ |
| 599 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ | 605 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ |
| 600 | static long time_phase; /* phase offset (scaled us) */ | ||
| 601 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; | 606 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; |
| 602 | /* frequency offset (scaled ppm)*/ | 607 | /* frequency offset (scaled ppm)*/ |
| 603 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ | 608 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ |
| @@ -747,27 +752,14 @@ static long adjtime_adjustment(void) | |||
| 747 | } | 752 | } |
| 748 | 753 | ||
| 749 | /* in the NTP reference this is called "hardclock()" */ | 754 | /* in the NTP reference this is called "hardclock()" */ |
| 750 | static void update_wall_time_one_tick(void) | 755 | static void update_ntp_one_tick(void) |
| 751 | { | 756 | { |
| 752 | long time_adjust_step, delta_nsec; | 757 | long time_adjust_step; |
| 753 | 758 | ||
| 754 | time_adjust_step = adjtime_adjustment(); | 759 | time_adjust_step = adjtime_adjustment(); |
| 755 | if (time_adjust_step) | 760 | if (time_adjust_step) |
| 756 | /* Reduce by this step the amount of time left */ | 761 | /* Reduce by this step the amount of time left */ |
| 757 | time_adjust -= time_adjust_step; | 762 | time_adjust -= time_adjust_step; |
| 758 | delta_nsec = tick_nsec + time_adjust_step * 1000; | ||
| 759 | /* | ||
| 760 | * Advance the phase, once it gets to one microsecond, then | ||
| 761 | * advance the tick more. | ||
| 762 | */ | ||
| 763 | time_phase += time_adj; | ||
| 764 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { | ||
| 765 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); | ||
| 766 | time_phase -= ltemp << (SHIFT_SCALE - 10); | ||
| 767 | delta_nsec += ltemp; | ||
| 768 | } | ||
| 769 | xtime.tv_nsec += delta_nsec; | ||
| 770 | time_interpolator_update(delta_nsec); | ||
| 771 | 763 | ||
| 772 | /* Changes by adjtime() do not take effect till next tick. */ | 764 | /* Changes by adjtime() do not take effect till next tick. */ |
| 773 | if (time_next_adjust != 0) { | 765 | if (time_next_adjust != 0) { |
| @@ -780,36 +772,404 @@ static void update_wall_time_one_tick(void) | |||
| 780 | * Return how long ticks are at the moment, that is, how much time | 772 | * Return how long ticks are at the moment, that is, how much time |
| 781 | * update_wall_time_one_tick will add to xtime next time we call it | 773 | * update_wall_time_one_tick will add to xtime next time we call it |
| 782 | * (assuming no calls to do_adjtimex in the meantime). | 774 | * (assuming no calls to do_adjtimex in the meantime). |
| 783 | * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 | 775 | * The return value is in fixed-point nanoseconds with |
| 784 | * bits to the right of the binary point. | 776 | * TICK_LENGTH_SHIFT bits to the right of the binary point. |
| 785 | * This function has no side-effects. | 777 | * This function has no side-effects. |
| 786 | */ | 778 | */ |
| 787 | u64 current_tick_length(void) | 779 | u64 current_tick_length(void) |
| 788 | { | 780 | { |
| 789 | long delta_nsec; | 781 | long delta_nsec; |
| 782 | u64 ret; | ||
| 790 | 783 | ||
| 784 | /* calculate the finest interval NTP will allow. | ||
| 785 | * ie: nanosecond value shifted by (SHIFT_SCALE - 10) | ||
| 786 | */ | ||
| 791 | delta_nsec = tick_nsec + adjtime_adjustment() * 1000; | 787 | delta_nsec = tick_nsec + adjtime_adjustment() * 1000; |
| 792 | return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; | 788 | ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; |
| 789 | ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); | ||
| 790 | |||
| 791 | return ret; | ||
| 793 | } | 792 | } |
| 794 | 793 | ||
| 795 | /* | 794 | /* XXX - all of this timekeeping code should be later moved to time.c */ |
| 796 | * Using a loop looks inefficient, but "ticks" is | 795 | #include <linux/clocksource.h> |
| 797 | * usually just one (we shouldn't be losing ticks, | 796 | static struct clocksource *clock; /* pointer to current clocksource */ |
| 798 | * we're doing this this way mainly for interrupt | 797 | |
| 799 | * latency reasons, not because we think we'll | 798 | #ifdef CONFIG_GENERIC_TIME |
| 800 | * have lots of lost timer ticks | 799 | /** |
| 800 | * __get_nsec_offset - Returns nanoseconds since last call to update_wall_time | ||
| 801 | * | ||
| 802 | * private function, must hold xtime_lock lock when being | ||
| 803 | * called. Returns the number of nanoseconds since the | ||
| 804 | * last call to update_wall_time() (adjusted by NTP scaling) | ||
| 805 | */ | ||
| 806 | static inline s64 __get_nsec_offset(void) | ||
| 807 | { | ||
| 808 | cycle_t cycle_now, cycle_delta; | ||
| 809 | s64 ns_offset; | ||
| 810 | |||
| 811 | /* read clocksource: */ | ||
| 812 | cycle_now = clocksource_read(clock); | ||
| 813 | |||
| 814 | /* calculate the delta since the last update_wall_time: */ | ||
| 815 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | ||
| 816 | |||
| 817 | /* convert to nanoseconds: */ | ||
| 818 | ns_offset = cyc2ns(clock, cycle_delta); | ||
| 819 | |||
| 820 | return ns_offset; | ||
| 821 | } | ||
| 822 | |||
| 823 | /** | ||
| 824 | * __get_realtime_clock_ts - Returns the time of day in a timespec | ||
| 825 | * @ts: pointer to the timespec to be set | ||
| 826 | * | ||
| 827 | * Returns the time of day in a timespec. Used by | ||
| 828 | * do_gettimeofday() and get_realtime_clock_ts(). | ||
| 829 | */ | ||
| 830 | static inline void __get_realtime_clock_ts(struct timespec *ts) | ||
| 831 | { | ||
| 832 | unsigned long seq; | ||
| 833 | s64 nsecs; | ||
| 834 | |||
| 835 | do { | ||
| 836 | seq = read_seqbegin(&xtime_lock); | ||
| 837 | |||
| 838 | *ts = xtime; | ||
| 839 | nsecs = __get_nsec_offset(); | ||
| 840 | |||
| 841 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 842 | |||
| 843 | timespec_add_ns(ts, nsecs); | ||
| 844 | } | ||
| 845 | |||
| 846 | /** | ||
| 847 | * getnstimeofday - Returns the time of day in a timespec | ||
| 848 | * @ts: pointer to the timespec to be set | ||
| 849 | * | ||
| 850 | * Returns the time of day in a timespec. | ||
| 851 | */ | ||
| 852 | void getnstimeofday(struct timespec *ts) | ||
| 853 | { | ||
| 854 | __get_realtime_clock_ts(ts); | ||
| 855 | } | ||
| 856 | |||
| 857 | EXPORT_SYMBOL(getnstimeofday); | ||
| 858 | |||
| 859 | /** | ||
| 860 | * do_gettimeofday - Returns the time of day in a timeval | ||
| 861 | * @tv: pointer to the timeval to be set | ||
| 862 | * | ||
| 863 | * NOTE: Users should be converted to using getnstimeofday() | ||
| 801 | */ | 864 | */ |
| 802 | static void update_wall_time(unsigned long ticks) | 865 | void do_gettimeofday(struct timeval *tv) |
| 803 | { | 866 | { |
| 867 | struct timespec now; | ||
| 868 | |||
| 869 | __get_realtime_clock_ts(&now); | ||
| 870 | tv->tv_sec = now.tv_sec; | ||
| 871 | tv->tv_usec = now.tv_nsec/1000; | ||
| 872 | } | ||
| 873 | |||
| 874 | EXPORT_SYMBOL(do_gettimeofday); | ||
| 875 | /** | ||
| 876 | * do_settimeofday - Sets the time of day | ||
| 877 | * @tv: pointer to the timespec variable containing the new time | ||
| 878 | * | ||
| 879 | * Sets the time of day to the new time, updates NTP and notifies hrtimers | ||
| 880 | */ | ||
| 881 | int do_settimeofday(struct timespec *tv) | ||
| 882 | { | ||
| 883 | unsigned long flags; | ||
| 884 | time_t wtm_sec, sec = tv->tv_sec; | ||
| 885 | long wtm_nsec, nsec = tv->tv_nsec; | ||
| 886 | |||
| 887 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | ||
| 888 | return -EINVAL; | ||
| 889 | |||
| 890 | write_seqlock_irqsave(&xtime_lock, flags); | ||
| 891 | |||
| 892 | nsec -= __get_nsec_offset(); | ||
| 893 | |||
| 894 | wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); | ||
| 895 | wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); | ||
| 896 | |||
| 897 | set_normalized_timespec(&xtime, sec, nsec); | ||
| 898 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | ||
| 899 | |||
| 900 | clock->error = 0; | ||
| 901 | ntp_clear(); | ||
| 902 | |||
| 903 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
| 904 | |||
| 905 | /* signal hrtimers about time change */ | ||
| 906 | clock_was_set(); | ||
| 907 | |||
| 908 | return 0; | ||
| 909 | } | ||
| 910 | |||
| 911 | EXPORT_SYMBOL(do_settimeofday); | ||
| 912 | |||
| 913 | /** | ||
| 914 | * change_clocksource - Swaps clocksources if a new one is available | ||
| 915 | * | ||
| 916 | * Accumulates current time interval and initializes new clocksource | ||
| 917 | */ | ||
| 918 | static int change_clocksource(void) | ||
| 919 | { | ||
| 920 | struct clocksource *new; | ||
| 921 | cycle_t now; | ||
| 922 | u64 nsec; | ||
| 923 | new = clocksource_get_next(); | ||
| 924 | if (clock != new) { | ||
| 925 | now = clocksource_read(new); | ||
| 926 | nsec = __get_nsec_offset(); | ||
| 927 | timespec_add_ns(&xtime, nsec); | ||
| 928 | |||
| 929 | clock = new; | ||
| 930 | clock->cycle_last = now; | ||
| 931 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | ||
| 932 | clock->name); | ||
| 933 | return 1; | ||
| 934 | } else if (clock->update_callback) { | ||
| 935 | return clock->update_callback(); | ||
| 936 | } | ||
| 937 | return 0; | ||
| 938 | } | ||
| 939 | #else | ||
| 940 | #define change_clocksource() (0) | ||
| 941 | #endif | ||
| 942 | |||
| 943 | /** | ||
| 944 | * timeofday_is_continuous - check to see if timekeeping is free running | ||
| 945 | */ | ||
| 946 | int timekeeping_is_continuous(void) | ||
| 947 | { | ||
| 948 | unsigned long seq; | ||
| 949 | int ret; | ||
| 950 | |||
| 804 | do { | 951 | do { |
| 805 | ticks--; | 952 | seq = read_seqbegin(&xtime_lock); |
| 806 | update_wall_time_one_tick(); | 953 | |
| 807 | if (xtime.tv_nsec >= 1000000000) { | 954 | ret = clock->is_continuous; |
| 808 | xtime.tv_nsec -= 1000000000; | 955 | |
| 956 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 957 | |||
| 958 | return ret; | ||
| 959 | } | ||
| 960 | |||
| 961 | /* | ||
| 962 | * timekeeping_init - Initializes the clocksource and common timekeeping values | ||
| 963 | */ | ||
| 964 | void __init timekeeping_init(void) | ||
| 965 | { | ||
| 966 | unsigned long flags; | ||
| 967 | |||
| 968 | write_seqlock_irqsave(&xtime_lock, flags); | ||
| 969 | clock = clocksource_get_next(); | ||
| 970 | clocksource_calculate_interval(clock, tick_nsec); | ||
| 971 | clock->cycle_last = clocksource_read(clock); | ||
| 972 | ntp_clear(); | ||
| 973 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
| 974 | } | ||
| 975 | |||
| 976 | |||
| 977 | static int timekeeping_suspended; | ||
| 978 | /** | ||
| 979 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | ||
| 980 | * @dev: unused | ||
| 981 | * | ||
| 982 | * This is for the generic clocksource timekeeping. | ||
| 983 | * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are | ||
| 984 | * still managed by arch specific suspend/resume code. | ||
| 985 | */ | ||
| 986 | static int timekeeping_resume(struct sys_device *dev) | ||
| 987 | { | ||
| 988 | unsigned long flags; | ||
| 989 | |||
| 990 | write_seqlock_irqsave(&xtime_lock, flags); | ||
| 991 | /* restart the last cycle value */ | ||
| 992 | clock->cycle_last = clocksource_read(clock); | ||
| 993 | clock->error = 0; | ||
| 994 | timekeeping_suspended = 0; | ||
| 995 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
| 996 | return 0; | ||
| 997 | } | ||
| 998 | |||
| 999 | static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | ||
| 1000 | { | ||
| 1001 | unsigned long flags; | ||
| 1002 | |||
| 1003 | write_seqlock_irqsave(&xtime_lock, flags); | ||
| 1004 | timekeeping_suspended = 1; | ||
| 1005 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
| 1006 | return 0; | ||
| 1007 | } | ||
| 1008 | |||
| 1009 | /* sysfs resume/suspend bits for timekeeping */ | ||
| 1010 | static struct sysdev_class timekeeping_sysclass = { | ||
| 1011 | .resume = timekeeping_resume, | ||
| 1012 | .suspend = timekeeping_suspend, | ||
| 1013 | set_kset_name("timekeeping"), | ||
| 1014 | }; | ||
| 1015 | |||
| 1016 | static struct sys_device device_timer = { | ||
| 1017 | .id = 0, | ||
| 1018 | .cls = &timekeeping_sysclass, | ||
| 1019 | }; | ||
| 1020 | |||
| 1021 | static int __init timekeeping_init_device(void) | ||
| 1022 | { | ||
| 1023 | int error = sysdev_class_register(&timekeeping_sysclass); | ||
| 1024 | if (!error) | ||
| 1025 | error = sysdev_register(&device_timer); | ||
| 1026 | return error; | ||
| 1027 | } | ||
| 1028 | |||
| 1029 | device_initcall(timekeeping_init_device); | ||
| 1030 | |||
| 1031 | /* | ||
| 1032 | * If the error is already larger, we look ahead even further | ||
| 1033 | * to compensate for late or lost adjustments. | ||
| 1034 | */ | ||
| 1035 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) | ||
| 1036 | { | ||
| 1037 | s64 tick_error, i; | ||
| 1038 | u32 look_ahead, adj; | ||
| 1039 | s32 error2, mult; | ||
| 1040 | |||
| 1041 | /* | ||
| 1042 | * Use the current error value to determine how much to look ahead. | ||
| 1043 | * The larger the error the slower we adjust for it to avoid problems | ||
| 1044 | * with losing too many ticks, otherwise we would overadjust and | ||
| 1045 | * produce an even larger error. The smaller the adjustment the | ||
| 1046 | * faster we try to adjust for it, as lost ticks can do less harm | ||
| 1047 | * here. This is tuned so that an error of about 1 msec is adjusted | ||
| 1048 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). | ||
| 1049 | */ | ||
| 1050 | error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); | ||
| 1051 | error2 = abs(error2); | ||
| 1052 | for (look_ahead = 0; error2 > 0; look_ahead++) | ||
| 1053 | error2 >>= 2; | ||
| 1054 | |||
| 1055 | /* | ||
| 1056 | * Now calculate the error in (1 << look_ahead) ticks, but first | ||
| 1057 | * remove the single look ahead already included in the error. | ||
| 1058 | */ | ||
| 1059 | tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
| 1060 | tick_error -= clock->xtime_interval >> 1; | ||
| 1061 | error = ((error - tick_error) >> look_ahead) + tick_error; | ||
| 1062 | |||
| 1063 | /* Finally calculate the adjustment shift value. */ | ||
| 1064 | i = *interval; | ||
| 1065 | mult = 1; | ||
| 1066 | if (error < 0) { | ||
| 1067 | error = -error; | ||
| 1068 | *interval = -*interval; | ||
| 1069 | *offset = -*offset; | ||
| 1070 | mult = -1; | ||
| 1071 | } | ||
| 1072 | for (adj = 0; error > i; adj++) | ||
| 1073 | error >>= 1; | ||
| 1074 | |||
| 1075 | *interval <<= adj; | ||
| 1076 | *offset <<= adj; | ||
| 1077 | return mult << adj; | ||
| 1078 | } | ||
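Two loops carry the logic here: the first (error2 >>= 2) is roughly a log-base-4 of the scaled error and picks the look-ahead window, the second (error >>= 1) finds the power-of-two step for the multiplier. A stand-alone sketch of those two loops with invented numbers, just to make the arithmetic concrete:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t error2 = 5000;		/* scaled error magnitude (made up) */
	unsigned look_ahead = 0;

	/* roughly log4(error2): each shift divides the remaining error by 4 */
	for (; error2 > 0; look_ahead++)
		error2 >>= 2;

	int64_t error = 9000, interval = 1000;	/* also made up */
	unsigned adj = 0;

	/* smallest power-of-two scaling that brings error below interval */
	for (; error > interval; adj++)
		error >>= 1;

	printf("look_ahead = %u, adj = %u (multiplier step = %u)\n",
	       look_ahead, adj, 1u << adj);
	return 0;
}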
| 1079 | |||
| 1080 | /* | ||
| 1081 | * Adjust the multiplier to reduce the error value, | ||
| 1082 | * this is optimized for the most common adjustments of -1,0,1, | ||
| 1083 | * for other values we can do a bit more work. | ||
| 1084 | */ | ||
| 1085 | static void clocksource_adjust(struct clocksource *clock, s64 offset) | ||
| 1086 | { | ||
| 1087 | s64 error, interval = clock->cycle_interval; | ||
| 1088 | int adj; | ||
| 1089 | |||
| 1090 | error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); | ||
| 1091 | if (error > interval) { | ||
| 1092 | error >>= 2; | ||
| 1093 | if (likely(error <= interval)) | ||
| 1094 | adj = 1; | ||
| 1095 | else | ||
| 1096 | adj = clocksource_bigadjust(error, &interval, &offset); | ||
| 1097 | } else if (error < -interval) { | ||
| 1098 | error >>= 2; | ||
| 1099 | if (likely(error >= -interval)) { | ||
| 1100 | adj = -1; | ||
| 1101 | interval = -interval; | ||
| 1102 | offset = -offset; | ||
| 1103 | } else | ||
| 1104 | adj = clocksource_bigadjust(error, &interval, &offset); | ||
| 1105 | } else | ||
| 1106 | return; | ||
| 1107 | |||
| 1108 | clock->mult += adj; | ||
| 1109 | clock->xtime_interval += interval; | ||
| 1110 | clock->xtime_nsec -= offset; | ||
| 1111 | clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); | ||
| 1112 | } | ||
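What the adjustment is ultimately tweaking is the clocksource's fixed-point scale: nanoseconds are derived as (cycles * mult) >> shift, so a +/-1 change to mult lengthens or shortens every accumulated interval by cycle_interval >> shift nanoseconds. A small user-space illustration of that relationship (the clocksource parameters below are invented, not from any real hardware):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned shift = 10;
	const uint32_t mult = 1024;		/* 1 ns per cycle at shift 10 */
	const uint64_t cycle_interval = 1000000; /* cycles per tick (made up) */

	uint64_t ns  = (cycle_interval * mult) >> shift;
	uint64_t ns1 = (cycle_interval * (mult + 1)) >> shift;

	printf("interval with mult     : %llu ns\n", (unsigned long long)ns);
	printf("interval with mult + 1 : %llu ns\n", (unsigned long long)ns1);
	printf("per-tick change        : %llu ns (cycle_interval >> shift = %llu)\n",
	       (unsigned long long)(ns1 - ns),
	       (unsigned long long)(cycle_interval >> shift));
	return 0;
}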
| 1113 | |||
| 1114 | /** | ||
| 1115 | * update_wall_time - Uses the current clocksource to increment the wall time | ||
| 1116 | * | ||
| 1117 | * Called from the timer interrupt, must hold a write on xtime_lock. | ||
| 1118 | */ | ||
| 1119 | static void update_wall_time(void) | ||
| 1120 | { | ||
| 1121 | cycle_t offset; | ||
| 1122 | |||
| 1123 | /* Make sure we're fully resumed: */ | ||
| 1124 | if (unlikely(timekeeping_suspended)) | ||
| 1125 | return; | ||
| 1126 | |||
| 1127 | #ifdef CONFIG_GENERIC_TIME | ||
| 1128 | offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; | ||
| 1129 | #else | ||
| 1130 | offset = clock->cycle_interval; | ||
| 1131 | #endif | ||
| 1132 | clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; | ||
| 1133 | |||
| 1134 | /* normally this loop will run just once; however, in the | ||
| 1135 | * case of lost or late ticks, it will accumulate correctly. | ||
| 1136 | */ | ||
| 1137 | while (offset >= clock->cycle_interval) { | ||
| 1138 | /* accumulate one interval */ | ||
| 1139 | clock->xtime_nsec += clock->xtime_interval; | ||
| 1140 | clock->cycle_last += clock->cycle_interval; | ||
| 1141 | offset -= clock->cycle_interval; | ||
| 1142 | |||
| 1143 | if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { | ||
| 1144 | clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; | ||
| 809 | xtime.tv_sec++; | 1145 | xtime.tv_sec++; |
| 810 | second_overflow(); | 1146 | second_overflow(); |
| 811 | } | 1147 | } |
| 812 | } while (ticks); | 1148 | |
| 1149 | /* interpolator bits */ | ||
| 1150 | time_interpolator_update(clock->xtime_interval | ||
| 1151 | >> clock->shift); | ||
| 1152 | /* increment the NTP state machine */ | ||
| 1153 | update_ntp_one_tick(); | ||
| 1154 | |||
| 1155 | /* accumulate error between NTP and clock interval */ | ||
| 1156 | clock->error += current_tick_length(); | ||
| 1157 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); | ||
| 1158 | } | ||
| 1159 | |||
| 1160 | /* correct the clock when NTP error is too big */ | ||
| 1161 | clocksource_adjust(clock, offset); | ||
| 1162 | |||
| 1163 | /* store full nanoseconds into xtime */ | ||
| 1164 | xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; | ||
| 1165 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; | ||
| 1166 | |||
| 1167 | /* check to see if there is a new clocksource to use */ | ||
| 1168 | if (change_clocksource()) { | ||
| 1169 | clock->error = 0; | ||
| 1170 | clock->xtime_nsec = 0; | ||
| 1171 | clocksource_calculate_interval(clock, tick_nsec); | ||
| 1172 | } | ||
| 813 | } | 1173 | } |
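The accumulation loop keeps xtime_nsec in clocksource-shifted nanoseconds, peels off whole cycle intervals until less than one interval remains, and rolls seconds as needed; the leftover cycles simply wait for the next tick. A rough user-space model of that bookkeeping (all parameters invented, one cycle == one nanosecond at shift 10):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	const unsigned shift = 10;
	const uint64_t cycle_interval = 1000000;		/* cycles per tick */
	const uint64_t xtime_interval = cycle_interval << shift; /* shifted ns */

	uint64_t offset = 2000500000;	/* cycles elapsed since cycle_last */
	uint64_t xtime_nsec = 0;	/* shifted nanoseconds */
	uint64_t tv_sec = 0;

	/* peel off whole intervals; the remainder waits for the next tick */
	while (offset >= cycle_interval) {
		xtime_nsec += xtime_interval;
		offset -= cycle_interval;
		if (xtime_nsec >= (NSEC_PER_SEC << shift)) {
			xtime_nsec -= (NSEC_PER_SEC << shift);
			tv_sec++;
		}
	}
	printf("accumulated %llu.%09llu s, %llu cycles left over\n",
	       (unsigned long long)tv_sec,
	       (unsigned long long)(xtime_nsec >> shift),
	       (unsigned long long)offset);
	return 0;
}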
| 814 | 1174 | ||
| 815 | /* | 1175 | /* |
| @@ -862,10 +1222,8 @@ static inline void calc_load(unsigned long ticks) | |||
| 862 | unsigned long active_tasks; /* fixed-point */ | 1222 | unsigned long active_tasks; /* fixed-point */ |
| 863 | static int count = LOAD_FREQ; | 1223 | static int count = LOAD_FREQ; |
| 864 | 1224 | ||
| 865 | count -= ticks; | 1225 | active_tasks = count_active_tasks(); |
| 866 | if (count < 0) { | 1226 | for (count -= ticks; count < 0; count += LOAD_FREQ) { |
| 867 | count += LOAD_FREQ; | ||
| 868 | active_tasks = count_active_tasks(); | ||
| 869 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); | 1227 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); |
| 870 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); | 1228 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); |
| 871 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); | 1229 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); |
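CALC_LOAD() is a fixed-point exponential moving average over 5-second samples: the previous average is weighted by the decay constant and the current (fixed-point) active-task count by its complement. A stand-alone sketch of the arithmetic, using the 1-minute constant and FSHIFT == 11 as defined in this kernel generation's <linux/sched.h>:

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1 << FSHIFT)
#define EXP_1   1884		/* 1/exp(5sec/1min) in fixed point */

/* load = load*exp/FIXED_1 + n*(FIXED_1 - exp)/FIXED_1 */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long n)
{
	load *= exp;
	load += n * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun = 0;
	unsigned long active = 3 * FIXED_1;	/* 3 runnable tasks, fixed point */
	int i;

	/* feed the same sample for a minute's worth of 5-second intervals */
	for (i = 0; i < 12; i++) {
		avenrun = calc_load(avenrun, EXP_1, active);
		printf("sample %2d: load = %lu.%02lu\n", i + 1,
		       avenrun >> FSHIFT,
		       (avenrun & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}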
| @@ -880,7 +1238,7 @@ unsigned long wall_jiffies = INITIAL_JIFFIES; | |||
| 880 | * playing with xtime and avenrun. | 1238 | * playing with xtime and avenrun. |
| 881 | */ | 1239 | */ |
| 882 | #ifndef ARCH_HAVE_XTIME_LOCK | 1240 | #ifndef ARCH_HAVE_XTIME_LOCK |
| 883 | seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; | 1241 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); |
| 884 | 1242 | ||
| 885 | EXPORT_SYMBOL(xtime_lock); | 1243 | EXPORT_SYMBOL(xtime_lock); |
| 886 | #endif | 1244 | #endif |
| @@ -910,15 +1268,10 @@ void run_local_timers(void) | |||
| 910 | * Called by the timer interrupt. xtime_lock must already be taken | 1268 | * Called by the timer interrupt. xtime_lock must already be taken |
| 911 | * by the timer IRQ! | 1269 | * by the timer IRQ! |
| 912 | */ | 1270 | */ |
| 913 | static inline void update_times(void) | 1271 | static inline void update_times(unsigned long ticks) |
| 914 | { | 1272 | { |
| 915 | unsigned long ticks; | 1273 | wall_jiffies += ticks; |
| 916 | 1274 | update_wall_time(); | |
| 917 | ticks = jiffies - wall_jiffies; | ||
| 918 | if (ticks) { | ||
| 919 | wall_jiffies += ticks; | ||
| 920 | update_wall_time(ticks); | ||
| 921 | } | ||
| 922 | calc_load(ticks); | 1275 | calc_load(ticks); |
| 923 | } | 1276 | } |
| 924 | 1277 | ||
| @@ -928,12 +1281,10 @@ static inline void update_times(void) | |||
| 928 | * jiffies is defined in the linker script... | 1281 | * jiffies is defined in the linker script... |
| 929 | */ | 1282 | */ |
| 930 | 1283 | ||
| 931 | void do_timer(struct pt_regs *regs) | 1284 | void do_timer(unsigned long ticks) |
| 932 | { | 1285 | { |
| 933 | jiffies_64++; | 1286 | jiffies_64 += ticks; |
| 934 | /* prevent loading jiffies before storing new jiffies_64 value. */ | 1287 | update_times(ticks); |
| 935 | barrier(); | ||
| 936 | update_times(); | ||
| 937 | } | 1288 | } |
| 938 | 1289 | ||
| 939 | #ifdef __ARCH_WANT_SYS_ALARM | 1290 | #ifdef __ARCH_WANT_SYS_ALARM |
| @@ -971,46 +1322,19 @@ asmlinkage long sys_getpid(void) | |||
| 971 | } | 1322 | } |
| 972 | 1323 | ||
| 973 | /* | 1324 | /* |
| 974 | * Accessing ->group_leader->real_parent is not SMP-safe, it could | 1325 | * Accessing ->real_parent is not SMP-safe, it could |
| 975 | * change from under us. However, rather than getting any lock | 1326 | * change from under us. However, we can use a stale |
| 976 | * we can use an optimistic algorithm: get the parent | 1327 | * value of ->real_parent under rcu_read_lock(), see |
| 977 | * pid, and go back and check that the parent is still | 1328 | * release_task()->call_rcu(delayed_put_task_struct). |
| 978 | * the same. If it has changed (which is extremely unlikely | ||
| 979 | * indeed), we just try again.. | ||
| 980 | * | ||
| 981 | * NOTE! This depends on the fact that even if we _do_ | ||
| 982 | * get an old value of "parent", we can happily dereference | ||
| 983 | * the pointer (it was and remains a dereferencable kernel pointer | ||
| 984 | * no matter what): we just can't necessarily trust the result | ||
| 985 | * until we know that the parent pointer is valid. | ||
| 986 | * | ||
| 987 | * NOTE2: ->group_leader never changes from under us. | ||
| 988 | */ | 1329 | */ |
| 989 | asmlinkage long sys_getppid(void) | 1330 | asmlinkage long sys_getppid(void) |
| 990 | { | 1331 | { |
| 991 | int pid; | 1332 | int pid; |
| 992 | struct task_struct *me = current; | ||
| 993 | struct task_struct *parent; | ||
| 994 | 1333 | ||
| 995 | parent = me->group_leader->real_parent; | 1334 | rcu_read_lock(); |
| 996 | for (;;) { | 1335 | pid = rcu_dereference(current->real_parent)->tgid; |
| 997 | pid = parent->tgid; | 1336 | rcu_read_unlock(); |
| 998 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) | ||
| 999 | { | ||
| 1000 | struct task_struct *old = parent; | ||
| 1001 | 1337 | ||
| 1002 | /* | ||
| 1003 | * Make sure we read the pid before re-reading the | ||
| 1004 | * parent pointer: | ||
| 1005 | */ | ||
| 1006 | smp_rmb(); | ||
| 1007 | parent = me->group_leader->real_parent; | ||
| 1008 | if (old != parent) | ||
| 1009 | continue; | ||
| 1010 | } | ||
| 1011 | #endif | ||
| 1012 | break; | ||
| 1013 | } | ||
| 1014 | return pid; | 1338 | return pid; |
| 1015 | } | 1339 | } |
| 1016 | 1340 | ||
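The rewritten sys_getppid() leans on RCU: the parent pointer it reads may be stale, but rcu_read_lock() guarantees the object it points to is not freed until the read side finishes. A much-simplified user-space analogue of the publish/consume half of that pattern is below (C11 atomics; real RCU additionally defers the free until a grace period has elapsed, which this sketch omits entirely):

#include <stdatomic.h>
#include <stdio.h>

struct task {
	int tgid;
};

/* pointer published by the "parent changed" path */
static _Atomic(struct task *) real_parent;

static void publish(struct task *new_parent)
{
	atomic_store_explicit(&real_parent, new_parent, memory_order_release);
}

static int read_parent_tgid(void)
{
	/* acquire pairs with the release store in publish(); the value may be
	 * stale, but the object stays valid as long as the updater does not
	 * free it while readers may still hold the pointer (the guarantee
	 * RCU's grace period provides in the kernel). */
	struct task *p = atomic_load_explicit(&real_parent, memory_order_acquire);
	return p->tgid;
}

int main(void)
{
	static struct task init_task = { .tgid = 1 };
	static struct task reaper    = { .tgid = 42 };

	publish(&init_task);
	printf("ppid = %d\n", read_parent_tgid());
	publish(&reaper);
	printf("ppid = %d\n", read_parent_tgid());
	return 0;
}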
| @@ -1042,7 +1366,7 @@ asmlinkage long sys_getegid(void) | |||
| 1042 | 1366 | ||
| 1043 | static void process_timeout(unsigned long __data) | 1367 | static void process_timeout(unsigned long __data) |
| 1044 | { | 1368 | { |
| 1045 | wake_up_process((task_t *)__data); | 1369 | wake_up_process((struct task_struct *)__data); |
| 1046 | } | 1370 | } |
| 1047 | 1371 | ||
| 1048 | /** | 1372 | /** |
| @@ -1144,8 +1468,9 @@ asmlinkage long sys_gettid(void) | |||
| 1144 | return current->pid; | 1468 | return current->pid; |
| 1145 | } | 1469 | } |
| 1146 | 1470 | ||
| 1147 | /* | 1471 | /** |
| 1148 | * sys_sysinfo - fill in sysinfo struct | 1472 | * sys_sysinfo - fill in sysinfo struct |
| 1473 | * @info: pointer to buffer to fill | ||
| 1149 | */ | 1474 | */ |
| 1150 | asmlinkage long sys_sysinfo(struct sysinfo __user *info) | 1475 | asmlinkage long sys_sysinfo(struct sysinfo __user *info) |
| 1151 | { | 1476 | { |
| @@ -1233,6 +1558,13 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) | |||
| 1233 | return 0; | 1558 | return 0; |
| 1234 | } | 1559 | } |
| 1235 | 1560 | ||
| 1561 | /* | ||
| 1562 | * lockdep: we want to track each per-CPU base as a separate lock-class, | ||
| 1563 | * but timer-bases are kmalloc()-ed, so we need to attach separate | ||
| 1564 | * keys to them: | ||
| 1565 | */ | ||
| 1566 | static struct lock_class_key base_lock_keys[NR_CPUS]; | ||
| 1567 | |||
| 1236 | static int __devinit init_timers_cpu(int cpu) | 1568 | static int __devinit init_timers_cpu(int cpu) |
| 1237 | { | 1569 | { |
| 1238 | int j; | 1570 | int j; |
| @@ -1268,6 +1600,8 @@ static int __devinit init_timers_cpu(int cpu) | |||
| 1268 | } | 1600 | } |
| 1269 | 1601 | ||
| 1270 | spin_lock_init(&base->lock); | 1602 | spin_lock_init(&base->lock); |
| 1603 | lockdep_set_class(&base->lock, base_lock_keys + cpu); | ||
| 1604 | |||
| 1271 | for (j = 0; j < TVN_SIZE; j++) { | 1605 | for (j = 0; j < TVN_SIZE; j++) { |
| 1272 | INIT_LIST_HEAD(base->tv5.vec + j); | 1606 | INIT_LIST_HEAD(base->tv5.vec + j); |
| 1273 | INIT_LIST_HEAD(base->tv4.vec + j); | 1607 | INIT_LIST_HEAD(base->tv4.vec + j); |
| @@ -1326,7 +1660,7 @@ static void __devinit migrate_timers(int cpu) | |||
| 1326 | } | 1660 | } |
| 1327 | #endif /* CONFIG_HOTPLUG_CPU */ | 1661 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 1328 | 1662 | ||
| 1329 | static int timer_cpu_notify(struct notifier_block *self, | 1663 | static int __cpuinit timer_cpu_notify(struct notifier_block *self, |
| 1330 | unsigned long action, void *hcpu) | 1664 | unsigned long action, void *hcpu) |
| 1331 | { | 1665 | { |
| 1332 | long cpu = (long)hcpu; | 1666 | long cpu = (long)hcpu; |
| @@ -1346,15 +1680,17 @@ static int timer_cpu_notify(struct notifier_block *self, | |||
| 1346 | return NOTIFY_OK; | 1680 | return NOTIFY_OK; |
| 1347 | } | 1681 | } |
| 1348 | 1682 | ||
| 1349 | static struct notifier_block timers_nb = { | 1683 | static struct notifier_block __cpuinitdata timers_nb = { |
| 1350 | .notifier_call = timer_cpu_notify, | 1684 | .notifier_call = timer_cpu_notify, |
| 1351 | }; | 1685 | }; |
| 1352 | 1686 | ||
| 1353 | 1687 | ||
| 1354 | void __init init_timers(void) | 1688 | void __init init_timers(void) |
| 1355 | { | 1689 | { |
| 1356 | timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1690 | int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, |
| 1357 | (void *)(long)smp_processor_id()); | 1691 | (void *)(long)smp_processor_id()); |
| 1692 | |||
| 1693 | BUG_ON(err == NOTIFY_BAD); | ||
| 1358 | register_cpu_notifier(&timers_nb); | 1694 | register_cpu_notifier(&timers_nb); |
| 1359 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); | 1695 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); |
| 1360 | } | 1696 | } |
diff --git a/kernel/unwind.c b/kernel/unwind.c new file mode 100644 index 000000000000..2e2368607aab --- /dev/null +++ b/kernel/unwind.c | |||
| @@ -0,0 +1,941 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2002-2006 Novell, Inc. | ||
| 3 | * Jan Beulich <jbeulich@novell.com> | ||
| 4 | * This code is released under version 2 of the GNU GPL. | ||
| 5 | * | ||
| 6 | * A simple API for unwinding kernel stacks. This is used for | ||
| 7 | * debugging and error reporting purposes. The kernel doesn't need | ||
| 8 | * full-blown stack unwinding with all the bells and whistles, so there | ||
| 9 | * is not much point in implementing the full Dwarf2 unwind API. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/unwind.h> | ||
| 13 | #include <linux/module.h> | ||
| 14 | #include <linux/delay.h> | ||
| 15 | #include <linux/stop_machine.h> | ||
| 16 | #include <asm/sections.h> | ||
| 17 | #include <asm/uaccess.h> | ||
| 18 | #include <asm/unaligned.h> | ||
| 19 | |||
| 20 | extern char __start_unwind[], __end_unwind[]; | ||
| 21 | |||
| 22 | #define MAX_STACK_DEPTH 8 | ||
| 23 | |||
| 24 | #define EXTRA_INFO(f) { \ | ||
| 25 | BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \ | ||
| 26 | % FIELD_SIZEOF(struct unwind_frame_info, f)) \ | ||
| 27 | + offsetof(struct unwind_frame_info, f) \ | ||
| 28 | / FIELD_SIZEOF(struct unwind_frame_info, f), \ | ||
| 29 | FIELD_SIZEOF(struct unwind_frame_info, f) \ | ||
| 30 | } | ||
| 31 | #define PTREGS_INFO(f) EXTRA_INFO(regs.f) | ||
| 32 | |||
| 33 | static const struct { | ||
| 34 | unsigned offs:BITS_PER_LONG / 2; | ||
| 35 | unsigned width:BITS_PER_LONG / 2; | ||
| 36 | } reg_info[] = { | ||
| 37 | UNW_REGISTER_INFO | ||
| 38 | }; | ||
| 39 | |||
| 40 | #undef PTREGS_INFO | ||
| 41 | #undef EXTRA_INFO | ||
| 42 | |||
| 43 | #ifndef REG_INVALID | ||
| 44 | #define REG_INVALID(r) (reg_info[r].width == 0) | ||
| 45 | #endif | ||
| 46 | |||
| 47 | #define DW_CFA_nop 0x00 | ||
| 48 | #define DW_CFA_set_loc 0x01 | ||
| 49 | #define DW_CFA_advance_loc1 0x02 | ||
| 50 | #define DW_CFA_advance_loc2 0x03 | ||
| 51 | #define DW_CFA_advance_loc4 0x04 | ||
| 52 | #define DW_CFA_offset_extended 0x05 | ||
| 53 | #define DW_CFA_restore_extended 0x06 | ||
| 54 | #define DW_CFA_undefined 0x07 | ||
| 55 | #define DW_CFA_same_value 0x08 | ||
| 56 | #define DW_CFA_register 0x09 | ||
| 57 | #define DW_CFA_remember_state 0x0a | ||
| 58 | #define DW_CFA_restore_state 0x0b | ||
| 59 | #define DW_CFA_def_cfa 0x0c | ||
| 60 | #define DW_CFA_def_cfa_register 0x0d | ||
| 61 | #define DW_CFA_def_cfa_offset 0x0e | ||
| 62 | #define DW_CFA_def_cfa_expression 0x0f | ||
| 63 | #define DW_CFA_expression 0x10 | ||
| 64 | #define DW_CFA_offset_extended_sf 0x11 | ||
| 65 | #define DW_CFA_def_cfa_sf 0x12 | ||
| 66 | #define DW_CFA_def_cfa_offset_sf 0x13 | ||
| 67 | #define DW_CFA_val_offset 0x14 | ||
| 68 | #define DW_CFA_val_offset_sf 0x15 | ||
| 69 | #define DW_CFA_val_expression 0x16 | ||
| 70 | #define DW_CFA_lo_user 0x1c | ||
| 71 | #define DW_CFA_GNU_window_save 0x2d | ||
| 72 | #define DW_CFA_GNU_args_size 0x2e | ||
| 73 | #define DW_CFA_GNU_negative_offset_extended 0x2f | ||
| 74 | #define DW_CFA_hi_user 0x3f | ||
| 75 | |||
| 76 | #define DW_EH_PE_FORM 0x07 | ||
| 77 | #define DW_EH_PE_native 0x00 | ||
| 78 | #define DW_EH_PE_leb128 0x01 | ||
| 79 | #define DW_EH_PE_data2 0x02 | ||
| 80 | #define DW_EH_PE_data4 0x03 | ||
| 81 | #define DW_EH_PE_data8 0x04 | ||
| 82 | #define DW_EH_PE_signed 0x08 | ||
| 83 | #define DW_EH_PE_ADJUST 0x70 | ||
| 84 | #define DW_EH_PE_abs 0x00 | ||
| 85 | #define DW_EH_PE_pcrel 0x10 | ||
| 86 | #define DW_EH_PE_textrel 0x20 | ||
| 87 | #define DW_EH_PE_datarel 0x30 | ||
| 88 | #define DW_EH_PE_funcrel 0x40 | ||
| 89 | #define DW_EH_PE_aligned 0x50 | ||
| 90 | #define DW_EH_PE_indirect 0x80 | ||
| 91 | #define DW_EH_PE_omit 0xff | ||
| 92 | |||
| 93 | typedef unsigned long uleb128_t; | ||
| 94 | typedef signed long sleb128_t; | ||
| 95 | |||
| 96 | static struct unwind_table { | ||
| 97 | struct { | ||
| 98 | unsigned long pc; | ||
| 99 | unsigned long range; | ||
| 100 | } core, init; | ||
| 101 | const void *address; | ||
| 102 | unsigned long size; | ||
| 103 | struct unwind_table *link; | ||
| 104 | const char *name; | ||
| 105 | } root_table; | ||
| 106 | |||
| 107 | struct unwind_item { | ||
| 108 | enum item_location { | ||
| 109 | Nowhere, | ||
| 110 | Memory, | ||
| 111 | Register, | ||
| 112 | Value | ||
| 113 | } where; | ||
| 114 | uleb128_t value; | ||
| 115 | }; | ||
| 116 | |||
| 117 | struct unwind_state { | ||
| 118 | uleb128_t loc, org; | ||
| 119 | const u8 *cieStart, *cieEnd; | ||
| 120 | uleb128_t codeAlign; | ||
| 121 | sleb128_t dataAlign; | ||
| 122 | struct cfa { | ||
| 123 | uleb128_t reg, offs; | ||
| 124 | } cfa; | ||
| 125 | struct unwind_item regs[ARRAY_SIZE(reg_info)]; | ||
| 126 | unsigned stackDepth:8; | ||
| 127 | unsigned version:8; | ||
| 128 | const u8 *label; | ||
| 129 | const u8 *stack[MAX_STACK_DEPTH]; | ||
| 130 | }; | ||
| 131 | |||
| 132 | static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; | ||
| 133 | |||
| 134 | static struct unwind_table *find_table(unsigned long pc) | ||
| 135 | { | ||
| 136 | struct unwind_table *table; | ||
| 137 | |||
| 138 | for (table = &root_table; table; table = table->link) | ||
| 139 | if ((pc >= table->core.pc | ||
| 140 | && pc < table->core.pc + table->core.range) | ||
| 141 | || (pc >= table->init.pc | ||
| 142 | && pc < table->init.pc + table->init.range)) | ||
| 143 | break; | ||
| 144 | |||
| 145 | return table; | ||
| 146 | } | ||
| 147 | |||
| 148 | static void init_unwind_table(struct unwind_table *table, | ||
| 149 | const char *name, | ||
| 150 | const void *core_start, | ||
| 151 | unsigned long core_size, | ||
| 152 | const void *init_start, | ||
| 153 | unsigned long init_size, | ||
| 154 | const void *table_start, | ||
| 155 | unsigned long table_size) | ||
| 156 | { | ||
| 157 | table->core.pc = (unsigned long)core_start; | ||
| 158 | table->core.range = core_size; | ||
| 159 | table->init.pc = (unsigned long)init_start; | ||
| 160 | table->init.range = init_size; | ||
| 161 | table->address = table_start; | ||
| 162 | table->size = table_size; | ||
| 163 | table->link = NULL; | ||
| 164 | table->name = name; | ||
| 165 | } | ||
| 166 | |||
| 167 | void __init unwind_init(void) | ||
| 168 | { | ||
| 169 | init_unwind_table(&root_table, "kernel", | ||
| 170 | _text, _end - _text, | ||
| 171 | NULL, 0, | ||
| 172 | __start_unwind, __end_unwind - __start_unwind); | ||
| 173 | } | ||
| 174 | |||
| 175 | #ifdef CONFIG_MODULES | ||
| 176 | |||
| 177 | static struct unwind_table *last_table; | ||
| 178 | |||
| 179 | /* Must be called with module_mutex held. */ | ||
| 180 | void *unwind_add_table(struct module *module, | ||
| 181 | const void *table_start, | ||
| 182 | unsigned long table_size) | ||
| 183 | { | ||
| 184 | struct unwind_table *table; | ||
| 185 | |||
| 186 | if (table_size <= 0) | ||
| 187 | return NULL; | ||
| 188 | |||
| 189 | table = kmalloc(sizeof(*table), GFP_KERNEL); | ||
| 190 | if (!table) | ||
| 191 | return NULL; | ||
| 192 | |||
| 193 | init_unwind_table(table, module->name, | ||
| 194 | module->module_core, module->core_size, | ||
| 195 | module->module_init, module->init_size, | ||
| 196 | table_start, table_size); | ||
| 197 | |||
| 198 | if (last_table) | ||
| 199 | last_table->link = table; | ||
| 200 | else | ||
| 201 | root_table.link = table; | ||
| 202 | last_table = table; | ||
| 203 | |||
| 204 | return table; | ||
| 205 | } | ||
| 206 | |||
| 207 | struct unlink_table_info | ||
| 208 | { | ||
| 209 | struct unwind_table *table; | ||
| 210 | int init_only; | ||
| 211 | }; | ||
| 212 | |||
| 213 | static int unlink_table(void *arg) | ||
| 214 | { | ||
| 215 | struct unlink_table_info *info = arg; | ||
| 216 | struct unwind_table *table = info->table, *prev; | ||
| 217 | |||
| 218 | for (prev = &root_table; prev->link && prev->link != table; prev = prev->link) | ||
| 219 | ; | ||
| 220 | |||
| 221 | if (prev->link) { | ||
| 222 | if (info->init_only) { | ||
| 223 | table->init.pc = 0; | ||
| 224 | table->init.range = 0; | ||
| 225 | info->table = NULL; | ||
| 226 | } else { | ||
| 227 | prev->link = table->link; | ||
| 228 | if (!prev->link) | ||
| 229 | last_table = prev; | ||
| 230 | } | ||
| 231 | } else | ||
| 232 | info->table = NULL; | ||
| 233 | |||
| 234 | return 0; | ||
| 235 | } | ||
| 236 | |||
| 237 | /* Must be called with module_mutex held. */ | ||
| 238 | void unwind_remove_table(void *handle, int init_only) | ||
| 239 | { | ||
| 240 | struct unwind_table *table = handle; | ||
| 241 | struct unlink_table_info info; | ||
| 242 | |||
| 243 | if (!table || table == &root_table) | ||
| 244 | return; | ||
| 245 | |||
| 246 | if (init_only && table == last_table) { | ||
| 247 | table->init.pc = 0; | ||
| 248 | table->init.range = 0; | ||
| 249 | return; | ||
| 250 | } | ||
| 251 | |||
| 252 | info.table = table; | ||
| 253 | info.init_only = init_only; | ||
| 254 | stop_machine_run(unlink_table, &info, NR_CPUS); | ||
| 255 | |||
| 256 | if (info.table) | ||
| 257 | kfree(table); | ||
| 258 | } | ||
| 259 | |||
| 260 | #endif /* CONFIG_MODULES */ | ||
| 261 | |||
| 262 | static uleb128_t get_uleb128(const u8 **pcur, const u8 *end) | ||
| 263 | { | ||
| 264 | const u8 *cur = *pcur; | ||
| 265 | uleb128_t value; | ||
| 266 | unsigned shift; | ||
| 267 | |||
| 268 | for (shift = 0, value = 0; cur < end; shift += 7) { | ||
| 269 | if (shift + 7 > 8 * sizeof(value) | ||
| 270 | && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { | ||
| 271 | cur = end + 1; | ||
| 272 | break; | ||
| 273 | } | ||
| 274 | value |= (uleb128_t)(*cur & 0x7f) << shift; | ||
| 275 | if (!(*cur++ & 0x80)) | ||
| 276 | break; | ||
| 277 | } | ||
| 278 | *pcur = cur; | ||
| 279 | |||
| 280 | return value; | ||
| 281 | } | ||
| 282 | |||
| 283 | static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) | ||
| 284 | { | ||
| 285 | const u8 *cur = *pcur; | ||
| 286 | sleb128_t value; | ||
| 287 | unsigned shift; | ||
| 288 | |||
| 289 | for (shift = 0, value = 0; cur < end; shift += 7) { | ||
| 290 | if (shift + 7 > 8 * sizeof(value) | ||
| 291 | && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { | ||
| 292 | cur = end + 1; | ||
| 293 | break; | ||
| 294 | } | ||
| 295 | value |= (sleb128_t)(*cur & 0x7f) << shift; | ||
| 296 | if (!(*cur & 0x80)) { | ||
| 297 | value |= -(*cur++ & 0x40) << shift; | ||
| 298 | break; | ||
| 299 | } | ||
| 300 | } | ||
| 301 | *pcur = cur; | ||
| 302 | |||
| 303 | return value; | ||
| 304 | } | ||
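get_uleb128()/get_sleb128() decode DWARF's variable-length integers: seven payload bits per byte, least-significant group first, the top bit as a continuation flag, and sign extension from the last group for the signed form. A stand-alone decoder in the same spirit, exercised on the classic DWARF spec example (0xE5 0x8E 0x26 == 624485, 0x9B 0xF1 0x59 == -624485):

#include <stdio.h>
#include <stdint.h>

static uint64_t get_uleb128(const uint8_t **p, const uint8_t *end)
{
	uint64_t value = 0;
	unsigned shift = 0;

	while (*p < end) {
		uint8_t byte = *(*p)++;
		value |= (uint64_t)(byte & 0x7f) << shift;
		if (!(byte & 0x80))
			break;
		shift += 7;
	}
	return value;
}

static int64_t get_sleb128(const uint8_t **p, const uint8_t *end)
{
	int64_t value = 0;
	unsigned shift = 0;
	uint8_t byte = 0;

	while (*p < end) {
		byte = *(*p)++;
		value |= (int64_t)(byte & 0x7f) << shift;
		shift += 7;
		if (!(byte & 0x80))
			break;
	}
	/* sign-extend if the last payload group had its sign bit set */
	if (shift < 64 && (byte & 0x40))
		value |= -((int64_t)1 << shift);
	return value;
}

int main(void)
{
	const uint8_t ubuf[] = { 0xe5, 0x8e, 0x26 };	/* 624485 */
	const uint8_t sbuf[] = { 0x9b, 0xf1, 0x59 };	/* -624485 */
	const uint8_t *p;

	p = ubuf;
	printf("uleb128: %llu\n",
	       (unsigned long long)get_uleb128(&p, ubuf + sizeof(ubuf)));
	p = sbuf;
	printf("sleb128: %lld\n",
	       (long long)get_sleb128(&p, sbuf + sizeof(sbuf)));
	return 0;
}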
| 305 | |||
| 306 | static unsigned long read_pointer(const u8 **pLoc, | ||
| 307 | const void *end, | ||
| 308 | signed ptrType) | ||
| 309 | { | ||
| 310 | unsigned long value = 0; | ||
| 311 | union { | ||
| 312 | const u8 *p8; | ||
| 313 | const u16 *p16u; | ||
| 314 | const s16 *p16s; | ||
| 315 | const u32 *p32u; | ||
| 316 | const s32 *p32s; | ||
| 317 | const unsigned long *pul; | ||
| 318 | } ptr; | ||
| 319 | |||
| 320 | if (ptrType < 0 || ptrType == DW_EH_PE_omit) | ||
| 321 | return 0; | ||
| 322 | ptr.p8 = *pLoc; | ||
| 323 | switch(ptrType & DW_EH_PE_FORM) { | ||
| 324 | case DW_EH_PE_data2: | ||
| 325 | if (end < (const void *)(ptr.p16u + 1)) | ||
| 326 | return 0; | ||
| 327 | if(ptrType & DW_EH_PE_signed) | ||
| 328 | value = get_unaligned(ptr.p16s++); | ||
| 329 | else | ||
| 330 | value = get_unaligned(ptr.p16u++); | ||
| 331 | break; | ||
| 332 | case DW_EH_PE_data4: | ||
| 333 | #ifdef CONFIG_64BIT | ||
| 334 | if (end < (const void *)(ptr.p32u + 1)) | ||
| 335 | return 0; | ||
| 336 | if(ptrType & DW_EH_PE_signed) | ||
| 337 | value = get_unaligned(ptr.p32s++); | ||
| 338 | else | ||
| 339 | value = get_unaligned(ptr.p32u++); | ||
| 340 | break; | ||
| 341 | case DW_EH_PE_data8: | ||
| 342 | BUILD_BUG_ON(sizeof(u64) != sizeof(value)); | ||
| 343 | #else | ||
| 344 | BUILD_BUG_ON(sizeof(u32) != sizeof(value)); | ||
| 345 | #endif | ||
| 346 | case DW_EH_PE_native: | ||
| 347 | if (end < (const void *)(ptr.pul + 1)) | ||
| 348 | return 0; | ||
| 349 | value = get_unaligned(ptr.pul++); | ||
| 350 | break; | ||
| 351 | case DW_EH_PE_leb128: | ||
| 352 | BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value)); | ||
| 353 | value = ptrType & DW_EH_PE_signed | ||
| 354 | ? get_sleb128(&ptr.p8, end) | ||
| 355 | : get_uleb128(&ptr.p8, end); | ||
| 356 | if ((const void *)ptr.p8 > end) | ||
| 357 | return 0; | ||
| 358 | break; | ||
| 359 | default: | ||
| 360 | return 0; | ||
| 361 | } | ||
| 362 | switch(ptrType & DW_EH_PE_ADJUST) { | ||
| 363 | case DW_EH_PE_abs: | ||
| 364 | break; | ||
| 365 | case DW_EH_PE_pcrel: | ||
| 366 | value += (unsigned long)*pLoc; | ||
| 367 | break; | ||
| 368 | default: | ||
| 369 | return 0; | ||
| 370 | } | ||
| 371 | if ((ptrType & DW_EH_PE_indirect) | ||
| 372 | && __get_user(value, (unsigned long *)value)) | ||
| 373 | return 0; | ||
| 374 | *pLoc = ptr.p8; | ||
| 375 | |||
| 376 | return value; | ||
| 377 | } | ||
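read_pointer() splits each DW_EH_PE encoding byte into a storage form (low bits, with 0x08 as the signedness flag) and an application rule (bits 4-6), so for example 0x1b means "signed 4-byte, pc-relative". A small sketch of that decomposition using the same constant values; only the adjustments the function above accepts are named:

#include <stdio.h>

#define DW_EH_PE_FORM    0x07
#define DW_EH_PE_signed  0x08
#define DW_EH_PE_ADJUST  0x70

static const char *form_name(unsigned enc)
{
	switch (enc & DW_EH_PE_FORM) {
	case 0x00: return "native word";
	case 0x01: return "leb128";
	case 0x02: return "2-byte";
	case 0x03: return "4-byte";
	case 0x04: return "8-byte";
	default:   return "unknown";
	}
}

static const char *adjust_name(unsigned enc)
{
	switch (enc & DW_EH_PE_ADJUST) {
	case 0x00: return "absolute";
	case 0x10: return "pc-relative";
	default:   return "other (rejected by read_pointer above)";
	}
}

int main(void)
{
	const unsigned encodings[] = { 0x1b, 0x03, 0x00 };
	unsigned i;

	for (i = 0; i < sizeof(encodings) / sizeof(encodings[0]); i++) {
		unsigned enc = encodings[i];

		printf("0x%02x: %s %s, %s\n", enc,
		       (enc & DW_EH_PE_signed) ? "signed" : "unsigned",
		       form_name(enc), adjust_name(enc));
	}
	return 0;
}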
| 378 | |||
| 379 | static signed fde_pointer_type(const u32 *cie) | ||
| 380 | { | ||
| 381 | const u8 *ptr = (const u8 *)(cie + 2); | ||
| 382 | unsigned version = *ptr; | ||
| 383 | |||
| 384 | if (version != 1) | ||
| 385 | return -1; /* unsupported */ | ||
| 386 | if (*++ptr) { | ||
| 387 | const char *aug; | ||
| 388 | const u8 *end = (const u8 *)(cie + 1) + *cie; | ||
| 389 | uleb128_t len; | ||
| 390 | |||
| 391 | /* check if augmentation size is first (and thus present) */ | ||
| 392 | if (*ptr != 'z') | ||
| 393 | return -1; | ||
| 394 | /* check if augmentation string is nul-terminated */ | ||
| 395 | if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL) | ||
| 396 | return -1; | ||
| 397 | ++ptr; /* skip terminator */ | ||
| 398 | get_uleb128(&ptr, end); /* skip code alignment */ | ||
| 399 | get_sleb128(&ptr, end); /* skip data alignment */ | ||
| 400 | /* skip return address column */ | ||
| 401 | version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end); | ||
| 402 | len = get_uleb128(&ptr, end); /* augmentation length */ | ||
| 403 | if (ptr + len < ptr || ptr + len > end) | ||
| 404 | return -1; | ||
| 405 | end = ptr + len; | ||
| 406 | while (*++aug) { | ||
| 407 | if (ptr >= end) | ||
| 408 | return -1; | ||
| 409 | switch(*aug) { | ||
| 410 | case 'L': | ||
| 411 | ++ptr; | ||
| 412 | break; | ||
| 413 | case 'P': { | ||
| 414 | signed ptrType = *ptr++; | ||
| 415 | |||
| 416 | if (!read_pointer(&ptr, end, ptrType) || ptr > end) | ||
| 417 | return -1; | ||
| 418 | } | ||
| 419 | break; | ||
| 420 | case 'R': | ||
| 421 | return *ptr; | ||
| 422 | default: | ||
| 423 | return -1; | ||
| 424 | } | ||
| 425 | } | ||
| 426 | } | ||
| 427 | return DW_EH_PE_native|DW_EH_PE_abs; | ||
| 428 | } | ||
| 429 | |||
| 430 | static int advance_loc(unsigned long delta, struct unwind_state *state) | ||
| 431 | { | ||
| 432 | state->loc += delta * state->codeAlign; | ||
| 433 | |||
| 434 | return delta > 0; | ||
| 435 | } | ||
| 436 | |||
| 437 | static void set_rule(uleb128_t reg, | ||
| 438 | enum item_location where, | ||
| 439 | uleb128_t value, | ||
| 440 | struct unwind_state *state) | ||
| 441 | { | ||
| 442 | if (reg < ARRAY_SIZE(state->regs)) { | ||
| 443 | state->regs[reg].where = where; | ||
| 444 | state->regs[reg].value = value; | ||
| 445 | } | ||
| 446 | } | ||
| 447 | |||
| 448 | static int processCFI(const u8 *start, | ||
| 449 | const u8 *end, | ||
| 450 | unsigned long targetLoc, | ||
| 451 | signed ptrType, | ||
| 452 | struct unwind_state *state) | ||
| 453 | { | ||
| 454 | union { | ||
| 455 | const u8 *p8; | ||
| 456 | const u16 *p16; | ||
| 457 | const u32 *p32; | ||
| 458 | } ptr; | ||
| 459 | int result = 1; | ||
| 460 | |||
| 461 | if (start != state->cieStart) { | ||
| 462 | state->loc = state->org; | ||
| 463 | result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state); | ||
| 464 | if (targetLoc == 0 && state->label == NULL) | ||
| 465 | return result; | ||
| 466 | } | ||
| 467 | for (ptr.p8 = start; result && ptr.p8 < end; ) { | ||
| 468 | switch(*ptr.p8 >> 6) { | ||
| 469 | uleb128_t value; | ||
| 470 | |||
| 471 | case 0: | ||
| 472 | switch(*ptr.p8++) { | ||
| 473 | case DW_CFA_nop: | ||
| 474 | break; | ||
| 475 | case DW_CFA_set_loc: | ||
| 476 | if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) | ||
| 477 | result = 0; | ||
| 478 | break; | ||
| 479 | case DW_CFA_advance_loc1: | ||
| 480 | result = ptr.p8 < end && advance_loc(*ptr.p8++, state); | ||
| 481 | break; | ||
| 482 | case DW_CFA_advance_loc2: | ||
| 483 | result = ptr.p8 <= end + 2 | ||
| 484 | && advance_loc(*ptr.p16++, state); | ||
| 485 | break; | ||
| 486 | case DW_CFA_advance_loc4: | ||
| 487 | result = ptr.p8 <= end + 4 | ||
| 488 | && advance_loc(*ptr.p32++, state); | ||
| 489 | break; | ||
| 490 | case DW_CFA_offset_extended: | ||
| 491 | value = get_uleb128(&ptr.p8, end); | ||
| 492 | set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); | ||
| 493 | break; | ||
| 494 | case DW_CFA_val_offset: | ||
| 495 | value = get_uleb128(&ptr.p8, end); | ||
| 496 | set_rule(value, Value, get_uleb128(&ptr.p8, end), state); | ||
| 497 | break; | ||
| 498 | case DW_CFA_offset_extended_sf: | ||
| 499 | value = get_uleb128(&ptr.p8, end); | ||
| 500 | set_rule(value, Memory, get_sleb128(&ptr.p8, end), state); | ||
| 501 | break; | ||
| 502 | case DW_CFA_val_offset_sf: | ||
| 503 | value = get_uleb128(&ptr.p8, end); | ||
| 504 | set_rule(value, Value, get_sleb128(&ptr.p8, end), state); | ||
| 505 | break; | ||
| 506 | case DW_CFA_restore_extended: | ||
| 507 | case DW_CFA_undefined: | ||
| 508 | case DW_CFA_same_value: | ||
| 509 | set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state); | ||
| 510 | break; | ||
| 511 | case DW_CFA_register: | ||
| 512 | value = get_uleb128(&ptr.p8, end); | ||
| 513 | set_rule(value, | ||
| 514 | Register, | ||
| 515 | get_uleb128(&ptr.p8, end), state); | ||
| 516 | break; | ||
| 517 | case DW_CFA_remember_state: | ||
| 518 | if (ptr.p8 == state->label) { | ||
| 519 | state->label = NULL; | ||
| 520 | return 1; | ||
| 521 | } | ||
| 522 | if (state->stackDepth >= MAX_STACK_DEPTH) | ||
| 523 | return 0; | ||
| 524 | state->stack[state->stackDepth++] = ptr.p8; | ||
| 525 | break; | ||
| 526 | case DW_CFA_restore_state: | ||
| 527 | if (state->stackDepth) { | ||
| 528 | const uleb128_t loc = state->loc; | ||
| 529 | const u8 *label = state->label; | ||
| 530 | |||
| 531 | state->label = state->stack[state->stackDepth - 1]; | ||
| 532 | memcpy(&state->cfa, &badCFA, sizeof(state->cfa)); | ||
| 533 | memset(state->regs, 0, sizeof(state->regs)); | ||
| 534 | state->stackDepth = 0; | ||
| 535 | result = processCFI(start, end, 0, ptrType, state); | ||
| 536 | state->loc = loc; | ||
| 537 | state->label = label; | ||
| 538 | } else | ||
| 539 | return 0; | ||
| 540 | break; | ||
| 541 | case DW_CFA_def_cfa: | ||
| 542 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
| 543 | /*nobreak*/ | ||
| 544 | case DW_CFA_def_cfa_offset: | ||
| 545 | state->cfa.offs = get_uleb128(&ptr.p8, end); | ||
| 546 | break; | ||
| 547 | case DW_CFA_def_cfa_sf: | ||
| 548 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
| 549 | /*nobreak*/ | ||
| 550 | case DW_CFA_def_cfa_offset_sf: | ||
| 551 | state->cfa.offs = get_sleb128(&ptr.p8, end) | ||
| 552 | * state->dataAlign; | ||
| 553 | break; | ||
| 554 | case DW_CFA_def_cfa_register: | ||
| 555 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
| 556 | break; | ||
| 557 | /*todo case DW_CFA_def_cfa_expression: */ | ||
| 558 | /*todo case DW_CFA_expression: */ | ||
| 559 | /*todo case DW_CFA_val_expression: */ | ||
| 560 | case DW_CFA_GNU_args_size: | ||
| 561 | get_uleb128(&ptr.p8, end); | ||
| 562 | break; | ||
| 563 | case DW_CFA_GNU_negative_offset_extended: | ||
| 564 | value = get_uleb128(&ptr.p8, end); | ||
| 565 | set_rule(value, | ||
| 566 | Memory, | ||
| 567 | (uleb128_t)0 - get_uleb128(&ptr.p8, end), state); | ||
| 568 | break; | ||
| 569 | case DW_CFA_GNU_window_save: | ||
| 570 | default: | ||
| 571 | result = 0; | ||
| 572 | break; | ||
| 573 | } | ||
| 574 | break; | ||
| 575 | case 1: | ||
| 576 | result = advance_loc(*ptr.p8++ & 0x3f, state); | ||
| 577 | break; | ||
| 578 | case 2: | ||
| 579 | value = *ptr.p8++ & 0x3f; | ||
| 580 | set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); | ||
| 581 | break; | ||
| 582 | case 3: | ||
| 583 | set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); | ||
| 584 | break; | ||
| 585 | } | ||
| 586 | if (ptr.p8 > end) | ||
| 587 | result = 0; | ||
| 588 | if (result && targetLoc != 0 && targetLoc < state->loc) | ||
| 589 | return 1; | ||
| 590 | } | ||
| 591 | |||
| 592 | return result | ||
| 593 | && ptr.p8 == end | ||
| 594 | && (targetLoc == 0 | ||
| 595 | || (/*todo While in theory this should apply, gcc in practice omits | ||
| 596 | everything past the function prolog, and hence the location | ||
| 597 | never reaches the end of the function. | ||
| 598 | targetLoc < state->loc &&*/ state->label == NULL)); | ||
| 599 | } | ||
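The net result of running a CFI program is a CFA (some register plus an offset) and, for each register of interest, a rule such as "saved in memory at CFA + n*dataAlign". A toy illustration of applying such a rule against a fake stack image follows; the layout and values are invented and not tied to any real ABI:

#include <stdio.h>
#include <stdint.h>

enum where { Nowhere, Memory, Register, Value };

struct rule {
	enum where where;
	long value;		/* factored offset (Memory/Value) or source reg */
};

int main(void)
{
	/* fake stack image; in the real unwinder the CFA itself is computed
	 * from a frame register plus cfa.offs */
	unsigned long stack[8] = { 0, 0, 0, 0, 0x1111, 0x2222, 0xdeadbeef, 0 };
	long data_align = -(long)sizeof(unsigned long);	/* e.g. -8 on 64-bit */
	unsigned long cfa = (unsigned long)&stack[7];

	/* "return address saved at CFA + 1*dataAlign", one word below the CFA */
	struct rule ra = { Memory, 1 };

	unsigned long addr = cfa + ra.value * data_align;

	printf("recovered return address: 0x%lx\n", *(unsigned long *)addr);
	return 0;
}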
| 600 | |||
| 601 | /* Unwind to the previous frame. Returns 0 if successful, negative | ||
| 602 | * number in case of an error. */ | ||
| 603 | int unwind(struct unwind_frame_info *frame) | ||
| 604 | { | ||
| 605 | #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) | ||
| 606 | const u32 *fde = NULL, *cie = NULL; | ||
| 607 | const u8 *ptr = NULL, *end = NULL; | ||
| 608 | unsigned long pc = UNW_PC(frame) - frame->call_frame; | ||
| 609 | unsigned long startLoc = 0, endLoc = 0, cfa; | ||
| 610 | unsigned i; | ||
| 611 | signed ptrType = -1; | ||
| 612 | uleb128_t retAddrReg = 0; | ||
| 613 | struct unwind_table *table; | ||
| 614 | struct unwind_state state; | ||
| 615 | |||
| 616 | if (UNW_PC(frame) == 0) | ||
| 617 | return -EINVAL; | ||
| 618 | if ((table = find_table(pc)) != NULL | ||
| 619 | && !(table->size & (sizeof(*fde) - 1))) { | ||
| 620 | unsigned long tableSize = table->size; | ||
| 621 | |||
| 622 | for (fde = table->address; | ||
| 623 | tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; | ||
| 624 | tableSize -= sizeof(*fde) + *fde, | ||
| 625 | fde += 1 + *fde / sizeof(*fde)) { | ||
| 626 | if (!*fde || (*fde & (sizeof(*fde) - 1))) | ||
| 627 | break; | ||
| 628 | if (!fde[1]) | ||
| 629 | continue; /* this is a CIE */ | ||
| 630 | if ((fde[1] & (sizeof(*fde) - 1)) | ||
| 631 | || fde[1] > (unsigned long)(fde + 1) | ||
| 632 | - (unsigned long)table->address) | ||
| 633 | continue; /* this is not a valid FDE */ | ||
| 634 | cie = fde + 1 - fde[1] / sizeof(*fde); | ||
| 635 | if (*cie <= sizeof(*cie) + 4 | ||
| 636 | || *cie >= fde[1] - sizeof(*fde) | ||
| 637 | || (*cie & (sizeof(*cie) - 1)) | ||
| 638 | || cie[1] | ||
| 639 | || (ptrType = fde_pointer_type(cie)) < 0) { | ||
| 640 | cie = NULL; /* this is not a (valid) CIE */ | ||
| 641 | continue; | ||
| 642 | } | ||
| 643 | ptr = (const u8 *)(fde + 2); | ||
| 644 | startLoc = read_pointer(&ptr, | ||
| 645 | (const u8 *)(fde + 1) + *fde, | ||
| 646 | ptrType); | ||
| 647 | endLoc = startLoc | ||
| 648 | + read_pointer(&ptr, | ||
| 649 | (const u8 *)(fde + 1) + *fde, | ||
| 650 | ptrType & DW_EH_PE_indirect | ||
| 651 | ? ptrType | ||
| 652 | : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); | ||
| 653 | if (pc >= startLoc && pc < endLoc) | ||
| 654 | break; | ||
| 655 | cie = NULL; | ||
| 656 | } | ||
| 657 | } | ||
| 658 | if (cie != NULL) { | ||
| 659 | memset(&state, 0, sizeof(state)); | ||
| 660 | state.cieEnd = ptr; /* keep here temporarily */ | ||
| 661 | ptr = (const u8 *)(cie + 2); | ||
| 662 | end = (const u8 *)(cie + 1) + *cie; | ||
| 663 | frame->call_frame = 1; | ||
| 664 | if ((state.version = *ptr) != 1) | ||
| 665 | cie = NULL; /* unsupported version */ | ||
| 666 | else if (*++ptr) { | ||
| 667 | /* check if augmentation size is first (and thus present) */ | ||
| 668 | if (*ptr == 'z') { | ||
| 669 | while (++ptr < end && *ptr) { | ||
| 670 | switch(*ptr) { | ||
| 671 | /* check for ignorable (or already handled) | ||
| 672 | * nul-terminated augmentation string */ | ||
| 673 | case 'L': | ||
| 674 | case 'P': | ||
| 675 | case 'R': | ||
| 676 | continue; | ||
| 677 | case 'S': | ||
| 678 | frame->call_frame = 0; | ||
| 679 | continue; | ||
| 680 | default: | ||
| 681 | break; | ||
| 682 | } | ||
| 683 | break; | ||
| 684 | } | ||
| 685 | } | ||
| 686 | if (ptr >= end || *ptr) | ||
| 687 | cie = NULL; | ||
| 688 | } | ||
| 689 | ++ptr; | ||
| 690 | } | ||
| 691 | if (cie != NULL) { | ||
| 692 | /* get code alignment factor */ | ||
| 693 | state.codeAlign = get_uleb128(&ptr, end); | ||
| 694 | /* get data alignment factor */ | ||
| 695 | state.dataAlign = get_sleb128(&ptr, end); | ||
| 696 | if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) | ||
| 697 | cie = NULL; | ||
| 698 | else { | ||
| 699 | retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); | ||
| 700 | /* skip augmentation */ | ||
| 701 | if (((const char *)(cie + 2))[1] == 'z') | ||
| 702 | ptr += get_uleb128(&ptr, end); | ||
| 703 | if (ptr > end | ||
| 704 | || retAddrReg >= ARRAY_SIZE(reg_info) | ||
| 705 | || REG_INVALID(retAddrReg) | ||
| 706 | || reg_info[retAddrReg].width != sizeof(unsigned long)) | ||
| 707 | cie = NULL; | ||
| 708 | } | ||
| 709 | } | ||
| 710 | if (cie != NULL) { | ||
| 711 | state.cieStart = ptr; | ||
| 712 | ptr = state.cieEnd; | ||
| 713 | state.cieEnd = end; | ||
| 714 | end = (const u8 *)(fde + 1) + *fde; | ||
| 715 | /* skip augmentation */ | ||
| 716 | if (((const char *)(cie + 2))[1] == 'z') { | ||
| 717 | uleb128_t augSize = get_uleb128(&ptr, end); | ||
| 718 | |||
| 719 | if ((ptr += augSize) > end) | ||
| 720 | fde = NULL; | ||
| 721 | } | ||
| 722 | } | ||
| 723 | if (cie == NULL || fde == NULL) { | ||
| 724 | #ifdef CONFIG_FRAME_POINTER | ||
| 725 | unsigned long top, bottom; | ||
| 726 | #endif | ||
| 727 | |||
| 728 | #ifdef CONFIG_FRAME_POINTER | ||
| 729 | top = STACK_TOP(frame->task); | ||
| 730 | bottom = STACK_BOTTOM(frame->task); | ||
| 731 | # if FRAME_RETADDR_OFFSET < 0 | ||
| 732 | if (UNW_SP(frame) < top | ||
| 733 | && UNW_FP(frame) <= UNW_SP(frame) | ||
| 734 | && bottom < UNW_FP(frame) | ||
| 735 | # else | ||
| 736 | if (UNW_SP(frame) > top | ||
| 737 | && UNW_FP(frame) >= UNW_SP(frame) | ||
| 738 | && bottom > UNW_FP(frame) | ||
| 739 | # endif | ||
| 740 | && !((UNW_SP(frame) | UNW_FP(frame)) | ||
| 741 | & (sizeof(unsigned long) - 1))) { | ||
| 742 | unsigned long link; | ||
| 743 | |||
| 744 | if (!__get_user(link, | ||
| 745 | (unsigned long *)(UNW_FP(frame) | ||
| 746 | + FRAME_LINK_OFFSET)) | ||
| 747 | # if FRAME_RETADDR_OFFSET < 0 | ||
| 748 | && link > bottom && link < UNW_FP(frame) | ||
| 749 | # else | ||
| 750 | && link > UNW_FP(frame) && link < bottom | ||
| 751 | # endif | ||
| 752 | && !(link & (sizeof(link) - 1)) | ||
| 753 | && !__get_user(UNW_PC(frame), | ||
| 754 | (unsigned long *)(UNW_FP(frame) | ||
| 755 | + FRAME_RETADDR_OFFSET))) { | ||
| 756 | UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET | ||
| 757 | # if FRAME_RETADDR_OFFSET < 0 | ||
| 758 | - | ||
| 759 | # else | ||
| 760 | + | ||
| 761 | # endif | ||
| 762 | sizeof(UNW_PC(frame)); | ||
| 763 | UNW_FP(frame) = link; | ||
| 764 | return 0; | ||
| 765 | } | ||
| 766 | } | ||
| 767 | #endif | ||
| 768 | return -ENXIO; | ||
| 769 | } | ||
| 770 | state.org = startLoc; | ||
| 771 | memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); | ||
| 772 | /* process instructions */ | ||
| 773 | if (!processCFI(ptr, end, pc, ptrType, &state) | ||
| 774 | || state.loc > endLoc | ||
| 775 | || state.regs[retAddrReg].where == Nowhere | ||
| 776 | || state.cfa.reg >= ARRAY_SIZE(reg_info) | ||
| 777 | || reg_info[state.cfa.reg].width != sizeof(unsigned long) | ||
| 778 | || state.cfa.offs % sizeof(unsigned long)) | ||
| 779 | return -EIO; | ||
| 780 | /* update frame */ | ||
| 781 | #ifndef CONFIG_AS_CFI_SIGNAL_FRAME | ||
| 782 | if(frame->call_frame | ||
| 783 | && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign)) | ||
| 784 | frame->call_frame = 0; | ||
| 785 | #endif | ||
| 786 | cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; | ||
| 787 | startLoc = min((unsigned long)UNW_SP(frame), cfa); | ||
| 788 | endLoc = max((unsigned long)UNW_SP(frame), cfa); | ||
| 789 | if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) { | ||
| 790 | startLoc = min(STACK_LIMIT(cfa), cfa); | ||
| 791 | endLoc = max(STACK_LIMIT(cfa), cfa); | ||
| 792 | } | ||
| 793 | #ifndef CONFIG_64BIT | ||
| 794 | # define CASES CASE(8); CASE(16); CASE(32) | ||
| 795 | #else | ||
| 796 | # define CASES CASE(8); CASE(16); CASE(32); CASE(64) | ||
| 797 | #endif | ||
| 798 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | ||
| 799 | if (REG_INVALID(i)) { | ||
| 800 | if (state.regs[i].where == Nowhere) | ||
| 801 | continue; | ||
| 802 | return -EIO; | ||
| 803 | } | ||
| 804 | switch(state.regs[i].where) { | ||
| 805 | default: | ||
| 806 | break; | ||
| 807 | case Register: | ||
| 808 | if (state.regs[i].value >= ARRAY_SIZE(reg_info) | ||
| 809 | || REG_INVALID(state.regs[i].value) | ||
| 810 | || reg_info[i].width > reg_info[state.regs[i].value].width) | ||
| 811 | return -EIO; | ||
| 812 | switch(reg_info[state.regs[i].value].width) { | ||
| 813 | #define CASE(n) \ | ||
| 814 | case sizeof(u##n): \ | ||
| 815 | state.regs[i].value = FRAME_REG(state.regs[i].value, \ | ||
| 816 | const u##n); \ | ||
| 817 | break | ||
| 818 | CASES; | ||
| 819 | #undef CASE | ||
| 820 | default: | ||
| 821 | return -EIO; | ||
| 822 | } | ||
| 823 | break; | ||
| 824 | } | ||
| 825 | } | ||
| 826 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | ||
| 827 | if (REG_INVALID(i)) | ||
| 828 | continue; | ||
| 829 | switch(state.regs[i].where) { | ||
| 830 | case Nowhere: | ||
| 831 | if (reg_info[i].width != sizeof(UNW_SP(frame)) | ||
| 832 | || &FRAME_REG(i, __typeof__(UNW_SP(frame))) | ||
| 833 | != &UNW_SP(frame)) | ||
| 834 | continue; | ||
| 835 | UNW_SP(frame) = cfa; | ||
| 836 | break; | ||
| 837 | case Register: | ||
| 838 | switch(reg_info[i].width) { | ||
| 839 | #define CASE(n) case sizeof(u##n): \ | ||
| 840 | FRAME_REG(i, u##n) = state.regs[i].value; \ | ||
| 841 | break | ||
| 842 | CASES; | ||
| 843 | #undef CASE | ||
| 844 | default: | ||
| 845 | return -EIO; | ||
| 846 | } | ||
| 847 | break; | ||
| 848 | case Value: | ||
| 849 | if (reg_info[i].width != sizeof(unsigned long)) | ||
| 850 | return -EIO; | ||
| 851 | FRAME_REG(i, unsigned long) = cfa + state.regs[i].value | ||
| 852 | * state.dataAlign; | ||
| 853 | break; | ||
| 854 | case Memory: { | ||
| 855 | unsigned long addr = cfa + state.regs[i].value | ||
| 856 | * state.dataAlign; | ||
| 857 | |||
| 858 | if ((state.regs[i].value * state.dataAlign) | ||
| 859 | % sizeof(unsigned long) | ||
| 860 | || addr < startLoc | ||
| 861 | || addr + sizeof(unsigned long) < addr | ||
| 862 | || addr + sizeof(unsigned long) > endLoc) | ||
| 863 | return -EIO; | ||
| 864 | switch(reg_info[i].width) { | ||
| 865 | #define CASE(n) case sizeof(u##n): \ | ||
| 866 | __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ | ||
| 867 | break | ||
| 868 | CASES; | ||
| 869 | #undef CASE | ||
| 870 | default: | ||
| 871 | return -EIO; | ||
| 872 | } | ||
| 873 | } | ||
| 874 | break; | ||
| 875 | } | ||
| 876 | } | ||
| 877 | |||
| 878 | return 0; | ||
| 879 | #undef CASES | ||
| 880 | #undef FRAME_REG | ||
| 881 | } | ||
| 882 | EXPORT_SYMBOL(unwind); | ||
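When no CFI covers the PC, the CONFIG_FRAME_POINTER fallback above walks the classic frame-pointer chain: the saved frame pointer sits at *FP with the return address next to it. A minimal user-space sketch of that idea is below; it assumes a GCC build that keeps frame pointers on x86-64 (e.g. -O0 or -fno-omit-frame-pointer) and the usual [saved FP][return address] layout, which is compiler- and ABI-dependent:

#include <stdio.h>

/* Assumed layout per frame: fp[0] = caller's FP, fp[1] = return address. */
static void __attribute__((noinline)) show_backtrace(void)
{
	void **fp = __builtin_frame_address(0);
	int depth;

	for (depth = 0; fp && depth < 8; depth++) {
		void **next = fp[0];
		void *ret = fp[1];

		if (!ret)
			break;
		printf("#%d  return address %p\n", depth, ret);
		/* stack grows down, so the caller's frame must be higher */
		if (next <= fp)
			break;
		fp = next;
	}
}

static void __attribute__((noinline)) level2(void) { show_backtrace(); }
static void __attribute__((noinline)) level1(void) { level2(); }

int main(void)
{
	level1();
	return 0;
}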
| 883 | |||
| 884 | int unwind_init_frame_info(struct unwind_frame_info *info, | ||
| 885 | struct task_struct *tsk, | ||
| 886 | /*const*/ struct pt_regs *regs) | ||
| 887 | { | ||
| 888 | info->task = tsk; | ||
| 889 | info->call_frame = 0; | ||
| 890 | arch_unw_init_frame_info(info, regs); | ||
| 891 | |||
| 892 | return 0; | ||
| 893 | } | ||
| 894 | EXPORT_SYMBOL(unwind_init_frame_info); | ||
| 895 | |||
| 896 | /* | ||
| 897 | * Prepare to unwind a blocked task. | ||
| 898 | */ | ||
| 899 | int unwind_init_blocked(struct unwind_frame_info *info, | ||
| 900 | struct task_struct *tsk) | ||
| 901 | { | ||
| 902 | info->task = tsk; | ||
| 903 | info->call_frame = 0; | ||
| 904 | arch_unw_init_blocked(info); | ||
| 905 | |||
| 906 | return 0; | ||
| 907 | } | ||
| 908 | EXPORT_SYMBOL(unwind_init_blocked); | ||
| 909 | |||
| 910 | /* | ||
| 911 | * Prepare to unwind the currently running thread. | ||
| 912 | */ | ||
| 913 | int unwind_init_running(struct unwind_frame_info *info, | ||
| 914 | asmlinkage int (*callback)(struct unwind_frame_info *, | ||
| 915 | void *arg), | ||
| 916 | void *arg) | ||
| 917 | { | ||
| 918 | info->task = current; | ||
| 919 | info->call_frame = 0; | ||
| 920 | |||
| 921 | return arch_unwind_init_running(info, callback, arg); | ||
| 922 | } | ||
| 923 | EXPORT_SYMBOL(unwind_init_running); | ||
| 924 | |||
| 925 | /* | ||
| 926 | * Unwind until the return pointer is in user-land (or until an error | ||
| 927 | * occurs). Returns 0 if successful, negative number in case of | ||
| 928 | * error. | ||
| 929 | */ | ||
| 930 | int unwind_to_user(struct unwind_frame_info *info) | ||
| 931 | { | ||
| 932 | while (!arch_unw_user_mode(info)) { | ||
| 933 | int err = unwind(info); | ||
| 934 | |||
| 935 | if (err < 0) | ||
| 936 | return err; | ||
| 937 | } | ||
| 938 | |||
| 939 | return 0; | ||
| 940 | } | ||
| 941 | EXPORT_SYMBOL(unwind_to_user); | ||
diff --git a/kernel/wait.c b/kernel/wait.c index 791681cfea98..59a82f63275d 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
| @@ -3,7 +3,6 @@ | |||
| 3 | * | 3 | * |
| 4 | * (C) 2004 William Irwin, Oracle | 4 | * (C) 2004 William Irwin, Oracle |
| 5 | */ | 5 | */ |
| 6 | #include <linux/config.h> | ||
| 7 | #include <linux/init.h> | 6 | #include <linux/init.h> |
| 8 | #include <linux/module.h> | 7 | #include <linux/module.h> |
| 9 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
| @@ -11,6 +10,14 @@ | |||
| 11 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
| 12 | #include <linux/hash.h> | 11 | #include <linux/hash.h> |
| 13 | 12 | ||
| 13 | void init_waitqueue_head(wait_queue_head_t *q) | ||
| 14 | { | ||
| 15 | spin_lock_init(&q->lock); | ||
| 16 | INIT_LIST_HEAD(&q->task_list); | ||
| 17 | } | ||
| 18 | |||
| 19 | EXPORT_SYMBOL(init_waitqueue_head); | ||
| 20 | |||
| 14 | void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) | 21 | void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) |
| 15 | { | 22 | { |
| 16 | unsigned long flags; | 23 | unsigned long flags; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 740c5abceb07..835fe28b87a8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -51,7 +51,7 @@ struct cpu_workqueue_struct { | |||
| 51 | wait_queue_head_t work_done; | 51 | wait_queue_head_t work_done; |
| 52 | 52 | ||
| 53 | struct workqueue_struct *wq; | 53 | struct workqueue_struct *wq; |
| 54 | task_t *thread; | 54 | struct task_struct *thread; |
| 55 | 55 | ||
| 56 | int run_depth; /* Detect run_workqueue() recursion depth */ | 56 | int run_depth; /* Detect run_workqueue() recursion depth */ |
| 57 | } ____cacheline_aligned; | 57 | } ____cacheline_aligned; |
| @@ -68,7 +68,7 @@ struct workqueue_struct { | |||
| 68 | 68 | ||
| 69 | /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove | 69 | /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove |
| 70 | threads to each one as cpus come/go. */ | 70 | threads to each one as cpus come/go. */ |
| 71 | static DEFINE_SPINLOCK(workqueue_lock); | 71 | static DEFINE_MUTEX(workqueue_mutex); |
| 72 | static LIST_HEAD(workqueues); | 72 | static LIST_HEAD(workqueues); |
| 73 | 73 | ||
| 74 | static int singlethread_cpu; | 74 | static int singlethread_cpu; |
| @@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, | |||
| 93 | spin_unlock_irqrestore(&cwq->lock, flags); | 93 | spin_unlock_irqrestore(&cwq->lock, flags); |
| 94 | } | 94 | } |
| 95 | 95 | ||
| 96 | /* | 96 | /** |
| 97 | * Queue work on a workqueue. Return non-zero if it was successfully | 97 | * queue_work - queue work on a workqueue |
| 98 | * added. | 98 | * @wq: workqueue to use |
| 99 | * @work: work to queue | ||
| 100 | * | ||
| 101 | * Returns non-zero if it was successfully added. | ||
| 99 | * | 102 | * |
| 100 | * We queue the work to the CPU it was submitted, but there is no | 103 | * We queue the work to the CPU it was submitted, but there is no |
| 101 | * guarantee that it will be processed by that CPU. | 104 | * guarantee that it will be processed by that CPU. |
| @@ -114,6 +117,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
| 114 | put_cpu(); | 117 | put_cpu(); |
| 115 | return ret; | 118 | return ret; |
| 116 | } | 119 | } |
| 120 | EXPORT_SYMBOL_GPL(queue_work); | ||
| 117 | 121 | ||
| 118 | static void delayed_work_timer_fn(unsigned long __data) | 122 | static void delayed_work_timer_fn(unsigned long __data) |
| 119 | { | 123 | { |
| @@ -127,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data) | |||
| 127 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | 131 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
| 128 | } | 132 | } |
| 129 | 133 | ||
| 134 | /** | ||
| 135 | * queue_delayed_work - queue work on a workqueue after delay | ||
| 136 | * @wq: workqueue to use | ||
| 137 | * @work: work to queue | ||
| 138 | * @delay: number of jiffies to wait before queueing | ||
| 139 | * | ||
| 140 | * Returns non-zero if it was successfully added. | ||
| 141 | */ | ||
| 130 | int fastcall queue_delayed_work(struct workqueue_struct *wq, | 142 | int fastcall queue_delayed_work(struct workqueue_struct *wq, |
| 131 | struct work_struct *work, unsigned long delay) | 143 | struct work_struct *work, unsigned long delay) |
| 132 | { | 144 | { |
| @@ -147,6 +159,38 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, | |||
| 147 | } | 159 | } |
| 148 | return ret; | 160 | return ret; |
| 149 | } | 161 | } |
| 162 | EXPORT_SYMBOL_GPL(queue_delayed_work); | ||
| 163 | |||
| 164 | /** | ||
| 165 | * queue_delayed_work_on - queue work on specific CPU after delay | ||
| 166 | * @cpu: CPU number to execute work on | ||
| 167 | * @wq: workqueue to use | ||
| 168 | * @work: work to queue | ||
| 169 | * @delay: number of jiffies to wait before queueing | ||
| 170 | * | ||
| 171 | * Returns non-zero if it was successfully added. | ||
| 172 | */ | ||
| 173 | int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | ||
| 174 | struct work_struct *work, unsigned long delay) | ||
| 175 | { | ||
| 176 | int ret = 0; | ||
| 177 | struct timer_list *timer = &work->timer; | ||
| 178 | |||
| 179 | if (!test_and_set_bit(0, &work->pending)) { | ||
| 180 | BUG_ON(timer_pending(timer)); | ||
| 181 | BUG_ON(!list_empty(&work->entry)); | ||
| 182 | |||
| 183 | /* This stores wq for the moment, for the timer_fn */ | ||
| 184 | work->wq_data = wq; | ||
| 185 | timer->expires = jiffies + delay; | ||
| 186 | timer->data = (unsigned long)work; | ||
| 187 | timer->function = delayed_work_timer_fn; | ||
| 188 | add_timer_on(timer, cpu); | ||
| 189 | ret = 1; | ||
| 190 | } | ||
| 191 | return ret; | ||
| 192 | } | ||
| 193 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); | ||
| 150 | 194 | ||
| 151 | static void run_workqueue(struct cpu_workqueue_struct *cwq) | 195 | static void run_workqueue(struct cpu_workqueue_struct *cwq) |
| 152 | { | 196 | { |
| @@ -251,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) | |||
| 251 | } | 295 | } |
| 252 | } | 296 | } |
| 253 | 297 | ||
| 254 | /* | 298 | /** |
| 255 | * flush_workqueue - ensure that any scheduled work has run to completion. | 299 | * flush_workqueue - ensure that any scheduled work has run to completion. |
| 300 | * @wq: workqueue to flush | ||
| 256 | * | 301 | * |
| 257 | * Forces execution of the workqueue and blocks until its completion. | 302 | * Forces execution of the workqueue and blocks until its completion. |
| 258 | * This is typically used in driver shutdown handlers. | 303 | * This is typically used in driver shutdown handlers. |
| @@ -275,12 +320,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
| 275 | } else { | 320 | } else { |
| 276 | int cpu; | 321 | int cpu; |
| 277 | 322 | ||
| 278 | lock_cpu_hotplug(); | 323 | mutex_lock(&workqueue_mutex); |
| 279 | for_each_online_cpu(cpu) | 324 | for_each_online_cpu(cpu) |
| 280 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); | 325 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); |
| 281 | unlock_cpu_hotplug(); | 326 | mutex_unlock(&workqueue_mutex); |
| 282 | } | 327 | } |
| 283 | } | 328 | } |
| 329 | EXPORT_SYMBOL_GPL(flush_workqueue); | ||
| 284 | 330 | ||
| 285 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | 331 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, |
| 286 | int cpu) | 332 | int cpu) |
| @@ -325,8 +371,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
| 325 | } | 371 | } |
| 326 | 372 | ||
| 327 | wq->name = name; | 373 | wq->name = name; |
| 328 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 374 | mutex_lock(&workqueue_mutex); |
| 329 | lock_cpu_hotplug(); | ||
| 330 | if (singlethread) { | 375 | if (singlethread) { |
| 331 | INIT_LIST_HEAD(&wq->list); | 376 | INIT_LIST_HEAD(&wq->list); |
| 332 | p = create_workqueue_thread(wq, singlethread_cpu); | 377 | p = create_workqueue_thread(wq, singlethread_cpu); |
| @@ -335,9 +380,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
| 335 | else | 380 | else |
| 336 | wake_up_process(p); | 381 | wake_up_process(p); |
| 337 | } else { | 382 | } else { |
| 338 | spin_lock(&workqueue_lock); | ||
| 339 | list_add(&wq->list, &workqueues); | 383 | list_add(&wq->list, &workqueues); |
| 340 | spin_unlock(&workqueue_lock); | ||
| 341 | for_each_online_cpu(cpu) { | 384 | for_each_online_cpu(cpu) { |
| 342 | p = create_workqueue_thread(wq, cpu); | 385 | p = create_workqueue_thread(wq, cpu); |
| 343 | if (p) { | 386 | if (p) { |
| @@ -347,7 +390,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
| 347 | destroy = 1; | 390 | destroy = 1; |
| 348 | } | 391 | } |
| 349 | } | 392 | } |
| 350 | unlock_cpu_hotplug(); | 393 | mutex_unlock(&workqueue_mutex); |
| 351 | 394 | ||
| 352 | /* | 395 | /* |
| 353 | * Was there any error during startup? If yes then clean up: | 396 | * Was there any error during startup? If yes then clean up: |
| @@ -358,6 +401,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
| 358 | } | 401 | } |
| 359 | return wq; | 402 | return wq; |
| 360 | } | 403 | } |
| 404 | EXPORT_SYMBOL_GPL(__create_workqueue); | ||
| 361 | 405 | ||
| 362 | static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) | 406 | static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) |
| 363 | { | 407 | { |
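__create_workqueue() is normally reached through the create_workqueue() / create_singlethread_workqueue() wrappers in <linux/workqueue.h>. A minimal, hypothetical setup path built on those wrappers, with error handling trimmed to the essentials:

#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *my_wq;

static int my_subsys_init(void)
{
	/*
	 * create_workqueue() starts one worker thread per online CPU;
	 * create_singlethread_workqueue() starts exactly one thread.
	 * Both return NULL on failure.
	 */
	my_wq = create_singlethread_workqueue("my_wq");
	if (!my_wq)
		return -ENOMEM;
	return 0;
}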
| @@ -374,6 +418,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) | |||
| 374 | kthread_stop(p); | 418 | kthread_stop(p); |
| 375 | } | 419 | } |
| 376 | 420 | ||
| 421 | /** | ||
| 422 | * destroy_workqueue - safely terminate a workqueue | ||
| 423 | * @wq: target workqueue | ||
| 424 | * | ||
| 425 | * Safely destroy a workqueue. All work currently pending will be done first. | ||
| 426 | */ | ||
| 377 | void destroy_workqueue(struct workqueue_struct *wq) | 427 | void destroy_workqueue(struct workqueue_struct *wq) |
| 378 | { | 428 | { |
| 379 | int cpu; | 429 | int cpu; |
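Because destroy_workqueue() flushes before tearing the worker threads down, a caller only has to guarantee that nothing new gets queued; work already submitted still runs. A hypothetical teardown continuing the init sketch above:

static void my_subsys_exit(void)
{
	/*
	 * Nothing new may be queued on my_wq past this point.
	 * destroy_workqueue() flushes all pending work itself before
	 * stopping the worker thread(s) and freeing the structure.
	 */
	destroy_workqueue(my_wq);
	my_wq = NULL;
}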
| @@ -381,69 +431,94 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
| 381 | flush_workqueue(wq); | 431 | flush_workqueue(wq); |
| 382 | 432 | ||
| 383 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 433 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
| 384 | lock_cpu_hotplug(); | 434 | mutex_lock(&workqueue_mutex); |
| 385 | if (is_single_threaded(wq)) | 435 | if (is_single_threaded(wq)) |
| 386 | cleanup_workqueue_thread(wq, singlethread_cpu); | 436 | cleanup_workqueue_thread(wq, singlethread_cpu); |
| 387 | else { | 437 | else { |
| 388 | for_each_online_cpu(cpu) | 438 | for_each_online_cpu(cpu) |
| 389 | cleanup_workqueue_thread(wq, cpu); | 439 | cleanup_workqueue_thread(wq, cpu); |
| 390 | spin_lock(&workqueue_lock); | ||
| 391 | list_del(&wq->list); | 440 | list_del(&wq->list); |
| 392 | spin_unlock(&workqueue_lock); | ||
| 393 | } | 441 | } |
| 394 | unlock_cpu_hotplug(); | 442 | mutex_unlock(&workqueue_mutex); |
| 395 | free_percpu(wq->cpu_wq); | 443 | free_percpu(wq->cpu_wq); |
| 396 | kfree(wq); | 444 | kfree(wq); |
| 397 | } | 445 | } |
| 446 | EXPORT_SYMBOL_GPL(destroy_workqueue); | ||
| 398 | 447 | ||
| 399 | static struct workqueue_struct *keventd_wq; | 448 | static struct workqueue_struct *keventd_wq; |
| 400 | 449 | ||
| 450 | /** | ||
| 451 | * schedule_work - put work task in global workqueue | ||
| 452 | * @work: job to be done | ||
| 453 | * | ||
| 454 | * This puts a job in the kernel-global workqueue. | ||
| 455 | */ | ||
| 401 | int fastcall schedule_work(struct work_struct *work) | 456 | int fastcall schedule_work(struct work_struct *work) |
| 402 | { | 457 | { |
| 403 | return queue_work(keventd_wq, work); | 458 | return queue_work(keventd_wq, work); |
| 404 | } | 459 | } |
| 460 | EXPORT_SYMBOL(schedule_work); | ||
| 405 | 461 | ||
| 462 | /** | ||
| 463 | * schedule_delayed_work - put work task in global workqueue after delay | ||
| 464 | * @work: job to be done | ||
| 465 | * @delay: number of jiffies to wait | ||
| 466 | * | ||
| 467 | * After waiting for a given time this puts a job in the kernel-global | ||
| 468 | * workqueue. | ||
| 469 | */ | ||
| 406 | int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) | 470 | int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) |
| 407 | { | 471 | { |
| 408 | return queue_delayed_work(keventd_wq, work, delay); | 472 | return queue_delayed_work(keventd_wq, work, delay); |
| 409 | } | 473 | } |
| 474 | EXPORT_SYMBOL(schedule_delayed_work); | ||
| 410 | 475 | ||
| 476 | /** | ||
| 477 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay | ||
| 478 | * @cpu: cpu to use | ||
| 479 | * @work: job to be done | ||
| 480 | * @delay: number of jiffies to wait | ||
| 481 | * | ||
| 482 | * After waiting for a given time this puts a job in the kernel-global | ||
| 483 | * workqueue on the specified CPU. | ||
| 484 | */ | ||
| 411 | int schedule_delayed_work_on(int cpu, | 485 | int schedule_delayed_work_on(int cpu, |
| 412 | struct work_struct *work, unsigned long delay) | 486 | struct work_struct *work, unsigned long delay) |
| 413 | { | 487 | { |
| 414 | int ret = 0; | 488 | return queue_delayed_work_on(cpu, keventd_wq, work, delay); |
| 415 | struct timer_list *timer = &work->timer; | ||
| 416 | |||
| 417 | if (!test_and_set_bit(0, &work->pending)) { | ||
| 418 | BUG_ON(timer_pending(timer)); | ||
| 419 | BUG_ON(!list_empty(&work->entry)); | ||
| 420 | /* This stores keventd_wq for the moment, for the timer_fn */ | ||
| 421 | work->wq_data = keventd_wq; | ||
| 422 | timer->expires = jiffies + delay; | ||
| 423 | timer->data = (unsigned long)work; | ||
| 424 | timer->function = delayed_work_timer_fn; | ||
| 425 | add_timer_on(timer, cpu); | ||
| 426 | ret = 1; | ||
| 427 | } | ||
| 428 | return ret; | ||
| 429 | } | 489 | } |
| 490 | EXPORT_SYMBOL(schedule_delayed_work_on); | ||
| 430 | 491 | ||
| 431 | int schedule_on_each_cpu(void (*func) (void *info), void *info) | 492 | /** |
| 493 | * schedule_on_each_cpu - call a function on each online CPU from keventd | ||
| 494 | * @func: the function to call | ||
| 495 | * @info: a pointer to pass to func() | ||
| 496 | * | ||
| 497 | * Returns zero on success. | ||
| 498 | * Returns -ve errno on failure. | ||
| 499 | * | ||
| 500 | * Appears to be racy against CPU hotplug. | ||
| 501 | * | ||
| 502 | * schedule_on_each_cpu() is very slow. | ||
| 503 | */ | ||
| 504 | int schedule_on_each_cpu(void (*func)(void *info), void *info) | ||
| 432 | { | 505 | { |
| 433 | int cpu; | 506 | int cpu; |
| 434 | struct work_struct *work; | 507 | struct work_struct *works; |
| 435 | |||
| 436 | work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); | ||
| 437 | 508 | ||
| 438 | if (!work) | 509 | works = alloc_percpu(struct work_struct); |
| 510 | if (!works) | ||
| 439 | return -ENOMEM; | 511 | return -ENOMEM; |
| 512 | |||
| 513 | mutex_lock(&workqueue_mutex); | ||
| 440 | for_each_online_cpu(cpu) { | 514 | for_each_online_cpu(cpu) { |
| 441 | INIT_WORK(work + cpu, func, info); | 515 | INIT_WORK(per_cpu_ptr(works, cpu), func, info); |
| 442 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), | 516 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), |
| 443 | work + cpu); | 517 | per_cpu_ptr(works, cpu)); |
| 444 | } | 518 | } |
| 519 | mutex_unlock(&workqueue_mutex); | ||
| 445 | flush_workqueue(keventd_wq); | 520 | flush_workqueue(keventd_wq); |
| 446 | kfree(work); | 521 | free_percpu(works); |
| 447 | return 0; | 522 | return 0; |
| 448 | } | 523 | } |
| 449 | 524 | ||
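The hunk above reroutes the keventd helpers through the generic queue_*() primitives and documents them. A rough usage sketch of the kernel-global workqueue API, with hypothetical work functions and the three-argument DECLARE_WORK()/INIT_WORK() of this era:

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>

static void led_blink(void *data)
{
	printk(KERN_DEBUG "led blink\n");
}
static DECLARE_WORK(led_work, led_blink, NULL);

static struct work_struct poll_work;

static void dev_poll(void *data)
{
	/* periodic housekeeping, runs in keventd process context */
}

static void cache_drain(void *unused)
{
	/* runs once on every online CPU via keventd */
}

static void example(void)
{
	schedule_work(&led_work);		/* run soon in keventd */

	INIT_WORK(&poll_work, dev_poll, NULL);
	schedule_delayed_work(&poll_work, HZ);	/* run about one second later */

	schedule_on_each_cpu(cache_drain, NULL);/* queue on every CPU, then wait */
}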
| @@ -451,6 +526,7 @@ void flush_scheduled_work(void) | |||
| 451 | { | 526 | { |
| 452 | flush_workqueue(keventd_wq); | 527 | flush_workqueue(keventd_wq); |
| 453 | } | 528 | } |
| 529 | EXPORT_SYMBOL(flush_scheduled_work); | ||
| 454 | 530 | ||
| 455 | /** | 531 | /** |
| 456 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed | 532 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed |
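The kernel-doc starting above covers cancel_rearming_delayed_workqueue(), the helper for delayed work that requeues itself from its own handler. The usual shutdown pattern against the global queue, sketched with a hypothetical self-rearming poller (assuming the cancel_rearming_delayed_work() wrapper of this era):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct work_struct poll_work;

static void poll_fn(void *data)
{
	/* ... do the periodic work, then rearm ... */
	schedule_delayed_work(&poll_work, HZ);
}

static void poller_start(void)
{
	INIT_WORK(&poll_work, poll_fn, NULL);
	schedule_delayed_work(&poll_work, HZ);
}

static void poller_stop(void)
{
	/*
	 * cancel_rearming_delayed_work() keeps cancelling and flushing
	 * keventd until the work can no longer requeue itself; a plain
	 * cancel_delayed_work() could race with the rearm in poll_fn().
	 */
	cancel_rearming_delayed_work(&poll_work);
}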
| @@ -547,7 +623,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | |||
| 547 | } | 623 | } |
| 548 | 624 | ||
| 549 | /* We're holding the cpucontrol mutex here */ | 625 | /* We're holding the cpucontrol mutex here */ |
| 550 | static int workqueue_cpu_callback(struct notifier_block *nfb, | 626 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, |
| 551 | unsigned long action, | 627 | unsigned long action, |
| 552 | void *hcpu) | 628 | void *hcpu) |
| 553 | { | 629 | { |
| @@ -556,6 +632,7 @@ static int workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 556 | 632 | ||
| 557 | switch (action) { | 633 | switch (action) { |
| 558 | case CPU_UP_PREPARE: | 634 | case CPU_UP_PREPARE: |
| 635 | mutex_lock(&workqueue_mutex); | ||
| 559 | /* Create a new workqueue thread for it. */ | 636 | /* Create a new workqueue thread for it. */ |
| 560 | list_for_each_entry(wq, &workqueues, list) { | 637 | list_for_each_entry(wq, &workqueues, list) { |
| 561 | if (!create_workqueue_thread(wq, hotcpu)) { | 638 | if (!create_workqueue_thread(wq, hotcpu)) { |
| @@ -574,15 +651,27 @@ static int workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 574 | kthread_bind(cwq->thread, hotcpu); | 651 | kthread_bind(cwq->thread, hotcpu); |
| 575 | wake_up_process(cwq->thread); | 652 | wake_up_process(cwq->thread); |
| 576 | } | 653 | } |
| 654 | mutex_unlock(&workqueue_mutex); | ||
| 577 | break; | 655 | break; |
| 578 | 656 | ||
| 579 | case CPU_UP_CANCELED: | 657 | case CPU_UP_CANCELED: |
| 580 | list_for_each_entry(wq, &workqueues, list) { | 658 | list_for_each_entry(wq, &workqueues, list) { |
| 659 | if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) | ||
| 660 | continue; | ||
| 581 | /* Unbind so it can run. */ | 661 | /* Unbind so it can run. */ |
| 582 | kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, | 662 | kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, |
| 583 | any_online_cpu(cpu_online_map)); | 663 | any_online_cpu(cpu_online_map)); |
| 584 | cleanup_workqueue_thread(wq, hotcpu); | 664 | cleanup_workqueue_thread(wq, hotcpu); |
| 585 | } | 665 | } |
| 666 | mutex_unlock(&workqueue_mutex); | ||
| 667 | break; | ||
| 668 | |||
| 669 | case CPU_DOWN_PREPARE: | ||
| 670 | mutex_lock(&workqueue_mutex); | ||
| 671 | break; | ||
| 672 | |||
| 673 | case CPU_DOWN_FAILED: | ||
| 674 | mutex_unlock(&workqueue_mutex); | ||
| 586 | break; | 675 | break; |
| 587 | 676 | ||
| 588 | case CPU_DEAD: | 677 | case CPU_DEAD: |
| @@ -590,6 +679,7 @@ static int workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 590 | cleanup_workqueue_thread(wq, hotcpu); | 679 | cleanup_workqueue_thread(wq, hotcpu); |
| 591 | list_for_each_entry(wq, &workqueues, list) | 680 | list_for_each_entry(wq, &workqueues, list) |
| 592 | take_over_work(wq, hotcpu); | 681 | take_over_work(wq, hotcpu); |
| 682 | mutex_unlock(&workqueue_mutex); | ||
| 593 | break; | 683 | break; |
| 594 | } | 684 | } |
| 595 | 685 | ||
| @@ -605,13 +695,3 @@ void init_workqueues(void) | |||
| 605 | BUG_ON(!keventd_wq); | 695 | BUG_ON(!keventd_wq); |
| 606 | } | 696 | } |
| 607 | 697 | ||
| 608 | EXPORT_SYMBOL_GPL(__create_workqueue); | ||
| 609 | EXPORT_SYMBOL_GPL(queue_work); | ||
| 610 | EXPORT_SYMBOL_GPL(queue_delayed_work); | ||
| 611 | EXPORT_SYMBOL_GPL(flush_workqueue); | ||
| 612 | EXPORT_SYMBOL_GPL(destroy_workqueue); | ||
| 613 | |||
| 614 | EXPORT_SYMBOL(schedule_work); | ||
| 615 | EXPORT_SYMBOL(schedule_delayed_work); | ||
| 616 | EXPORT_SYMBOL(schedule_delayed_work_on); | ||
| 617 | EXPORT_SYMBOL(flush_scheduled_work); | ||
