Diffstat (limited to 'kernel')
87 files changed, 14238 insertions, 2773 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 58908f9d156a..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,20 +8,30 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
-	    hrtimer.o
+	    hrtimer.o rwsem.o
 
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-$(CONFIG_LOCKDEP) += lockdep.o
+ifeq ($(CONFIG_PROC_FS),y)
+obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
+endif
 obj-$(CONFIG_FUTEX) += futex.o
 ifeq ($(CONFIG_COMPAT),y)
 obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_STACK_UNWIND) += unwind.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
@@ -38,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index b327f4d20104..2a7c933651c7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -43,7 +43,6 @@
  * a struct file opened for write. Fixed. 2/6/2000, AV.
  */
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/acct.h>
@@ -75,7 +74,7 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(long, struct file *);
+static void do_acct_process(struct file *);
 
 /*
  * This structure is used so that all the data protected by lock
@@ -118,7 +117,7 @@ static int check_free_space(struct file *file)
 	spin_unlock(&acct_globals.lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+	if (vfs_statfs(file->f_dentry, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
@@ -196,7 +195,7 @@ static void acct_file_reopen(struct file *file)
 	if (old_acct) {
 		mnt_unpin(old_acct->f_vfsmnt);
 		spin_unlock(&acct_globals.lock);
-		do_acct_process(0, old_acct);
+		do_acct_process(old_acct);
 		filp_close(old_acct, NULL);
 		spin_lock(&acct_globals.lock);
 	}
@@ -419,16 +418,15 @@ static u32 encode_float(u64 value)
 /*
  * do_acct_process does all actual work. Caller holds the reference to file.
  */
-static void do_acct_process(long exitcode, struct file *file)
+static void do_acct_process(struct file *file)
 {
+	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
 	mm_segment_t fs;
-	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
-	unsigned long jiffies;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -469,12 +467,6 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	jiffies = cputime_to_jiffies(cputime_add(current->utime,
-						 current->signal->utime));
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
-	jiffies = cputime_to_jiffies(cputime_add(current->stime,
-						 current->signal->stime));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
@@ -496,37 +488,18 @@ static void do_acct_process(long exitcode, struct file *file)
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
 
-	ac.ac_flag = 0;
-	if (current->flags & PF_FORKNOEXEC)
-		ac.ac_flag |= AFORK;
-	if (current->flags & PF_SUPERPRIV)
-		ac.ac_flag |= ASU;
-	if (current->flags & PF_DUMPCORE)
-		ac.ac_flag |= ACORE;
-	if (current->flags & PF_SIGNALED)
-		ac.ac_flag |= AXSIG;
-
-	vsize = 0;
-	if (current->mm) {
-		struct vm_area_struct *vma;
-		down_read(&current->mm->mmap_sem);
-		vma = current->mm->mmap;
-		while (vma) {
-			vsize += vma->vm_end - vma->vm_start;
-			vma = vma->vm_next;
-		}
-		up_read(&current->mm->mmap_sem);
-	}
-	vsize = vsize / 1024;
-	ac.ac_mem = encode_comp_t(vsize);
+	spin_lock_irq(&current->sighand->siglock);
+	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+	ac.ac_flag = pacct->ac_flag;
+	ac.ac_mem = encode_comp_t(pacct->ac_mem);
+	ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
+	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
+	ac.ac_exitcode = pacct->ac_exitcode;
+	spin_unlock_irq(&current->sighand->siglock);
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
-	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
-				     current->min_flt);
-	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
-				     current->maj_flt);
 	ac.ac_swaps = encode_comp_t(0);
-	ac.ac_exitcode = exitcode;
 
 	/*
 	 * Kernel segment override to datasegment and write it
@@ -546,12 +519,64 @@ static void do_acct_process(long exitcode, struct file *file)
 }
 
 /**
+ * acct_init_pacct - initialize a new pacct_struct
+ * @pacct: per-process accounting info struct to initialize
+ */
+void acct_init_pacct(struct pacct_struct *pacct)
+{
+	memset(pacct, 0, sizeof(struct pacct_struct));
+	pacct->ac_utime = pacct->ac_stime = cputime_zero;
+}
+
+/**
+ * acct_collect - collect accounting information into pacct_struct
+ * @exitcode: task exit code
+ * @group_dead: not 0, if this thread is the last one in the process.
+ */
+void acct_collect(long exitcode, int group_dead)
+{
+	struct pacct_struct *pacct = &current->signal->pacct;
+	unsigned long vsize = 0;
+
+	if (group_dead && current->mm) {
+		struct vm_area_struct *vma;
+		down_read(&current->mm->mmap_sem);
+		vma = current->mm->mmap;
+		while (vma) {
+			vsize += vma->vm_end - vma->vm_start;
+			vma = vma->vm_next;
+		}
+		up_read(&current->mm->mmap_sem);
+	}
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (group_dead)
+		pacct->ac_mem = vsize / 1024;
+	if (thread_group_leader(current)) {
+		pacct->ac_exitcode = exitcode;
+		if (current->flags & PF_FORKNOEXEC)
+			pacct->ac_flag |= AFORK;
+	}
+	if (current->flags & PF_SUPERPRIV)
+		pacct->ac_flag |= ASU;
+	if (current->flags & PF_DUMPCORE)
+		pacct->ac_flag |= ACORE;
+	if (current->flags & PF_SIGNALED)
+		pacct->ac_flag |= AXSIG;
+	pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
+	pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+	pacct->ac_minflt += current->min_flt;
+	pacct->ac_majflt += current->maj_flt;
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+/**
  * acct_process - now just a wrapper around do_acct_process
  * @exitcode: task exit code
  *
  * handles process accounting for an exiting task
  */
-void acct_process(long exitcode)
+void acct_process(void)
 {
 	struct file *file = NULL;
 
@@ -570,7 +595,7 @@ void acct_process(long exitcode)
 	get_file(file);
 	spin_unlock(&acct_globals.lock);
 
-	do_acct_process(exitcode, file);
+	do_acct_process(file);
 	fput(file);
 }
 
@@ -599,9 +624,7 @@ void acct_update_integrals(struct task_struct *tsk)
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-	if (tsk) {
-		tsk->acct_stimexpd = 0;
-		tsk->acct_rss_mem1 = 0;
-		tsk->acct_vm_mem1 = 0;
-	}
+	tsk->acct_stimexpd = 0;
+	tsk->acct_rss_mem1 = 0;
+	tsk->acct_vm_mem1 = 0;
 }
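Note on the acct.c hunks above: the patch replaces the old exit-time bookkeeping in do_acct_process() with acct_collect(), which folds each exiting thread's CPU times, page-fault counts, flags, and exit code into signal->pacct under the siglock, so the final writer only encodes an already-aggregated record. The standalone user-space C sketch below models that accumulate-then-encode split; the field names mirror pacct_struct from the diff, but the pthread mutex and helper names are illustrative stand-ins, not kernel code.

/* Minimal model of the split introduced by the patch: threads fold their
 * counters into one shared record (like acct_collect), and the writer
 * only formats that record (like do_acct_process). Build: cc model.c -lpthread */
#include <stdio.h>
#include <pthread.h>

struct pacct_like {
	unsigned long ac_utime, ac_stime;   /* accumulated CPU time */
	unsigned long ac_minflt, ac_majflt; /* accumulated page faults */
	long ac_exitcode;                   /* exit code of the group leader */
};

static struct pacct_like pacct;
static pthread_mutex_t pacct_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called once per exiting thread, analogous to acct_collect(). */
static void collect(unsigned long utime, unsigned long stime,
		    unsigned long minflt, unsigned long majflt, long exitcode)
{
	pthread_mutex_lock(&pacct_lock);
	pacct.ac_utime += utime;
	pacct.ac_stime += stime;
	pacct.ac_minflt += minflt;
	pacct.ac_majflt += majflt;
	pacct.ac_exitcode = exitcode;
	pthread_mutex_unlock(&pacct_lock);
}

/* Called once when the accounting record is written, analogous to do_acct_process(). */
static void write_record(void)
{
	pthread_mutex_lock(&pacct_lock);
	printf("utime=%lu stime=%lu minflt=%lu majflt=%lu exit=%ld\n",
	       pacct.ac_utime, pacct.ac_stime,
	       pacct.ac_minflt, pacct.ac_majflt, pacct.ac_exitcode);
	pthread_mutex_unlock(&pacct_lock);
}

int main(void)
{
	collect(10, 2, 100, 1, 0);  /* first thread exits */
	collect(7, 3, 40, 0, 0);    /* group leader exits */
	write_record();             /* single aggregated record */
	return 0;
}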
diff --git a/kernel/audit.c b/kernel/audit.c
index df57b493e1cb..d417ca1db79b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,6 +56,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/selinux.h>
+#include <linux/inotify.h>
 
 #include "audit.h"
 
@@ -89,6 +90,7 @@ static int audit_backlog_wait_overflow = 0;
 /* The identity of the user shutting down the audit system. */
 uid_t audit_sig_uid = -1;
 pid_t audit_sig_pid = -1;
+u32 audit_sig_sid = 0;
 
 /* Records can be lost in several ways:
    0) [suppressed in audit_alloc]
@@ -102,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
 /* The netlink socket. */
 static struct sock *audit_sock;
 
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/* Hash for inode-based rules */
+struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
 /* The audit_freelist is a list of pre-allocated audit buffers (if more
  * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
  * being placed on the freelist). */
@@ -114,10 +122,8 @@ static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
-/* The netlink socket is only to be read by 1 CPU, which lets us assume
- * that list additions and deletions never happen simultaneously in
- * auditsc.c */
-DEFINE_MUTEX(audit_netlink_mutex);
+/* Serialize requests from userspace. */
+static DEFINE_MUTEX(audit_cmd_mutex);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -250,7 +256,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
 			"audit_rate_limit=%d old=%d by auid=%u",
 			limit, old, loginuid);
 	audit_rate_limit = limit;
-	return old;
+	return 0;
 }
 
 static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
@@ -273,7 +279,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
 			"audit_backlog_limit=%d old=%d by auid=%u",
 			limit, old, loginuid);
 	audit_backlog_limit = limit;
-	return old;
+	return 0;
 }
 
 static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
@@ -299,7 +305,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 			"audit_enabled=%d old=%d by auid=%u",
 			state, old, loginuid);
 	audit_enabled = state;
-	return old;
+	return 0;
 }
 
 static int audit_set_failure(int state, uid_t loginuid, u32 sid)
@@ -327,7 +333,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 			"audit_failure=%d old=%d by auid=%u",
 			state, old, loginuid);
 	audit_failure = state;
-	return old;
+	return 0;
 }
 
 static int kauditd_thread(void *dummy)
@@ -363,9 +369,52 @@ static int kauditd_thread(void *dummy)
 			remove_wait_queue(&kauditd_wait, &wait);
 		}
 	}
+}
+
+int audit_send_list(void *_dest)
+{
+	struct audit_netlink_list *dest = _dest;
+	int pid = dest->pid;
+	struct sk_buff *skb;
+
+	/* wait for parent to finish and send an ACK */
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+
+	while ((skb = __skb_dequeue(&dest->q)) != NULL)
+		netlink_unicast(audit_sock, skb, pid, 0);
+
+	kfree(dest);
+
 	return 0;
 }
 
+struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
+				 int multi, void *payload, int size)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	int len = NLMSG_SPACE(size);
+	void *data;
+	int flags = multi ? NLM_F_MULTI : 0;
+	int t = done ? NLMSG_DONE : type;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	nlh = NLMSG_PUT(skb, pid, seq, t, size);
+	nlh->nlmsg_flags = flags;
+	data = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+	return skb;
+
+nlmsg_failure:			/* Used by NLMSG_PUT */
+	if (skb)
+		kfree_skb(skb);
+	return NULL;
+}
+
 /**
  * audit_send_reply - send an audit reply message via netlink
  * @pid: process id to send reply to
@@ -383,36 +432,20 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
 			     void *payload, int size)
 {
 	struct sk_buff *skb;
-	struct nlmsghdr *nlh;
-	int len = NLMSG_SPACE(size);
-	void *data;
-	int flags = multi ? NLM_F_MULTI : 0;
-	int t = done ? NLMSG_DONE : type;
-
-	skb = alloc_skb(len, GFP_KERNEL);
+	skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
 	if (!skb)
 		return;
-
-	nlh = NLMSG_PUT(skb, pid, seq, t, size);
-	nlh->nlmsg_flags = flags;
-	data = NLMSG_DATA(nlh);
-	memcpy(data, payload, size);
-
 	/* Ignore failure. It'll only happen if the sender goes away,
 	   because our timeout is set to infinite. */
 	netlink_unicast(audit_sock, skb, pid, 0);
 	return;
-
-nlmsg_failure:			/* Used by NLMSG_PUT */
-	if (skb)
-		kfree_skb(skb);
 }
 
 /*
  * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
  * control messages.
  */
-static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
+static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 {
 	int err = 0;
 
@@ -426,13 +459,13 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 	case AUDIT_DEL:
 	case AUDIT_DEL_RULE:
 	case AUDIT_SIGNAL_INFO:
-		if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
+		if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
 			err = -EPERM;
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
 	case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
-		if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
+		if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
 			err = -EPERM;
 		break;
 	default: /* bad msg */
@@ -451,9 +484,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct audit_buffer *ab;
 	u16 msg_type = nlh->nlmsg_type;
 	uid_t loginuid; /* loginuid of sender */
-	struct audit_sig_info sig_data;
+	struct audit_sig_info *sig_data;
+	char *ctx;
+	u32 len;
 
-	err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
+	err = audit_netlink_ok(skb, msg_type);
 	if (err)
 		return err;
 
@@ -503,12 +538,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old = audit_pid;
 			if (sid) {
-				char *ctx = NULL;
-				u32 len;
-				int rc;
-				if ((rc = selinux_ctxid_to_string(
+				if ((err = selinux_ctxid_to_string(
 						sid, &ctx, &len)))
-					return rc;
+					return err;
 				else
 					audit_log(NULL, GFP_KERNEL,
 						AUDIT_CONFIG_CHANGE,
@@ -523,10 +555,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_pid = status_get->pid;
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
-			audit_set_rate_limit(status_get->rate_limit,
+			err = audit_set_rate_limit(status_get->rate_limit,
 					 loginuid, sid);
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
-			audit_set_backlog_limit(status_get->backlog_limit,
+			err = audit_set_backlog_limit(status_get->backlog_limit,
 					loginuid, sid);
 		break;
 	case AUDIT_USER:
@@ -544,8 +576,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				 "user pid=%d uid=%u auid=%u",
 				 pid, uid, loginuid);
 			if (sid) {
-				char *ctx = NULL;
-				u32 len;
 				if (selinux_ctxid_to_string(
 						sid, &ctx, &len)) {
 					audit_log_format(ab,
@@ -584,10 +614,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 						loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
-		sig_data.uid = audit_sig_uid;
-		sig_data.pid = audit_sig_pid;
+		err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+		if (err)
+			return err;
+		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+		if (!sig_data) {
+			kfree(ctx);
+			return -ENOMEM;
+		}
+		sig_data->uid = audit_sig_uid;
+		sig_data->pid = audit_sig_pid;
+		memcpy(sig_data->ctx, ctx, len);
+		kfree(ctx);
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
-				0, 0, &sig_data, sizeof(sig_data));
+				0, 0, sig_data, sizeof(*sig_data) + len);
+		kfree(sig_data);
 		break;
 	default:
 		err = -EINVAL;
@@ -629,20 +670,30 @@ static void audit_receive(struct sock *sk, int length)
 	struct sk_buff *skb;
 	unsigned int qlen;
 
-	mutex_lock(&audit_netlink_mutex);
+	mutex_lock(&audit_cmd_mutex);
 
 	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		audit_receive_skb(skb);
 		kfree_skb(skb);
 	}
-	mutex_unlock(&audit_netlink_mutex);
+	mutex_unlock(&audit_cmd_mutex);
 }
 
+#ifdef CONFIG_AUDITSYSCALL
+static const struct inotify_operations audit_inotify_ops = {
+	.handle_event	= audit_handle_ievent,
+	.destroy_watch	= audit_free_parent,
+};
+#endif
 
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
 {
+#ifdef CONFIG_AUDITSYSCALL
+	int i;
+#endif
+
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
 	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
@@ -661,6 +712,16 @@ static int __init audit_init(void)
 	selinux_audit_set_callback(&selinux_audit_rule_update);
 
 	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
+#ifdef CONFIG_AUDITSYSCALL
+	audit_ih = inotify_init(&audit_inotify_ops);
+	if (IS_ERR(audit_ih))
+		audit_panic("cannot initialize inotify handle");
+
+	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+		INIT_LIST_HEAD(&audit_inode_hash[i]);
+#endif
+
 	return 0;
 }
 __initcall(audit_init);
@@ -690,10 +751,12 @@ static void audit_buffer_free(struct audit_buffer *ab)
 		kfree_skb(ab->skb);
 
 	spin_lock_irqsave(&audit_freelist_lock, flags);
-	if (++audit_freelist_count > AUDIT_MAXFREE)
+	if (audit_freelist_count > AUDIT_MAXFREE)
 		kfree(ab);
-	else
+	else {
+		audit_freelist_count++;
 		list_add(&ab->list, &audit_freelist);
+	}
 	spin_unlock_irqrestore(&audit_freelist_lock, flags);
 }
 
@@ -755,7 +818,7 @@ err:
  */
 unsigned int audit_serial(void)
 {
-	static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(serial_lock);
 	static unsigned int serial = 0;
 
 	unsigned long flags;
@@ -988,28 +1051,76 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
 	skb_put(skb, len << 1); /* new string is twice the old string */
 }
 
+/*
+ * Format a string of no more than slen characters into the audit buffer,
+ * enclosed in quote marks.
+ */
+static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
+			       const char *string)
+{
+	int avail, new_len;
+	unsigned char *ptr;
+	struct sk_buff *skb;
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	new_len = slen + 3;	/* enclosing quotes + null terminator */
+	if (new_len > avail) {
+		avail = audit_expand(ab, new_len);
+		if (!avail)
+			return;
+	}
+	ptr = skb->tail;
+	*ptr++ = '"';
+	memcpy(ptr, string, slen);
+	ptr += slen;
+	*ptr++ = '"';
+	*ptr = 0;
+	skb_put(skb, slen + 2);	/* don't include null terminator */
+}
+
 /**
- * audit_log_unstrustedstring - log a string that may contain random characters
+ * audit_log_n_unstrustedstring - log a string that may contain random characters
  * @ab: audit_buffer
+ * @len: lenth of string (not including trailing null)
  * @string: string to be logged
  *
  * This code will escape a string that is passed to it if the string
  * contains a control character, unprintable character, double quote mark,
  * or a space. Unescaped strings will start and end with a double quote mark.
  * Strings that are escaped are printed in hex (2 digits per char).
+ *
+ * The caller specifies the number of characters in the string to log, which may
+ * or may not be the entire string.
  */
-void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
+					const char *string)
 {
 	const unsigned char *p = string;
 
 	while (*p) {
 		if (*p == '"' || *p < 0x21 || *p > 0x7f) {
-			audit_log_hex(ab, string, strlen(string));
-			return;
+			audit_log_hex(ab, string, len);
+			return string + len + 1;
 		}
 		p++;
 	}
-	audit_log_format(ab, "\"%s\"", string);
+	audit_log_n_string(ab, len, string);
+	return p + 1;
+}
+
+/**
+ * audit_log_unstrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * Same as audit_log_n_unstrustedstring(), except that strlen is used to
+ * determine string length.
+ */
+const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+	return audit_log_n_untrustedstring(ab, strlen(string), string);
 }
 
 /* This is a helper-function to print the escaped d_path */
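Note on the audit_log_n_untrustedstring()/audit_log_n_string() pair added above: a string is hex-escaped (two digits per byte) if it contains a double quote, a byte below 0x21, or a byte above 0x7f; otherwise it is emitted inside double quotes. The small standalone C program below reproduces only that decision rule; the function name and printf-based output are illustrative, not the kernel interface.

/* Sketch of the quoting-vs-hex-escaping rule used by the new helpers. */
#include <stdio.h>
#include <string.h>

static void log_untrusted(const char *s, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++) {
		unsigned char c = s[i];
		if (c == '"' || c < 0x21 || c > 0x7f) {
			/* escape the whole string as hex, two digits per byte */
			for (i = 0; i < len; i++)
				printf("%02X", (unsigned char)s[i]);
			putchar('\n');
			return;
		}
	}
	/* no suspicious bytes: emit the string enclosed in double quotes */
	printf("\"%.*s\"\n", (int)len, s);
}

int main(void)
{
	log_untrusted("passwd", strlen("passwd"));       /* prints "passwd" */
	log_untrusted("evil name", strlen("evil name")); /* space forces hex */
	return 0;
}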
diff --git a/kernel/audit.h b/kernel/audit.h
index 6f733920fd32..6aa33b848cf2 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -19,9 +19,9 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
-#include <linux/mutex.h>
 #include <linux/fs.h>
 #include <linux/audit.h>
+#include <linux/skbuff.h>
 
 /* 0 = no checking
    1 = put_count checking
@@ -53,6 +53,18 @@ enum audit_state {
 };
 
 /* Rule lists */
+struct audit_parent;
+
+struct audit_watch {
+	atomic_t count;		/* reference count */
+	char *path;		/* insertion path */
+	dev_t dev;		/* associated superblock device */
+	unsigned long ino;	/* associated inode number */
+	struct audit_parent *parent; /* associated parent */
+	struct list_head wlist;	/* entry in parent->watches list */
+	struct list_head rules;	/* associated rules */
+};
+
 struct audit_field {
 	u32 type;
 	u32 val;
@@ -69,7 +81,11 @@ struct audit_krule {
 	u32 mask[AUDIT_BITMASK_SIZE];
 	u32 buflen; /* for data alloc on list rules */
 	u32 field_count;
+	char *filterkey; /* ties events to rules */
 	struct audit_field *fields;
+	struct audit_field *inode_f; /* quick access to an inode field */
+	struct audit_watch *watch; /* associated watch */
+	struct list_head rlist;	/* entry in audit_watch.rules list */
 };
 
 struct audit_entry {
@@ -78,15 +94,53 @@ struct audit_entry {
 	struct audit_krule rule;
 };
 
-
 extern int audit_pid;
-extern int audit_comparator(const u32 left, const u32 op, const u32 right);
 
+#define AUDIT_INODE_BUCKETS	32
+extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+static inline int audit_hash_ino(u32 ino)
+{
+	return (ino & (AUDIT_INODE_BUCKETS-1));
+}
+
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+extern int audit_compare_dname_path(const char *dname, const char *path,
+				    int *dirlen);
+extern struct sk_buff *audit_make_reply(int pid, int seq, int type,
+					int done, int multi,
+					void *payload, int size);
 extern void audit_send_reply(int pid, int seq, int type,
 			     int done, int multi,
 			     void *payload, int size);
 extern void audit_log_lost(const char *message);
 extern void audit_panic(const char *message);
-extern struct mutex audit_netlink_mutex;
 
+struct audit_netlink_list {
+	int pid;
+	struct sk_buff_head q;
+};
+
+int audit_send_list(void *);
+
+struct inotify_watch;
+extern void audit_free_parent(struct inotify_watch *);
+extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
+				const char *, struct inode *);
 extern int selinux_audit_rule_update(void);
+
+#ifdef CONFIG_AUDITSYSCALL
+extern void __audit_signal_info(int sig, struct task_struct *t);
+static inline void audit_signal_info(int sig, struct task_struct *t)
+{
+	if (unlikely(audit_pid && t->tgid == audit_pid))
+		__audit_signal_info(sig, t);
+}
+extern enum audit_state audit_filter_inodes(struct task_struct *,
+					    struct audit_context *);
+extern void audit_set_auditable(struct audit_context *);
+#else
+#define audit_signal_info(s,t)
+#define audit_filter_inodes(t,c) AUDIT_DISABLED
+#define audit_set_auditable(c)
+#endif
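Note on the audit.h hunks above: inode-based rules are now chained into audit_inode_hash[], whose bucket count is a power of two so audit_hash_ino() can select a bucket with a mask instead of a modulo. A minimal standalone C illustration of that bucketing follows; the sample inode numbers are arbitrary.

/* With a power-of-two bucket count, "ino & (buckets - 1)" equals
 * "ino % buckets" and picks the list head a rule is hashed onto. */
#include <stdio.h>

#define AUDIT_INODE_BUCKETS 32

static int audit_hash_ino(unsigned int ino)
{
	return ino & (AUDIT_INODE_BUCKETS - 1);
}

int main(void)
{
	unsigned int inodes[] = { 2, 31, 32, 4097 };  /* 4 sample inode numbers */

	for (int i = 0; i < 4; i++)
		printf("ino %u -> bucket %d\n", inodes[i], audit_hash_ino(inodes[i]));
	return 0;
}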
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7c134906d689..5b4e16276ca0 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -22,13 +22,59 @@
 #include <linux/kernel.h>
 #include <linux/audit.h>
 #include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
 #include <linux/selinux.h>
 #include "audit.h"
 
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
+/*
+ * Locking model:
+ *
+ * audit_filter_mutex:
+ * 		Synchronizes writes and blocking reads of audit's filterlist
+ * 		data.  Rcu is used to traverse the filterlist and access
+ * 		contents of structs audit_entry, audit_watch and opaque
+ * 		selinux rules during filtering.  If modified, these structures
+ * 		must be copied and replace their counterparts in the filterlist.
+ * 		An audit_parent struct is not accessed during filtering, so may
+ * 		be written directly provided audit_filter_mutex is held.
+ */
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * 	event.  Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * 	audit_remove_watch().  Additionally, an audit_watch may exist
+ * 	temporarily to assist in searching existing filter data.  Each
+ * 	audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_parent {
+	struct list_head ilist;		/* entry in inotify registration list */
+	struct list_head watches;	/* associated watches */
+	struct inotify_watch wdata;	/* inotify watch data */
+	unsigned flags;			/* status flags */
+};
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID	0x001
+
+/* Audit filter lists, defined in <linux/audit.h> */
 struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[0]),
 	LIST_HEAD_INIT(audit_filter_list[1]),
@@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 #endif
 };
 
+static DEFINE_MUTEX(audit_filter_mutex);
+
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+void audit_free_parent(struct inotify_watch *i_watch)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+	WARN_ON(!list_empty(&parent->watches));
+	kfree(parent);
+}
+
+static inline void audit_get_watch(struct audit_watch *watch)
+{
+	atomic_inc(&watch->count);
+}
+
+static void audit_put_watch(struct audit_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		WARN_ON(watch->parent);
+		WARN_ON(!list_empty(&watch->rules));
+		kfree(watch->path);
+		kfree(watch);
+	}
+}
+
+static void audit_remove_watch(struct audit_watch *watch)
+{
+	list_del(&watch->wlist);
+	put_inotify_watch(&watch->parent->wdata);
+	watch->parent = NULL;
+	audit_put_watch(watch); /* match initial get */
+}
+
 static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
+
+	/* some rules don't have associated watches */
+	if (e->rule.watch)
+		audit_put_watch(e->rule.watch);
 	if (e->rule.fields)
 		for (i = 0; i < e->rule.field_count; i++) {
 			struct audit_field *f = &e->rule.fields[i];
@@ -51,6 +141,7 @@ static inline void audit_free_rule(struct audit_entry *e)
 			selinux_audit_rule_free(f->se_rule);
 		}
 	kfree(e->rule.fields);
+	kfree(e->rule.filterkey);
 	kfree(e);
 }
 
@@ -60,6 +151,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head)
 	audit_free_rule(e);
 }
 
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+{
+	struct audit_parent *parent;
+	s32 wd;
+
+	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+	if (unlikely(!parent))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&parent->watches);
+	parent->flags = 0;
+
+	inotify_init_watch(&parent->wdata);
+	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
+	get_inotify_watch(&parent->wdata);
+	wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
+			       AUDIT_IN_WATCH);
+	if (wd < 0) {
+		audit_free_parent(&parent->wdata);
+		return ERR_PTR(wd);
+	}
+
+	return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+	struct audit_watch *watch;
+
+	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+	if (unlikely(!watch))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&watch->rules);
+	atomic_set(&watch->count, 1);
+	watch->path = path;
+	watch->dev = (dev_t)-1;
+	watch->ino = (unsigned long)-1;
+
+	return watch;
+}
+
 /* Initialize an audit filterlist entry. */
 static inline struct audit_entry *audit_init_entry(u32 field_count)
 {
@@ -107,6 +242,66 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
 	return str;
 }
 
+/* Translate an inode field to kernel respresentation. */
+static inline int audit_to_inode(struct audit_krule *krule,
+				 struct audit_field *f)
+{
+	if (krule->listnr != AUDIT_FILTER_EXIT ||
+	    krule->watch || krule->inode_f)
+		return -EINVAL;
+
+	krule->inode_f = f;
+	return 0;
+}
+
+/* Translate a watch string to kernel respresentation. */
+static int audit_to_watch(struct audit_krule *krule, char *path, int len,
+			  u32 op)
+{
+	struct audit_watch *watch;
+
+	if (!audit_ih)
+		return -EOPNOTSUPP;
+
+	if (path[0] != '/' || path[len-1] == '/' ||
+	    krule->listnr != AUDIT_FILTER_EXIT ||
+	    op & ~AUDIT_EQUAL ||
+	    krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */
+		return -EINVAL;
+
+	watch = audit_init_watch(path);
+	if (unlikely(IS_ERR(watch)))
+		return PTR_ERR(watch);
+
+	audit_get_watch(watch);
+	krule->watch = watch;
+
+	return 0;
+}
+
+static __u32 *classes[AUDIT_SYSCALL_CLASSES];
+
+int __init audit_register_class(int class, unsigned *list)
+{
+	__u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	while (*list != ~0U) {
+		unsigned n = *list++;
+		if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
+			kfree(p);
+			return -EINVAL;
+		}
+		p[AUDIT_WORD(n)] |= AUDIT_BIT(n);
+	}
+	if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) {
+		kfree(p);
+		return -EINVAL;
+	}
+	classes[class] = p;
+	return 0;
+}
+
 /* Common user-space to kernel rule translation. */
 static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
 {
@@ -128,8 +323,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
 #endif
 		;
 	}
-	if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE &&
-	    rule->action != AUDIT_ALWAYS)
+	if (unlikely(rule->action == AUDIT_POSSIBLE)) {
+		printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
+		goto exit_err;
+	}
+	if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
 		goto exit_err;
 	if (rule->field_count > AUDIT_MAX_FIELDS)
 		goto exit_err;
@@ -147,6 +345,22 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
 		entry->rule.mask[i] = rule->mask[i];
 
+	for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) {
+		int bit = AUDIT_BITMASK_SIZE * 32 - i - 1;
+		__u32 *p = &entry->rule.mask[AUDIT_WORD(bit)];
+		__u32 *class;
+
+		if (!(*p & AUDIT_BIT(bit)))
+			continue;
+		*p &= ~AUDIT_BIT(bit);
+		class = classes[i];
+		if (class) {
+			int j;
+			for (j = 0; j < AUDIT_BITMASK_SIZE; j++)
+				entry->rule.mask[j] |= class[j];
+		}
+	}
+
 	return entry;
 
 exit_err:
@@ -158,6 +372,7 @@ exit_err:
 static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 {
 	struct audit_entry *entry;
+	struct audit_field *f;
 	int err = 0;
 	int i;
 
@@ -172,14 +387,37 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
 		f->val = rule->values[i];
 
-		if (f->type & AUDIT_UNUSED_BITS ||
-		    f->type == AUDIT_SE_USER ||
-		    f->type == AUDIT_SE_ROLE ||
-		    f->type == AUDIT_SE_TYPE ||
-		    f->type == AUDIT_SE_SEN ||
-		    f->type == AUDIT_SE_CLR) {
-			err = -EINVAL;
+		err = -EINVAL;
+		switch(f->type) {
+		default:
 			goto exit_free;
+		case AUDIT_PID:
+		case AUDIT_UID:
+		case AUDIT_EUID:
+		case AUDIT_SUID:
+		case AUDIT_FSUID:
+		case AUDIT_GID:
+		case AUDIT_EGID:
+		case AUDIT_SGID:
+		case AUDIT_FSGID:
+		case AUDIT_LOGINUID:
+		case AUDIT_PERS:
+		case AUDIT_ARCH:
+		case AUDIT_MSGTYPE:
+		case AUDIT_DEVMAJOR:
+		case AUDIT_DEVMINOR:
+		case AUDIT_EXIT:
+		case AUDIT_SUCCESS:
+		case AUDIT_ARG0:
+		case AUDIT_ARG1:
+		case AUDIT_ARG2:
+		case AUDIT_ARG3:
+			break;
+		case AUDIT_INODE:
+			err = audit_to_inode(&entry->rule, f);
+			if (err)
+				goto exit_free;
+			break;
 		}
 
 		entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
@@ -196,6 +434,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		}
 	}
 
+	f = entry->rule.inode_f;
+	if (f) {
+		switch(f->op) {
+		case AUDIT_NOT_EQUAL:
+			entry->rule.inode_f = NULL;
+		case AUDIT_EQUAL:
+			break;
+		default:
+			goto exit_free;
+		}
+	}
+
 exit_nofree:
 	return entry;
 
@@ -210,6 +460,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 {
 	int err = 0;
 	struct audit_entry *entry;
+	struct audit_field *f;
 	void *bufp;
 	size_t remain = datasz - sizeof(struct audit_rule_data);
 	int i;
@@ -235,11 +486,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 		f->se_str = NULL;
 		f->se_rule = NULL;
 		switch(f->type) {
-		case AUDIT_SE_USER:
-		case AUDIT_SE_ROLE:
-		case AUDIT_SE_TYPE:
-		case AUDIT_SE_SEN:
-		case AUDIT_SE_CLR:
+		case AUDIT_PID:
+		case AUDIT_UID:
+		case AUDIT_EUID:
+		case AUDIT_SUID:
+		case AUDIT_FSUID:
+		case AUDIT_GID:
+		case AUDIT_EGID:
+		case AUDIT_SGID:
+		case AUDIT_FSGID:
+		case AUDIT_LOGINUID:
+		case AUDIT_PERS:
+		case AUDIT_ARCH:
+		case AUDIT_MSGTYPE:
+		case AUDIT_PPID:
+		case AUDIT_DEVMAJOR:
+		case AUDIT_DEVMINOR:
+		case AUDIT_EXIT:
+		case AUDIT_SUCCESS:
+		case AUDIT_ARG0:
+		case AUDIT_ARG1:
+		case AUDIT_ARG2:
+		case AUDIT_ARG3:
+			break;
+		case AUDIT_SUBJ_USER:
+		case AUDIT_SUBJ_ROLE:
+		case AUDIT_SUBJ_TYPE:
+		case AUDIT_SUBJ_SEN:
+		case AUDIT_SUBJ_CLR:
+		case AUDIT_OBJ_USER:
+		case AUDIT_OBJ_ROLE:
+		case AUDIT_OBJ_TYPE:
+		case AUDIT_OBJ_LEV_LOW:
+		case AUDIT_OBJ_LEV_HIGH:
 			str = audit_unpack_string(&bufp, &remain, f->val);
 			if (IS_ERR(str))
 				goto exit_free;
@@ -260,6 +539,47 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 			} else
 				f->se_str = str;
 			break;
+		case AUDIT_WATCH:
+			str = audit_unpack_string(&bufp, &remain, f->val);
+			if (IS_ERR(str))
+				goto exit_free;
+			entry->rule.buflen += f->val;
+
+			err = audit_to_watch(&entry->rule, str, f->val, f->op);
+			if (err) {
+				kfree(str);
+				goto exit_free;
+			}
+			break;
+		case AUDIT_INODE:
+			err = audit_to_inode(&entry->rule, f);
+			if (err)
+				goto exit_free;
+			break;
+		case AUDIT_FILTERKEY:
+			err = -EINVAL;
+			if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+				goto exit_free;
+			str = audit_unpack_string(&bufp, &remain, f->val);
+			if (IS_ERR(str))
+				goto exit_free;
+			entry->rule.buflen += f->val;
+			entry->rule.filterkey = str;
+			break;
+		default:
+			goto exit_free;
+		}
+	}
+
+	f = entry->rule.inode_f;
+	if (f) {
+		switch(f->op) {
+		case AUDIT_NOT_EQUAL:
+			entry->rule.inode_f = NULL;
+		case AUDIT_EQUAL:
+			break;
+		default:
+			goto exit_free;
 		}
 	}
 
@@ -291,7 +611,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
 
 	rule = kmalloc(sizeof(*rule), GFP_KERNEL);
 	if (unlikely(!rule))
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 	memset(rule, 0, sizeof(*rule));
 
 	rule->flags = krule->flags | krule->listnr;
@@ -322,7 +642,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
 
 	data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
 	if (unlikely(!data))
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 	memset(data, 0, sizeof(*data));
 
 	data->flags = krule->flags | krule->listnr;
@@ -335,14 +655,27 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | |||
335 | data->fields[i] = f->type; | 655 | data->fields[i] = f->type; |
336 | data->fieldflags[i] = f->op; | 656 | data->fieldflags[i] = f->op; |
337 | switch(f->type) { | 657 | switch(f->type) { |
338 | case AUDIT_SE_USER: | 658 | case AUDIT_SUBJ_USER: |
339 | case AUDIT_SE_ROLE: | 659 | case AUDIT_SUBJ_ROLE: |
340 | case AUDIT_SE_TYPE: | 660 | case AUDIT_SUBJ_TYPE: |
341 | case AUDIT_SE_SEN: | 661 | case AUDIT_SUBJ_SEN: |
342 | case AUDIT_SE_CLR: | 662 | case AUDIT_SUBJ_CLR: |
663 | case AUDIT_OBJ_USER: | ||
664 | case AUDIT_OBJ_ROLE: | ||
665 | case AUDIT_OBJ_TYPE: | ||
666 | case AUDIT_OBJ_LEV_LOW: | ||
667 | case AUDIT_OBJ_LEV_HIGH: | ||
343 | data->buflen += data->values[i] = | 668 | data->buflen += data->values[i] = |
344 | audit_pack_string(&bufp, f->se_str); | 669 | audit_pack_string(&bufp, f->se_str); |
345 | break; | 670 | break; |
671 | case AUDIT_WATCH: | ||
672 | data->buflen += data->values[i] = | ||
673 | audit_pack_string(&bufp, krule->watch->path); | ||
674 | break; | ||
675 | case AUDIT_FILTERKEY: | ||
676 | data->buflen += data->values[i] = | ||
677 | audit_pack_string(&bufp, krule->filterkey); | ||
678 | break; | ||
346 | default: | 679 | default: |
347 | data->values[i] = f->val; | 680 | data->values[i] = f->val; |
348 | } | 681 | } |
@@ -370,14 +703,28 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
370 | return 1; | 703 | return 1; |
371 | 704 | ||
372 | switch(a->fields[i].type) { | 705 | switch(a->fields[i].type) { |
373 | case AUDIT_SE_USER: | 706 | case AUDIT_SUBJ_USER: |
374 | case AUDIT_SE_ROLE: | 707 | case AUDIT_SUBJ_ROLE: |
375 | case AUDIT_SE_TYPE: | 708 | case AUDIT_SUBJ_TYPE: |
376 | case AUDIT_SE_SEN: | 709 | case AUDIT_SUBJ_SEN: |
377 | case AUDIT_SE_CLR: | 710 | case AUDIT_SUBJ_CLR: |
711 | case AUDIT_OBJ_USER: | ||
712 | case AUDIT_OBJ_ROLE: | ||
713 | case AUDIT_OBJ_TYPE: | ||
714 | case AUDIT_OBJ_LEV_LOW: | ||
715 | case AUDIT_OBJ_LEV_HIGH: | ||
378 | if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) | 716 | if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) |
379 | return 1; | 717 | return 1; |
380 | break; | 718 | break; |
719 | case AUDIT_WATCH: | ||
720 | if (strcmp(a->watch->path, b->watch->path)) | ||
721 | return 1; | ||
722 | break; | ||
723 | case AUDIT_FILTERKEY: | ||
724 | /* both filterkeys exist based on above type compare */ | ||
725 | if (strcmp(a->filterkey, b->filterkey)) | ||
726 | return 1; | ||
727 | break; | ||
381 | default: | 728 | default: |
382 | if (a->fields[i].val != b->fields[i].val) | 729 | if (a->fields[i].val != b->fields[i].val) |
383 | return 1; | 730 | return 1; |
@@ -391,6 +738,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
391 | return 0; | 738 | return 0; |
392 | } | 739 | } |
393 | 740 | ||
741 | /* Duplicate the given audit watch. The new watch's rules list is initialized | ||
742 | * to an empty list and wlist is undefined. */ | ||
743 | static struct audit_watch *audit_dupe_watch(struct audit_watch *old) | ||
744 | { | ||
745 | char *path; | ||
746 | struct audit_watch *new; | ||
747 | |||
748 | path = kstrdup(old->path, GFP_KERNEL); | ||
749 | if (unlikely(!path)) | ||
750 | return ERR_PTR(-ENOMEM); | ||
751 | |||
752 | new = audit_init_watch(path); | ||
753 | if (unlikely(IS_ERR(new))) { | ||
754 | kfree(path); | ||
755 | goto out; | ||
756 | } | ||
757 | |||
758 | new->dev = old->dev; | ||
759 | new->ino = old->ino; | ||
760 | get_inotify_watch(&old->parent->wdata); | ||
761 | new->parent = old->parent; | ||
762 | |||
763 | out: | ||
764 | return new; | ||
765 | } | ||
766 | |||
394 | /* Duplicate selinux field information. The se_rule is opaque, so must be | 767 | /* Duplicate selinux field information. The se_rule is opaque, so must be |
395 | * re-initialized. */ | 768 | * re-initialized. */ |
396 | static inline int audit_dupe_selinux_field(struct audit_field *df, | 769 | static inline int audit_dupe_selinux_field(struct audit_field *df, |
@@ -422,12 +795,16 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, | |||
422 | /* Duplicate an audit rule. This will be a deep copy with the exception | 795 | /* Duplicate an audit rule. This will be a deep copy with the exception |
423 | * of the watch - that pointer is carried over. The selinux specific fields | 796 | * of the watch - that pointer is carried over. The selinux specific fields |
424 | * will be updated in the copy. The point is to be able to replace the old | 797 | * will be updated in the copy. The point is to be able to replace the old |
425 | * rule with the new rule in the filterlist, then free the old rule. */ | 798 | * rule with the new rule in the filterlist, then free the old rule. |
426 | static struct audit_entry *audit_dupe_rule(struct audit_krule *old) | 799 | * The rlist element is undefined; list manipulations are handled apart from |
800 | * the initial copy. */ | ||
801 | static struct audit_entry *audit_dupe_rule(struct audit_krule *old, | ||
802 | struct audit_watch *watch) | ||
427 | { | 803 | { |
428 | u32 fcount = old->field_count; | 804 | u32 fcount = old->field_count; |
429 | struct audit_entry *entry; | 805 | struct audit_entry *entry; |
430 | struct audit_krule *new; | 806 | struct audit_krule *new; |
807 | char *fk; | ||
431 | int i, err = 0; | 808 | int i, err = 0; |
432 | 809 | ||
433 | entry = audit_init_entry(fcount); | 810 | entry = audit_init_entry(fcount); |
@@ -442,6 +819,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) | |||
442 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | 819 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) |
443 | new->mask[i] = old->mask[i]; | 820 | new->mask[i] = old->mask[i]; |
444 | new->buflen = old->buflen; | 821 | new->buflen = old->buflen; |
822 | new->inode_f = old->inode_f; | ||
823 | new->watch = NULL; | ||
445 | new->field_count = old->field_count; | 824 | new->field_count = old->field_count; |
446 | memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); | 825 | memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); |
447 | 826 | ||
@@ -449,13 +828,25 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) | |||
449 | * the originals will all be freed when the old rule is freed. */ | 828 | * the originals will all be freed when the old rule is freed. */ |
450 | for (i = 0; i < fcount; i++) { | 829 | for (i = 0; i < fcount; i++) { |
451 | switch (new->fields[i].type) { | 830 | switch (new->fields[i].type) { |
452 | case AUDIT_SE_USER: | 831 | case AUDIT_SUBJ_USER: |
453 | case AUDIT_SE_ROLE: | 832 | case AUDIT_SUBJ_ROLE: |
454 | case AUDIT_SE_TYPE: | 833 | case AUDIT_SUBJ_TYPE: |
455 | case AUDIT_SE_SEN: | 834 | case AUDIT_SUBJ_SEN: |
456 | case AUDIT_SE_CLR: | 835 | case AUDIT_SUBJ_CLR: |
836 | case AUDIT_OBJ_USER: | ||
837 | case AUDIT_OBJ_ROLE: | ||
838 | case AUDIT_OBJ_TYPE: | ||
839 | case AUDIT_OBJ_LEV_LOW: | ||
840 | case AUDIT_OBJ_LEV_HIGH: | ||
457 | err = audit_dupe_selinux_field(&new->fields[i], | 841 | err = audit_dupe_selinux_field(&new->fields[i], |
458 | &old->fields[i]); | 842 | &old->fields[i]); |
843 | break; | ||
844 | case AUDIT_FILTERKEY: | ||
845 | fk = kstrdup(old->filterkey, GFP_KERNEL); | ||
846 | if (unlikely(!fk)) | ||
847 | err = -ENOMEM; | ||
848 | else | ||
849 | new->filterkey = fk; | ||
459 | } | 850 | } |
460 | if (err) { | 851 | if (err) { |
461 | audit_free_rule(entry); | 852 | audit_free_rule(entry); |
@@ -463,68 +854,409 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) | |||
463 | } | 854 | } |
464 | } | 855 | } |
465 | 856 | ||
857 | if (watch) { | ||
858 | audit_get_watch(watch); | ||
859 | new->watch = watch; | ||
860 | } | ||
861 | |||
466 | return entry; | 862 | return entry; |
467 | } | 863 | } |
468 | 864 | ||
469 | /* Add rule to given filterlist if not a duplicate. Protected by | 865 | /* Update inode info in audit rules based on filesystem event. */ |
470 | * audit_netlink_mutex. */ | 866 | static void audit_update_watch(struct audit_parent *parent, |
867 | const char *dname, dev_t dev, | ||
868 | unsigned long ino, unsigned invalidating) | ||
869 | { | ||
870 | struct audit_watch *owatch, *nwatch, *nextw; | ||
871 | struct audit_krule *r, *nextr; | ||
872 | struct audit_entry *oentry, *nentry; | ||
873 | struct audit_buffer *ab; | ||
874 | |||
875 | mutex_lock(&audit_filter_mutex); | ||
876 | list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { | ||
877 | if (audit_compare_dname_path(dname, owatch->path, NULL)) | ||
878 | continue; | ||
879 | |||
880 | /* If the update involves invalidating rules, do the inode-based | ||
881 | * filtering now, so we don't omit records. */ | ||
882 | if (invalidating && | ||
883 | audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) | ||
884 | audit_set_auditable(current->audit_context); | ||
885 | |||
886 | nwatch = audit_dupe_watch(owatch); | ||
887 | if (unlikely(IS_ERR(nwatch))) { | ||
888 | mutex_unlock(&audit_filter_mutex); | ||
889 | audit_panic("error updating watch, skipping"); | ||
890 | return; | ||
891 | } | ||
892 | nwatch->dev = dev; | ||
893 | nwatch->ino = ino; | ||
894 | |||
895 | list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { | ||
896 | |||
897 | oentry = container_of(r, struct audit_entry, rule); | ||
898 | list_del(&oentry->rule.rlist); | ||
899 | list_del_rcu(&oentry->list); | ||
900 | |||
901 | nentry = audit_dupe_rule(&oentry->rule, nwatch); | ||
902 | if (unlikely(IS_ERR(nentry))) | ||
903 | audit_panic("error updating watch, removing"); | ||
904 | else { | ||
905 | int h = audit_hash_ino((u32)ino); | ||
906 | list_add(&nentry->rule.rlist, &nwatch->rules); | ||
907 | list_add_rcu(&nentry->list, &audit_inode_hash[h]); | ||
908 | } | ||
909 | |||
910 | call_rcu(&oentry->rcu, audit_free_rule_rcu); | ||
911 | } | ||
912 | |||
913 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | ||
914 | audit_log_format(ab, "audit updated rules specifying watch="); | ||
915 | audit_log_untrustedstring(ab, owatch->path); | ||
916 | audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); | ||
917 | audit_log_end(ab); | ||
918 | |||
919 | audit_remove_watch(owatch); | ||
920 | goto add_watch_to_parent; /* event applies to a single watch */ | ||
921 | } | ||
922 | mutex_unlock(&audit_filter_mutex); | ||
923 | return; | ||
924 | |||
925 | add_watch_to_parent: | ||
926 | list_add(&nwatch->wlist, &parent->watches); | ||
927 | mutex_unlock(&audit_filter_mutex); | ||
928 | return; | ||
929 | } | ||
930 | |||
931 | /* Remove all watches & rules associated with a parent that is going away. */ | ||
932 | static void audit_remove_parent_watches(struct audit_parent *parent) | ||
933 | { | ||
934 | struct audit_watch *w, *nextw; | ||
935 | struct audit_krule *r, *nextr; | ||
936 | struct audit_entry *e; | ||
937 | |||
938 | mutex_lock(&audit_filter_mutex); | ||
939 | parent->flags |= AUDIT_PARENT_INVALID; | ||
940 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { | ||
941 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { | ||
942 | e = container_of(r, struct audit_entry, rule); | ||
943 | list_del(&r->rlist); | ||
944 | list_del_rcu(&e->list); | ||
945 | call_rcu(&e->rcu, audit_free_rule_rcu); | ||
946 | |||
947 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
948 | "audit implicitly removed rule from list=%d\n", | ||
949 | AUDIT_FILTER_EXIT); | ||
950 | } | ||
951 | audit_remove_watch(w); | ||
952 | } | ||
953 | mutex_unlock(&audit_filter_mutex); | ||
954 | } | ||
955 | |||
956 | /* Unregister inotify watches for parents on in_list. | ||
957 | * Generates an IN_IGNORED event. */ | ||
958 | static void audit_inotify_unregister(struct list_head *in_list) | ||
959 | { | ||
960 | struct audit_parent *p, *n; | ||
961 | |||
962 | list_for_each_entry_safe(p, n, in_list, ilist) { | ||
963 | list_del(&p->ilist); | ||
964 | inotify_rm_watch(audit_ih, &p->wdata); | ||
965 | /* the put matching the get in audit_do_del_rule() */ | ||
966 | put_inotify_watch(&p->wdata); | ||
967 | } | ||
968 | } | ||
969 | |||
970 | /* Find an existing audit rule. | ||
971 | * Caller must hold audit_filter_mutex to prevent stale rule data. */ | ||
972 | static struct audit_entry *audit_find_rule(struct audit_entry *entry, | ||
973 | struct list_head *list) | ||
974 | { | ||
975 | struct audit_entry *e, *found = NULL; | ||
976 | int h; | ||
977 | |||
978 | if (entry->rule.watch) { | ||
979 | /* we don't know the inode number, so must walk entire hash */ | ||
980 | for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { | ||
981 | list = &audit_inode_hash[h]; | ||
982 | list_for_each_entry(e, list, list) | ||
983 | if (!audit_compare_rule(&entry->rule, &e->rule)) { | ||
984 | found = e; | ||
985 | goto out; | ||
986 | } | ||
987 | } | ||
988 | goto out; | ||
989 | } | ||
990 | |||
991 | list_for_each_entry(e, list, list) | ||
992 | if (!audit_compare_rule(&entry->rule, &e->rule)) { | ||
993 | found = e; | ||
994 | goto out; | ||
995 | } | ||
996 | |||
997 | out: | ||
998 | return found; | ||
999 | } | ||
1000 | |||
1001 | /* Get path information necessary for adding watches. */ | ||
1002 | static int audit_get_nd(char *path, struct nameidata **ndp, | ||
1003 | struct nameidata **ndw) | ||
1004 | { | ||
1005 | struct nameidata *ndparent, *ndwatch; | ||
1006 | int err; | ||
1007 | |||
1008 | ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); | ||
1009 | if (unlikely(!ndparent)) | ||
1010 | return -ENOMEM; | ||
1011 | |||
1012 | ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); | ||
1013 | if (unlikely(!ndwatch)) { | ||
1014 | kfree(ndparent); | ||
1015 | return -ENOMEM; | ||
1016 | } | ||
1017 | |||
1018 | err = path_lookup(path, LOOKUP_PARENT, ndparent); | ||
1019 | if (err) { | ||
1020 | kfree(ndparent); | ||
1021 | kfree(ndwatch); | ||
1022 | return err; | ||
1023 | } | ||
1024 | |||
1025 | err = path_lookup(path, 0, ndwatch); | ||
1026 | if (err) { | ||
1027 | kfree(ndwatch); | ||
1028 | ndwatch = NULL; | ||
1029 | } | ||
1030 | |||
1031 | *ndp = ndparent; | ||
1032 | *ndw = ndwatch; | ||
1033 | |||
1034 | return 0; | ||
1035 | } | ||
1036 | |||
1037 | /* Release resources used for watch path information. */ | ||
1038 | static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) | ||
1039 | { | ||
1040 | if (ndp) { | ||
1041 | path_release(ndp); | ||
1042 | kfree(ndp); | ||
1043 | } | ||
1044 | if (ndw) { | ||
1045 | path_release(ndw); | ||
1046 | kfree(ndw); | ||
1047 | } | ||
1048 | } | ||
1049 | |||
1050 | /* Associate the given rule with an existing parent inotify_watch. | ||
1051 | * Caller must hold audit_filter_mutex. */ | ||
1052 | static void audit_add_to_parent(struct audit_krule *krule, | ||
1053 | struct audit_parent *parent) | ||
1054 | { | ||
1055 | struct audit_watch *w, *watch = krule->watch; | ||
1056 | int watch_found = 0; | ||
1057 | |||
1058 | list_for_each_entry(w, &parent->watches, wlist) { | ||
1059 | if (strcmp(watch->path, w->path)) | ||
1060 | continue; | ||
1061 | |||
1062 | watch_found = 1; | ||
1063 | |||
1064 | /* put krule's and initial refs to temporary watch */ | ||
1065 | audit_put_watch(watch); | ||
1066 | audit_put_watch(watch); | ||
1067 | |||
1068 | audit_get_watch(w); | ||
1069 | krule->watch = watch = w; | ||
1070 | break; | ||
1071 | } | ||
1072 | |||
1073 | if (!watch_found) { | ||
1074 | get_inotify_watch(&parent->wdata); | ||
1075 | watch->parent = parent; | ||
1076 | |||
1077 | list_add(&watch->wlist, &parent->watches); | ||
1078 | } | ||
1079 | list_add(&krule->rlist, &watch->rules); | ||
1080 | } | ||
1081 | |||
1082 | /* Find a matching watch entry, or add this one. | ||
1083 | * Caller must hold audit_filter_mutex. */ | ||
1084 | static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, | ||
1085 | struct nameidata *ndw) | ||
1086 | { | ||
1087 | struct audit_watch *watch = krule->watch; | ||
1088 | struct inotify_watch *i_watch; | ||
1089 | struct audit_parent *parent; | ||
1090 | int ret = 0; | ||
1091 | |||
1092 | /* update watch filter fields */ | ||
1093 | if (ndw) { | ||
1094 | watch->dev = ndw->dentry->d_inode->i_sb->s_dev; | ||
1095 | watch->ino = ndw->dentry->d_inode->i_ino; | ||
1096 | } | ||
1097 | |||
1098 | /* The audit_filter_mutex must not be held during inotify calls because | ||
1099 | * we hold it during inotify event callback processing. If an existing | ||
1100 | * inotify watch is found, inotify_find_watch() grabs a reference before | ||
1101 | * returning. | ||
1102 | */ | ||
1103 | mutex_unlock(&audit_filter_mutex); | ||
1104 | |||
1105 | if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) { | ||
1106 | parent = audit_init_parent(ndp); | ||
1107 | if (IS_ERR(parent)) { | ||
1108 | /* caller expects mutex locked */ | ||
1109 | mutex_lock(&audit_filter_mutex); | ||
1110 | return PTR_ERR(parent); | ||
1111 | } | ||
1112 | } else | ||
1113 | parent = container_of(i_watch, struct audit_parent, wdata); | ||
1114 | |||
1115 | mutex_lock(&audit_filter_mutex); | ||
1116 | |||
1117 | /* parent was moved before we took audit_filter_mutex */ | ||
1118 | if (parent->flags & AUDIT_PARENT_INVALID) | ||
1119 | ret = -ENOENT; | ||
1120 | else | ||
1121 | audit_add_to_parent(krule, parent); | ||
1122 | |||
1123 | /* match get in audit_init_parent or inotify_find_watch */ | ||
1124 | put_inotify_watch(&parent->wdata); | ||
1125 | return ret; | ||
1126 | } | ||
1127 | |||
1128 | /* Add rule to given filterlist if not a duplicate. */ | ||
471 | static inline int audit_add_rule(struct audit_entry *entry, | 1129 | static inline int audit_add_rule(struct audit_entry *entry, |
472 | struct list_head *list) | 1130 | struct list_head *list) |
473 | { | 1131 | { |
474 | struct audit_entry *e; | 1132 | struct audit_entry *e; |
1133 | struct audit_field *inode_f = entry->rule.inode_f; | ||
1134 | struct audit_watch *watch = entry->rule.watch; | ||
1135 | struct nameidata *ndp, *ndw; | ||
1136 | int h, err, putnd_needed = 0; | ||
1137 | |||
1138 | if (inode_f) { | ||
1139 | h = audit_hash_ino(inode_f->val); | ||
1140 | list = &audit_inode_hash[h]; | ||
1141 | } | ||
475 | 1142 | ||
476 | /* Do not use the _rcu iterator here, since this is the only | 1143 | mutex_lock(&audit_filter_mutex); |
477 | * addition routine. */ | 1144 | e = audit_find_rule(entry, list); |
478 | list_for_each_entry(e, list, list) { | 1145 | mutex_unlock(&audit_filter_mutex); |
479 | if (!audit_compare_rule(&entry->rule, &e->rule)) | 1146 | if (e) { |
480 | return -EEXIST; | 1147 | err = -EEXIST; |
1148 | goto error; | ||
1149 | } | ||
1150 | |||
1151 | /* Avoid calling path_lookup under audit_filter_mutex. */ | ||
1152 | if (watch) { | ||
1153 | err = audit_get_nd(watch->path, &ndp, &ndw); | ||
1154 | if (err) | ||
1155 | goto error; | ||
1156 | putnd_needed = 1; | ||
1157 | } | ||
1158 | |||
1159 | mutex_lock(&audit_filter_mutex); | ||
1160 | if (watch) { | ||
1161 | /* audit_filter_mutex is dropped and re-taken during this call */ | ||
1162 | err = audit_add_watch(&entry->rule, ndp, ndw); | ||
1163 | if (err) { | ||
1164 | mutex_unlock(&audit_filter_mutex); | ||
1165 | goto error; | ||
1166 | } | ||
1167 | h = audit_hash_ino((u32)watch->ino); | ||
1168 | list = &audit_inode_hash[h]; | ||
481 | } | 1169 | } |
482 | 1170 | ||
483 | if (entry->rule.flags & AUDIT_FILTER_PREPEND) { | 1171 | if (entry->rule.flags & AUDIT_FILTER_PREPEND) { |
484 | list_add_rcu(&entry->list, list); | 1172 | list_add_rcu(&entry->list, list); |
1173 | entry->rule.flags &= ~AUDIT_FILTER_PREPEND; | ||
485 | } else { | 1174 | } else { |
486 | list_add_tail_rcu(&entry->list, list); | 1175 | list_add_tail_rcu(&entry->list, list); |
487 | } | 1176 | } |
1177 | mutex_unlock(&audit_filter_mutex); | ||
488 | 1178 | ||
489 | return 0; | 1179 | if (putnd_needed) |
1180 | audit_put_nd(ndp, ndw); | ||
1181 | |||
1182 | return 0; | ||
1183 | |||
1184 | error: | ||
1185 | if (putnd_needed) | ||
1186 | audit_put_nd(ndp, ndw); | ||
1187 | if (watch) | ||
1188 | audit_put_watch(watch); /* tmp watch, matches initial get */ | ||
1189 | return err; | ||
490 | } | 1190 | } |
491 | 1191 | ||
492 | /* Remove an existing rule from filterlist. Protected by | 1192 | /* Remove an existing rule from filterlist. */ |
493 | * audit_netlink_mutex. */ | ||
494 | static inline int audit_del_rule(struct audit_entry *entry, | 1193 | static inline int audit_del_rule(struct audit_entry *entry, |
495 | struct list_head *list) | 1194 | struct list_head *list) |
496 | { | 1195 | { |
497 | struct audit_entry *e; | 1196 | struct audit_entry *e; |
1197 | struct audit_field *inode_f = entry->rule.inode_f; | ||
1198 | struct audit_watch *watch, *tmp_watch = entry->rule.watch; | ||
1199 | LIST_HEAD(inotify_list); | ||
1200 | int h, ret = 0; | ||
1201 | |||
1202 | if (inode_f) { | ||
1203 | h = audit_hash_ino(inode_f->val); | ||
1204 | list = &audit_inode_hash[h]; | ||
1205 | } | ||
498 | 1206 | ||
499 | /* Do not use the _rcu iterator here, since this is the only | 1207 | mutex_lock(&audit_filter_mutex); |
500 | * deletion routine. */ | 1208 | e = audit_find_rule(entry, list); |
501 | list_for_each_entry(e, list, list) { | 1209 | if (!e) { |
502 | if (!audit_compare_rule(&entry->rule, &e->rule)) { | 1210 | mutex_unlock(&audit_filter_mutex); |
503 | list_del_rcu(&e->list); | 1211 | ret = -ENOENT; |
504 | call_rcu(&e->rcu, audit_free_rule_rcu); | 1212 | goto out; |
505 | return 0; | 1213 | } |
1214 | |||
1215 | watch = e->rule.watch; | ||
1216 | if (watch) { | ||
1217 | struct audit_parent *parent = watch->parent; | ||
1218 | |||
1219 | list_del(&e->rule.rlist); | ||
1220 | |||
1221 | if (list_empty(&watch->rules)) { | ||
1222 | audit_remove_watch(watch); | ||
1223 | |||
1224 | if (list_empty(&parent->watches)) { | ||
1225 | /* Put parent on the inotify un-registration | ||
1226 | * list. Grab a reference before releasing | ||
1227 | * audit_filter_mutex, to be released in | ||
1228 | * audit_inotify_unregister(). */ | ||
1229 | list_add(&parent->ilist, &inotify_list); | ||
1230 | get_inotify_watch(&parent->wdata); | ||
1231 | } | ||
506 | } | 1232 | } |
507 | } | 1233 | } |
508 | return -ENOENT; /* No matching rule */ | 1234 | |
1235 | list_del_rcu(&e->list); | ||
1236 | call_rcu(&e->rcu, audit_free_rule_rcu); | ||
1237 | |||
1238 | mutex_unlock(&audit_filter_mutex); | ||
1239 | |||
1240 | if (!list_empty(&inotify_list)) | ||
1241 | audit_inotify_unregister(&inotify_list); | ||
1242 | |||
1243 | out: | ||
1244 | if (tmp_watch) | ||
1245 | audit_put_watch(tmp_watch); /* match initial get */ | ||
1246 | |||
1247 | return ret; | ||
509 | } | 1248 | } |
510 | 1249 | ||
511 | /* List rules using struct audit_rule. Exists for backward | 1250 | /* List rules using struct audit_rule. Exists for backward |
512 | * compatibility with userspace. */ | 1251 | * compatibility with userspace. */ |
513 | static int audit_list(void *_dest) | 1252 | static void audit_list(int pid, int seq, struct sk_buff_head *q) |
514 | { | 1253 | { |
515 | int pid, seq; | 1254 | struct sk_buff *skb; |
516 | int *dest = _dest; | ||
517 | struct audit_entry *entry; | 1255 | struct audit_entry *entry; |
518 | int i; | 1256 | int i; |
519 | 1257 | ||
520 | pid = dest[0]; | 1258 | /* This is a blocking read, so use audit_filter_mutex instead of rcu |
521 | seq = dest[1]; | 1259 | * iterator to sync with list writers. */ |
522 | kfree(dest); | ||
523 | |||
524 | mutex_lock(&audit_netlink_mutex); | ||
525 | |||
526 | /* The *_rcu iterators not needed here because we are | ||
527 | always called with audit_netlink_mutex held. */ | ||
528 | for (i=0; i<AUDIT_NR_FILTERS; i++) { | 1260 | for (i=0; i<AUDIT_NR_FILTERS; i++) { |
529 | list_for_each_entry(entry, &audit_filter_list[i], list) { | 1261 | list_for_each_entry(entry, &audit_filter_list[i], list) { |
530 | struct audit_rule *rule; | 1262 | struct audit_rule *rule; |
@@ -532,33 +1264,41 @@ static int audit_list(void *_dest) | |||
532 | rule = audit_krule_to_rule(&entry->rule); | 1264 | rule = audit_krule_to_rule(&entry->rule); |
533 | if (unlikely(!rule)) | 1265 | if (unlikely(!rule)) |
534 | break; | 1266 | break; |
535 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | 1267 | skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, |
536 | rule, sizeof(*rule)); | 1268 | rule, sizeof(*rule)); |
1269 | if (skb) | ||
1270 | skb_queue_tail(q, skb); | ||
537 | kfree(rule); | 1271 | kfree(rule); |
538 | } | 1272 | } |
539 | } | 1273 | } |
540 | audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); | 1274 | for (i = 0; i < AUDIT_INODE_BUCKETS; i++) { |
541 | 1275 | list_for_each_entry(entry, &audit_inode_hash[i], list) { | |
542 | mutex_unlock(&audit_netlink_mutex); | 1276 | struct audit_rule *rule; |
543 | return 0; | 1277 | |
1278 | rule = audit_krule_to_rule(&entry->rule); | ||
1279 | if (unlikely(!rule)) | ||
1280 | break; | ||
1281 | skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, | ||
1282 | rule, sizeof(*rule)); | ||
1283 | if (skb) | ||
1284 | skb_queue_tail(q, skb); | ||
1285 | kfree(rule); | ||
1286 | } | ||
1287 | } | ||
1288 | skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); | ||
1289 | if (skb) | ||
1290 | skb_queue_tail(q, skb); | ||
544 | } | 1291 | } |
545 | 1292 | ||
546 | /* List rules using struct audit_rule_data. */ | 1293 | /* List rules using struct audit_rule_data. */ |
547 | static int audit_list_rules(void *_dest) | 1294 | static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) |
548 | { | 1295 | { |
549 | int pid, seq; | 1296 | struct sk_buff *skb; |
550 | int *dest = _dest; | ||
551 | struct audit_entry *e; | 1297 | struct audit_entry *e; |
552 | int i; | 1298 | int i; |
553 | 1299 | ||
554 | pid = dest[0]; | 1300 | /* This is a blocking read, so use audit_filter_mutex instead of rcu |
555 | seq = dest[1]; | 1301 | * iterator to sync with list writers. */ |
556 | kfree(dest); | ||
557 | |||
558 | mutex_lock(&audit_netlink_mutex); | ||
559 | |||
560 | /* The *_rcu iterators not needed here because we are | ||
561 | always called with audit_netlink_mutex held. */ | ||
562 | for (i=0; i<AUDIT_NR_FILTERS; i++) { | 1302 | for (i=0; i<AUDIT_NR_FILTERS; i++) { |
563 | list_for_each_entry(e, &audit_filter_list[i], list) { | 1303 | list_for_each_entry(e, &audit_filter_list[i], list) { |
564 | struct audit_rule_data *data; | 1304 | struct audit_rule_data *data; |
@@ -566,15 +1306,58 @@ static int audit_list_rules(void *_dest) | |||
566 | data = audit_krule_to_data(&e->rule); | 1306 | data = audit_krule_to_data(&e->rule); |
567 | if (unlikely(!data)) | 1307 | if (unlikely(!data)) |
568 | break; | 1308 | break; |
569 | audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, | 1309 | skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, |
570 | data, sizeof(*data)); | 1310 | data, sizeof(*data) + data->buflen); |
1311 | if (skb) | ||
1312 | skb_queue_tail(q, skb); | ||
571 | kfree(data); | 1313 | kfree(data); |
572 | } | 1314 | } |
573 | } | 1315 | } |
574 | audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); | 1316 | for (i=0; i< AUDIT_INODE_BUCKETS; i++) { |
1317 | list_for_each_entry(e, &audit_inode_hash[i], list) { | ||
1318 | struct audit_rule_data *data; | ||
575 | 1319 | ||
576 | mutex_unlock(&audit_netlink_mutex); | 1320 | data = audit_krule_to_data(&e->rule); |
577 | return 0; | 1321 | if (unlikely(!data)) |
1322 | break; | ||
1323 | skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, | ||
1324 | data, sizeof(*data) + data->buflen); | ||
1325 | if (skb) | ||
1326 | skb_queue_tail(q, skb); | ||
1327 | kfree(data); | ||
1328 | } | ||
1329 | } | ||
1330 | skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); | ||
1331 | if (skb) | ||
1332 | skb_queue_tail(q, skb); | ||
1333 | } | ||
1334 | |||
1335 | /* Log rule additions and removals */ | ||
1336 | static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, | ||
1337 | struct audit_krule *rule, int res) | ||
1338 | { | ||
1339 | struct audit_buffer *ab; | ||
1340 | |||
1341 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | ||
1342 | if (!ab) | ||
1343 | return; | ||
1344 | audit_log_format(ab, "auid=%u", loginuid); | ||
1345 | if (sid) { | ||
1346 | char *ctx = NULL; | ||
1347 | u32 len; | ||
1348 | if (selinux_ctxid_to_string(sid, &ctx, &len)) | ||
1349 | audit_log_format(ab, " ssid=%u", sid); | ||
1350 | else | ||
1351 | audit_log_format(ab, " subj=%s", ctx); | ||
1352 | kfree(ctx); | ||
1353 | } | ||
1354 | audit_log_format(ab, " %s rule key=", action); | ||
1355 | if (rule->filterkey) | ||
1356 | audit_log_untrustedstring(ab, rule->filterkey); | ||
1357 | else | ||
1358 | audit_log_format(ab, "(null)"); | ||
1359 | audit_log_format(ab, " list=%d res=%d", rule->listnr, res); | ||
1360 | audit_log_end(ab); | ||
578 | } | 1361 | } |
579 | 1362 | ||
580 | /** | 1363 | /** |
@@ -592,7 +1375,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
592 | size_t datasz, uid_t loginuid, u32 sid) | 1375 | size_t datasz, uid_t loginuid, u32 sid) |
593 | { | 1376 | { |
594 | struct task_struct *tsk; | 1377 | struct task_struct *tsk; |
595 | int *dest; | 1378 | struct audit_netlink_list *dest; |
596 | int err = 0; | 1379 | int err = 0; |
597 | struct audit_entry *entry; | 1380 | struct audit_entry *entry; |
598 | 1381 | ||
@@ -605,18 +1388,22 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
605 | * happen if we're actually running in the context of auditctl | 1388 | * happen if we're actually running in the context of auditctl |
606 | * trying to _send_ the stuff */ | 1389 | * trying to _send_ the stuff */ |
607 | 1390 | ||
608 | dest = kmalloc(2 * sizeof(int), GFP_KERNEL); | 1391 | dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); |
609 | if (!dest) | 1392 | if (!dest) |
610 | return -ENOMEM; | 1393 | return -ENOMEM; |
611 | dest[0] = pid; | 1394 | dest->pid = pid; |
612 | dest[1] = seq; | 1395 | skb_queue_head_init(&dest->q); |
613 | 1396 | ||
1397 | mutex_lock(&audit_filter_mutex); | ||
614 | if (type == AUDIT_LIST) | 1398 | if (type == AUDIT_LIST) |
615 | tsk = kthread_run(audit_list, dest, "audit_list"); | 1399 | audit_list(pid, seq, &dest->q); |
616 | else | 1400 | else |
617 | tsk = kthread_run(audit_list_rules, dest, | 1401 | audit_list_rules(pid, seq, &dest->q); |
618 | "audit_list_rules"); | 1402 | mutex_unlock(&audit_filter_mutex); |
1403 | |||
1404 | tsk = kthread_run(audit_send_list, dest, "audit_send_list"); | ||
619 | if (IS_ERR(tsk)) { | 1405 | if (IS_ERR(tsk)) { |
1406 | skb_queue_purge(&dest->q); | ||
620 | kfree(dest); | 1407 | kfree(dest); |
621 | err = PTR_ERR(tsk); | 1408 | err = PTR_ERR(tsk); |
622 | } | 1409 | } |
@@ -632,23 +1419,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
632 | 1419 | ||
633 | err = audit_add_rule(entry, | 1420 | err = audit_add_rule(entry, |
634 | &audit_filter_list[entry->rule.listnr]); | 1421 | &audit_filter_list[entry->rule.listnr]); |
635 | if (sid) { | 1422 | audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err); |
636 | char *ctx = NULL; | ||
637 | u32 len; | ||
638 | if (selinux_ctxid_to_string(sid, &ctx, &len)) { | ||
639 | /* Maybe call audit_panic? */ | ||
640 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
641 | "auid=%u ssid=%u add rule to list=%d res=%d", | ||
642 | loginuid, sid, entry->rule.listnr, !err); | ||
643 | } else | ||
644 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
645 | "auid=%u subj=%s add rule to list=%d res=%d", | ||
646 | loginuid, ctx, entry->rule.listnr, !err); | ||
647 | kfree(ctx); | ||
648 | } else | ||
649 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
650 | "auid=%u add rule to list=%d res=%d", | ||
651 | loginuid, entry->rule.listnr, !err); | ||
652 | 1423 | ||
653 | if (err) | 1424 | if (err) |
654 | audit_free_rule(entry); | 1425 | audit_free_rule(entry); |
@@ -664,24 +1435,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
664 | 1435 | ||
665 | err = audit_del_rule(entry, | 1436 | err = audit_del_rule(entry, |
666 | &audit_filter_list[entry->rule.listnr]); | 1437 | &audit_filter_list[entry->rule.listnr]); |
667 | 1438 | audit_log_rule_change(loginuid, sid, "remove", &entry->rule, | |
668 | if (sid) { | 1439 | !err); |
669 | char *ctx = NULL; | ||
670 | u32 len; | ||
671 | if (selinux_ctxid_to_string(sid, &ctx, &len)) { | ||
672 | /* Maybe call audit_panic? */ | ||
673 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
674 | "auid=%u ssid=%u remove rule from list=%d res=%d", | ||
675 | loginuid, sid, entry->rule.listnr, !err); | ||
676 | } else | ||
677 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
678 | "auid=%u subj=%s remove rule from list=%d res=%d", | ||
679 | loginuid, ctx, entry->rule.listnr, !err); | ||
680 | kfree(ctx); | ||
681 | } else | ||
682 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
683 | "auid=%u remove rule from list=%d res=%d", | ||
684 | loginuid, entry->rule.listnr, !err); | ||
685 | 1440 | ||
686 | audit_free_rule(entry); | 1441 | audit_free_rule(entry); |
687 | break; | 1442 | break; |
@@ -712,7 +1467,43 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) | |||
712 | return 0; | 1467 | return 0; |
713 | } | 1468 | } |
714 | 1469 | ||
1470 | /* Compare given dentry name with last component in given path, | ||
1471 | * return of 0 indicates a match. */ | ||
1472 | int audit_compare_dname_path(const char *dname, const char *path, | ||
1473 | int *dirlen) | ||
1474 | { | ||
1475 | int dlen, plen; | ||
1476 | const char *p; | ||
1477 | |||
1478 | if (!dname || !path) | ||
1479 | return 1; | ||
715 | 1480 | ||
1481 | dlen = strlen(dname); | ||
1482 | plen = strlen(path); | ||
1483 | if (plen < dlen) | ||
1484 | return 1; | ||
1485 | |||
1486 | /* disregard trailing slashes */ | ||
1487 | p = path + plen - 1; | ||
1488 | while ((*p == '/') && (p > path)) | ||
1489 | p--; | ||
1490 | |||
1491 | /* find last path component */ | ||
1492 | p = p - dlen + 1; | ||
1493 | if (p < path) | ||
1494 | return 1; | ||
1495 | else if (p > path) { | ||
1496 | if (*--p != '/') | ||
1497 | return 1; | ||
1498 | else | ||
1499 | p++; | ||
1500 | } | ||
1501 | |||
1502 | /* return length of path's directory component */ | ||
1503 | if (dirlen) | ||
1504 | *dirlen = p - path; | ||
1505 | return strncmp(p, dname, dlen); | ||
1506 | } | ||
716 | 1507 | ||
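
audit_compare_dname_path() above is plain string logic, so it can be exercised outside the kernel. The sketch below mirrors the same trailing-slash and last-component handling in ordinary C; the test values are arbitrary and the function name is just for illustration.

/* Userspace mirror of the last-component comparison; returns 0 on match. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

static int compare_dname_path(const char *dname, const char *path, int *dirlen)
{
	int dlen, plen;
	const char *p;

	if (!dname || !path)
		return 1;

	dlen = strlen(dname);
	plen = strlen(path);
	if (plen < dlen)
		return 1;

	/* disregard trailing slashes */
	p = path + plen - 1;
	while ((*p == '/') && (p > path))
		p--;

	/* find last path component */
	p = p - dlen + 1;
	if (p < path)
		return 1;
	else if (p > path) {
		if (*--p != '/')
			return 1;
		else
			p++;
	}

	/* report length of path's directory component */
	if (dirlen)
		*dirlen = p - path;
	return strncmp(p, dname, dlen);
}

int main(void)
{
	int dirlen;

	assert(compare_dname_path("passwd", "/etc/passwd", &dirlen) == 0);
	assert(dirlen == 5);				/* "/etc/" */
	assert(compare_dname_path("passwd", "/etc/passwd///", NULL) == 0);
	assert(compare_dname_path("shadow", "/etc/passwd", NULL) != 0);
	printf("all comparisons behaved as expected\n");
	return 0;
}
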
717 | static int audit_filter_user_rules(struct netlink_skb_parms *cb, | 1508 | static int audit_filter_user_rules(struct netlink_skb_parms *cb, |
718 | struct audit_krule *rule, | 1509 | struct audit_krule *rule, |
@@ -744,7 +1535,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, | |||
744 | } | 1535 | } |
745 | switch (rule->action) { | 1536 | switch (rule->action) { |
746 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | 1537 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; |
747 | case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; | ||
748 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | 1538 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; |
749 | } | 1539 | } |
750 | return 1; | 1540 | return 1; |
@@ -806,11 +1596,16 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) | |||
806 | for (i = 0; i < rule->field_count; i++) { | 1596 | for (i = 0; i < rule->field_count; i++) { |
807 | struct audit_field *f = &rule->fields[i]; | 1597 | struct audit_field *f = &rule->fields[i]; |
808 | switch (f->type) { | 1598 | switch (f->type) { |
809 | case AUDIT_SE_USER: | 1599 | case AUDIT_SUBJ_USER: |
810 | case AUDIT_SE_ROLE: | 1600 | case AUDIT_SUBJ_ROLE: |
811 | case AUDIT_SE_TYPE: | 1601 | case AUDIT_SUBJ_TYPE: |
812 | case AUDIT_SE_SEN: | 1602 | case AUDIT_SUBJ_SEN: |
813 | case AUDIT_SE_CLR: | 1603 | case AUDIT_SUBJ_CLR: |
1604 | case AUDIT_OBJ_USER: | ||
1605 | case AUDIT_OBJ_ROLE: | ||
1606 | case AUDIT_OBJ_TYPE: | ||
1607 | case AUDIT_OBJ_LEV_LOW: | ||
1608 | case AUDIT_OBJ_LEV_HIGH: | ||
814 | return 1; | 1609 | return 1; |
815 | } | 1610 | } |
816 | } | 1611 | } |
@@ -826,32 +1621,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) | |||
826 | int selinux_audit_rule_update(void) | 1621 | int selinux_audit_rule_update(void) |
827 | { | 1622 | { |
828 | struct audit_entry *entry, *n, *nentry; | 1623 | struct audit_entry *entry, *n, *nentry; |
1624 | struct audit_watch *watch; | ||
829 | int i, err = 0; | 1625 | int i, err = 0; |
830 | 1626 | ||
831 | /* audit_netlink_mutex synchronizes the writers */ | 1627 | /* audit_filter_mutex synchronizes the writers */ |
832 | mutex_lock(&audit_netlink_mutex); | 1628 | mutex_lock(&audit_filter_mutex); |
833 | 1629 | ||
834 | for (i = 0; i < AUDIT_NR_FILTERS; i++) { | 1630 | for (i = 0; i < AUDIT_NR_FILTERS; i++) { |
835 | list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { | 1631 | list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { |
836 | if (!audit_rule_has_selinux(&entry->rule)) | 1632 | if (!audit_rule_has_selinux(&entry->rule)) |
837 | continue; | 1633 | continue; |
838 | 1634 | ||
839 | nentry = audit_dupe_rule(&entry->rule); | 1635 | watch = entry->rule.watch; |
1636 | nentry = audit_dupe_rule(&entry->rule, watch); | ||
840 | if (unlikely(IS_ERR(nentry))) { | 1637 | if (unlikely(IS_ERR(nentry))) { |
841 | /* save the first error encountered for the | 1638 | /* save the first error encountered for the |
842 | * return value */ | 1639 | * return value */ |
843 | if (!err) | 1640 | if (!err) |
844 | err = PTR_ERR(nentry); | 1641 | err = PTR_ERR(nentry); |
845 | audit_panic("error updating selinux filters"); | 1642 | audit_panic("error updating selinux filters"); |
1643 | if (watch) | ||
1644 | list_del(&entry->rule.rlist); | ||
846 | list_del_rcu(&entry->list); | 1645 | list_del_rcu(&entry->list); |
847 | } else { | 1646 | } else { |
1647 | if (watch) { | ||
1648 | list_add(&nentry->rule.rlist, | ||
1649 | &watch->rules); | ||
1650 | list_del(&entry->rule.rlist); | ||
1651 | } | ||
848 | list_replace_rcu(&entry->list, &nentry->list); | 1652 | list_replace_rcu(&entry->list, &nentry->list); |
849 | } | 1653 | } |
850 | call_rcu(&entry->rcu, audit_free_rule_rcu); | 1654 | call_rcu(&entry->rcu, audit_free_rule_rcu); |
851 | } | 1655 | } |
852 | } | 1656 | } |
853 | 1657 | ||
854 | mutex_unlock(&audit_netlink_mutex); | 1658 | mutex_unlock(&audit_filter_mutex); |
855 | 1659 | ||
856 | return err; | 1660 | return err; |
857 | } | 1661 | } |
1662 | |||
1663 | /* Update watch data in audit rules based on inotify events. */ | ||
1664 | void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, | ||
1665 | u32 cookie, const char *dname, struct inode *inode) | ||
1666 | { | ||
1667 | struct audit_parent *parent; | ||
1668 | |||
1669 | parent = container_of(i_watch, struct audit_parent, wdata); | ||
1670 | |||
1671 | if (mask & (IN_CREATE|IN_MOVED_TO) && inode) | ||
1672 | audit_update_watch(parent, dname, inode->i_sb->s_dev, | ||
1673 | inode->i_ino, 0); | ||
1674 | else if (mask & (IN_DELETE|IN_MOVED_FROM)) | ||
1675 | audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); | ||
1676 | /* inotify automatically removes the watch and sends IN_IGNORED */ | ||
1677 | else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) | ||
1678 | audit_remove_parent_watches(parent); | ||
1679 | /* inotify does not remove the watch, so remove it manually */ | ||
1680 | else if(mask & IN_MOVE_SELF) { | ||
1681 | audit_remove_parent_watches(parent); | ||
1682 | inotify_remove_watch_locked(audit_ih, i_watch); | ||
1683 | } else if (mask & IN_IGNORED) | ||
1684 | put_inotify_watch(i_watch); | ||
1685 | } | ||
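
audit_handle_ievent() above dispatches purely on the inotify event mask. A rough userspace analogue using the regular inotify(7) API is sketched below; it only prints the action the kernel-side handler would take for each event class, and the watched directory is just an example, not anything the patch prescribes.

/* Userspace analogue of the mask dispatch; watches one directory briefly. */
#include <stdio.h>
#include <sys/inotify.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *dir = argc > 1 ? argv[1] : "/tmp";
	char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));
	ssize_t len;
	int fd;

	fd = inotify_init1(0);
	if (fd < 0 || inotify_add_watch(fd, dir, IN_ALL_EVENTS) < 0) {
		perror("inotify");
		return 1;
	}

	len = read(fd, buf, sizeof(buf));	/* blocks until something happens */
	for (char *p = buf; len > 0 && p < buf + len; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		if (ev->mask & (IN_CREATE | IN_MOVED_TO))
			printf("refresh dev/ino for %s\n", ev->len ? ev->name : dir);
		else if (ev->mask & (IN_DELETE | IN_MOVED_FROM))
			printf("invalidate dev/ino for %s\n", ev->len ? ev->name : dir);
		else if (ev->mask & (IN_DELETE_SELF | IN_UNMOUNT | IN_MOVE_SELF))
			printf("drop every watch under %s\n", dir);

		p += sizeof(*ev) + ev->len;
	}
	close(fd);
	return 0;
}
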
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c03a4ed1b27..ae40ac8c39e7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | 4 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. |
5 | * Copyright 2005 Hewlett-Packard Development Company, L.P. | 5 | * Copyright 2005 Hewlett-Packard Development Company, L.P. |
6 | * Copyright (C) 2005 IBM Corporation | 6 | * Copyright (C) 2005, 2006 IBM Corporation |
7 | * All Rights Reserved. | 7 | * All Rights Reserved. |
8 | * | 8 | * |
9 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
@@ -29,6 +29,9 @@ | |||
29 | * this file -- see entry.S) is based on a GPL'd patch written by | 29 | * this file -- see entry.S) is based on a GPL'd patch written by |
30 | * okir@suse.de and Copyright 2003 SuSE Linux AG. | 30 | * okir@suse.de and Copyright 2003 SuSE Linux AG. |
31 | * | 31 | * |
32 | * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>, | ||
33 | * 2006. | ||
34 | * | ||
32 | * The support of additional filter rules compares (>, <, >=, <=) was | 35 | * The support of additional filter rules compares (>, <, >=, <=) was |
33 | * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. | 36 | * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. |
34 | * | 37 | * |
@@ -49,6 +52,7 @@ | |||
49 | #include <linux/module.h> | 52 | #include <linux/module.h> |
50 | #include <linux/mount.h> | 53 | #include <linux/mount.h> |
51 | #include <linux/socket.h> | 54 | #include <linux/socket.h> |
55 | #include <linux/mqueue.h> | ||
52 | #include <linux/audit.h> | 56 | #include <linux/audit.h> |
53 | #include <linux/personality.h> | 57 | #include <linux/personality.h> |
54 | #include <linux/time.h> | 58 | #include <linux/time.h> |
@@ -59,6 +63,8 @@ | |||
59 | #include <linux/list.h> | 63 | #include <linux/list.h> |
60 | #include <linux/tty.h> | 64 | #include <linux/tty.h> |
61 | #include <linux/selinux.h> | 65 | #include <linux/selinux.h> |
66 | #include <linux/binfmts.h> | ||
67 | #include <linux/syscalls.h> | ||
62 | 68 | ||
63 | #include "audit.h" | 69 | #include "audit.h" |
64 | 70 | ||
@@ -76,6 +82,9 @@ extern int audit_enabled; | |||
76 | * path_lookup. */ | 82 | * path_lookup. */ |
77 | #define AUDIT_NAMES_RESERVED 7 | 83 | #define AUDIT_NAMES_RESERVED 7 |
78 | 84 | ||
85 | /* Indicates that audit should log the full pathname. */ | ||
86 | #define AUDIT_NAME_FULL -1 | ||
87 | |||
79 | /* When fs/namei.c:getname() is called, we store the pointer in name and | 88 | /* When fs/namei.c:getname() is called, we store the pointer in name and |
80 | * we don't let putname() free it (instead we free all of the saved | 89 | * we don't let putname() free it (instead we free all of the saved |
81 | * pointers at syscall exit time). | 90 | * pointers at syscall exit time). |
@@ -83,8 +92,9 @@ extern int audit_enabled; | |||
83 | * Further, in fs/namei.c:path_lookup() we store the inode and device. */ | 92 | * Further, in fs/namei.c:path_lookup() we store the inode and device. */ |
84 | struct audit_names { | 93 | struct audit_names { |
85 | const char *name; | 94 | const char *name; |
95 | int name_len; /* number of name's characters to log */ | ||
96 | unsigned name_put; /* call __putname() for this name */ | ||
86 | unsigned long ino; | 97 | unsigned long ino; |
87 | unsigned long pino; | ||
88 | dev_t dev; | 98 | dev_t dev; |
89 | umode_t mode; | 99 | umode_t mode; |
90 | uid_t uid; | 100 | uid_t uid; |
@@ -100,6 +110,33 @@ struct audit_aux_data { | |||
100 | 110 | ||
101 | #define AUDIT_AUX_IPCPERM 0 | 111 | #define AUDIT_AUX_IPCPERM 0 |
102 | 112 | ||
113 | struct audit_aux_data_mq_open { | ||
114 | struct audit_aux_data d; | ||
115 | int oflag; | ||
116 | mode_t mode; | ||
117 | struct mq_attr attr; | ||
118 | }; | ||
119 | |||
120 | struct audit_aux_data_mq_sendrecv { | ||
121 | struct audit_aux_data d; | ||
122 | mqd_t mqdes; | ||
123 | size_t msg_len; | ||
124 | unsigned int msg_prio; | ||
125 | struct timespec abs_timeout; | ||
126 | }; | ||
127 | |||
128 | struct audit_aux_data_mq_notify { | ||
129 | struct audit_aux_data d; | ||
130 | mqd_t mqdes; | ||
131 | struct sigevent notification; | ||
132 | }; | ||
133 | |||
134 | struct audit_aux_data_mq_getsetattr { | ||
135 | struct audit_aux_data d; | ||
136 | mqd_t mqdes; | ||
137 | struct mq_attr mqstat; | ||
138 | }; | ||
139 | |||
103 | struct audit_aux_data_ipcctl { | 140 | struct audit_aux_data_ipcctl { |
104 | struct audit_aux_data d; | 141 | struct audit_aux_data d; |
105 | struct ipc_perm p; | 142 | struct ipc_perm p; |
@@ -110,6 +147,13 @@ struct audit_aux_data_ipcctl { | |||
110 | u32 osid; | 147 | u32 osid; |
111 | }; | 148 | }; |
112 | 149 | ||
150 | struct audit_aux_data_execve { | ||
151 | struct audit_aux_data d; | ||
152 | int argc; | ||
153 | int envc; | ||
154 | char mem[0]; | ||
155 | }; | ||
156 | |||
113 | struct audit_aux_data_socketcall { | 157 | struct audit_aux_data_socketcall { |
114 | struct audit_aux_data d; | 158 | struct audit_aux_data d; |
115 | int nargs; | 159 | int nargs; |
@@ -142,13 +186,14 @@ struct audit_context { | |||
142 | int auditable; /* 1 if record should be written */ | 186 | int auditable; /* 1 if record should be written */ |
143 | int name_count; | 187 | int name_count; |
144 | struct audit_names names[AUDIT_NAMES]; | 188 | struct audit_names names[AUDIT_NAMES]; |
189 | char * filterkey; /* key for rule that triggered record */ | ||
145 | struct dentry * pwd; | 190 | struct dentry * pwd; |
146 | struct vfsmount * pwdmnt; | 191 | struct vfsmount * pwdmnt; |
147 | struct audit_context *previous; /* For nested syscalls */ | 192 | struct audit_context *previous; /* For nested syscalls */ |
148 | struct audit_aux_data *aux; | 193 | struct audit_aux_data *aux; |
149 | 194 | ||
150 | /* Save things to print about task_struct */ | 195 | /* Save things to print about task_struct */ |
151 | pid_t pid; | 196 | pid_t pid, ppid; |
152 | uid_t uid, euid, suid, fsuid; | 197 | uid_t uid, euid, suid, fsuid; |
153 | gid_t gid, egid, sgid, fsgid; | 198 | gid_t gid, egid, sgid, fsgid; |
154 | unsigned long personality; | 199 | unsigned long personality; |
@@ -160,12 +205,13 @@ struct audit_context { | |||
160 | #endif | 205 | #endif |
161 | }; | 206 | }; |
162 | 207 | ||
163 | 208 | /* Determine if any context name data matches a rule's watch data */ | |
164 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 | 209 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 |
165 | * otherwise. */ | 210 | * otherwise. */ |
166 | static int audit_filter_rules(struct task_struct *tsk, | 211 | static int audit_filter_rules(struct task_struct *tsk, |
167 | struct audit_krule *rule, | 212 | struct audit_krule *rule, |
168 | struct audit_context *ctx, | 213 | struct audit_context *ctx, |
214 | struct audit_names *name, | ||
169 | enum audit_state *state) | 215 | enum audit_state *state) |
170 | { | 216 | { |
171 | int i, j, need_sid = 1; | 217 | int i, j, need_sid = 1; |
@@ -179,6 +225,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
179 | case AUDIT_PID: | 225 | case AUDIT_PID: |
180 | result = audit_comparator(tsk->pid, f->op, f->val); | 226 | result = audit_comparator(tsk->pid, f->op, f->val); |
181 | break; | 227 | break; |
228 | case AUDIT_PPID: | ||
229 | if (ctx) | ||
230 | result = audit_comparator(ctx->ppid, f->op, f->val); | ||
231 | break; | ||
182 | case AUDIT_UID: | 232 | case AUDIT_UID: |
183 | result = audit_comparator(tsk->uid, f->op, f->val); | 233 | result = audit_comparator(tsk->uid, f->op, f->val); |
184 | break; | 234 | break; |
@@ -224,7 +274,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
224 | } | 274 | } |
225 | break; | 275 | break; |
226 | case AUDIT_DEVMAJOR: | 276 | case AUDIT_DEVMAJOR: |
227 | if (ctx) { | 277 | if (name) |
278 | result = audit_comparator(MAJOR(name->dev), | ||
279 | f->op, f->val); | ||
280 | else if (ctx) { | ||
228 | for (j = 0; j < ctx->name_count; j++) { | 281 | for (j = 0; j < ctx->name_count; j++) { |
229 | if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { | 282 | if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { |
230 | ++result; | 283 | ++result; |
@@ -234,7 +287,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
234 | } | 287 | } |
235 | break; | 288 | break; |
236 | case AUDIT_DEVMINOR: | 289 | case AUDIT_DEVMINOR: |
237 | if (ctx) { | 290 | if (name) |
291 | result = audit_comparator(MINOR(name->dev), | ||
292 | f->op, f->val); | ||
293 | else if (ctx) { | ||
238 | for (j = 0; j < ctx->name_count; j++) { | 294 | for (j = 0; j < ctx->name_count; j++) { |
239 | if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { | 295 | if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { |
240 | ++result; | 296 | ++result; |
@@ -244,26 +300,32 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
244 | } | 300 | } |
245 | break; | 301 | break; |
246 | case AUDIT_INODE: | 302 | case AUDIT_INODE: |
247 | if (ctx) { | 303 | if (name) |
304 | result = (name->ino == f->val); | ||
305 | else if (ctx) { | ||
248 | for (j = 0; j < ctx->name_count; j++) { | 306 | for (j = 0; j < ctx->name_count; j++) { |
249 | if (audit_comparator(ctx->names[j].ino, f->op, f->val) || | 307 | if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { |
250 | audit_comparator(ctx->names[j].pino, f->op, f->val)) { | ||
251 | ++result; | 308 | ++result; |
252 | break; | 309 | break; |
253 | } | 310 | } |
254 | } | 311 | } |
255 | } | 312 | } |
256 | break; | 313 | break; |
314 | case AUDIT_WATCH: | ||
315 | if (name && rule->watch->ino != (unsigned long)-1) | ||
316 | result = (name->dev == rule->watch->dev && | ||
317 | name->ino == rule->watch->ino); | ||
318 | break; | ||
257 | case AUDIT_LOGINUID: | 319 | case AUDIT_LOGINUID: |
258 | result = 0; | 320 | result = 0; |
259 | if (ctx) | 321 | if (ctx) |
260 | result = audit_comparator(ctx->loginuid, f->op, f->val); | 322 | result = audit_comparator(ctx->loginuid, f->op, f->val); |
261 | break; | 323 | break; |
262 | case AUDIT_SE_USER: | 324 | case AUDIT_SUBJ_USER: |
263 | case AUDIT_SE_ROLE: | 325 | case AUDIT_SUBJ_ROLE: |
264 | case AUDIT_SE_TYPE: | 326 | case AUDIT_SUBJ_TYPE: |
265 | case AUDIT_SE_SEN: | 327 | case AUDIT_SUBJ_SEN: |
266 | case AUDIT_SE_CLR: | 328 | case AUDIT_SUBJ_CLR: |
267 | /* NOTE: this may return negative values indicating | 329 | /* NOTE: this may return negative values indicating |
268 | a temporary error. We simply treat this as a | 330 | a temporary error. We simply treat this as a |
269 | match for now to avoid losing information that | 331 | match for now to avoid losing information that |
@@ -280,6 +342,46 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
280 | ctx); | 342 | ctx); |
281 | } | 343 | } |
282 | break; | 344 | break; |
345 | case AUDIT_OBJ_USER: | ||
346 | case AUDIT_OBJ_ROLE: | ||
347 | case AUDIT_OBJ_TYPE: | ||
348 | case AUDIT_OBJ_LEV_LOW: | ||
349 | case AUDIT_OBJ_LEV_HIGH: | ||
350 | /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR | ||
351 | also applies here */ | ||
352 | if (f->se_rule) { | ||
353 | /* Find files that match */ | ||
354 | if (name) { | ||
355 | result = selinux_audit_rule_match( | ||
356 | name->osid, f->type, f->op, | ||
357 | f->se_rule, ctx); | ||
358 | } else if (ctx) { | ||
359 | for (j = 0; j < ctx->name_count; j++) { | ||
360 | if (selinux_audit_rule_match( | ||
361 | ctx->names[j].osid, | ||
362 | f->type, f->op, | ||
363 | f->se_rule, ctx)) { | ||
364 | ++result; | ||
365 | break; | ||
366 | } | ||
367 | } | ||
368 | } | ||
369 | /* Find ipc objects that match */ | ||
370 | if (ctx) { | ||
371 | struct audit_aux_data *aux; | ||
372 | for (aux = ctx->aux; aux; | ||
373 | aux = aux->next) { | ||
374 | if (aux->type == AUDIT_IPC) { | ||
375 | struct audit_aux_data_ipcctl *axi = (void *)aux; | ||
376 | if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { | ||
377 | ++result; | ||
378 | break; | ||
379 | } | ||
380 | } | ||
381 | } | ||
382 | } | ||
383 | } | ||
384 | break; | ||
283 | case AUDIT_ARG0: | 385 | case AUDIT_ARG0: |
284 | case AUDIT_ARG1: | 386 | case AUDIT_ARG1: |
285 | case AUDIT_ARG2: | 387 | case AUDIT_ARG2: |
@@ -287,14 +389,19 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
287 | if (ctx) | 389 | if (ctx) |
288 | result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); | 390 | result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); |
289 | break; | 391 | break; |
392 | case AUDIT_FILTERKEY: | ||
393 | /* ignore this field for filtering */ | ||
394 | result = 1; | ||
395 | break; | ||
290 | } | 396 | } |
291 | 397 | ||
292 | if (!result) | 398 | if (!result) |
293 | return 0; | 399 | return 0; |
294 | } | 400 | } |
401 | if (rule->filterkey) | ||
402 | ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); | ||
295 | switch (rule->action) { | 403 | switch (rule->action) { |
296 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | 404 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; |
297 | case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; | ||
298 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | 405 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; |
299 | } | 406 | } |
300 | return 1; | 407 | return 1; |
@@ -311,7 +418,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) | |||
311 | 418 | ||
312 | rcu_read_lock(); | 419 | rcu_read_lock(); |
313 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { | 420 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { |
314 | if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { | 421 | if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { |
315 | rcu_read_unlock(); | 422 | rcu_read_unlock(); |
316 | return state; | 423 | return state; |
317 | } | 424 | } |
@@ -341,8 +448,47 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
341 | int bit = AUDIT_BIT(ctx->major); | 448 | int bit = AUDIT_BIT(ctx->major); |
342 | 449 | ||
343 | list_for_each_entry_rcu(e, list, list) { | 450 | list_for_each_entry_rcu(e, list, list) { |
344 | if ((e->rule.mask[word] & bit) == bit | 451 | if ((e->rule.mask[word] & bit) == bit && |
345 | && audit_filter_rules(tsk, &e->rule, ctx, &state)) { | 452 | audit_filter_rules(tsk, &e->rule, ctx, NULL, |
453 | &state)) { | ||
454 | rcu_read_unlock(); | ||
455 | return state; | ||
456 | } | ||
457 | } | ||
458 | } | ||
459 | rcu_read_unlock(); | ||
460 | return AUDIT_BUILD_CONTEXT; | ||
461 | } | ||
462 | |||
463 | /* At syscall exit time, this filter is called if any audit_names[] have been | ||
464 | * collected during syscall processing. We only check rules in sublists at hash | ||
465 | * buckets applicable to the inode numbers in audit_names[]. | ||
466 | * Regarding audit_state, same rules apply as for audit_filter_syscall(). | ||
467 | */ | ||
468 | enum audit_state audit_filter_inodes(struct task_struct *tsk, | ||
469 | struct audit_context *ctx) | ||
470 | { | ||
471 | int i; | ||
472 | struct audit_entry *e; | ||
473 | enum audit_state state; | ||
474 | |||
475 | if (audit_pid && tsk->tgid == audit_pid) | ||
476 | return AUDIT_DISABLED; | ||
477 | |||
478 | rcu_read_lock(); | ||
479 | for (i = 0; i < ctx->name_count; i++) { | ||
480 | int word = AUDIT_WORD(ctx->major); | ||
481 | int bit = AUDIT_BIT(ctx->major); | ||
482 | struct audit_names *n = &ctx->names[i]; | ||
483 | int h = audit_hash_ino((u32)n->ino); | ||
484 | struct list_head *list = &audit_inode_hash[h]; | ||
485 | |||
486 | if (list_empty(list)) | ||
487 | continue; | ||
488 | |||
489 | list_for_each_entry_rcu(e, list, list) { | ||
490 | if ((e->rule.mask[word] & bit) == bit && | ||
491 | audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { | ||
346 | rcu_read_unlock(); | 492 | rcu_read_unlock(); |
347 | return state; | 493 | return state; |
348 | } | 494 | } |
@@ -352,6 +498,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
352 | return AUDIT_BUILD_CONTEXT; | 498 | return AUDIT_BUILD_CONTEXT; |
353 | } | 499 | } |
354 | 500 | ||
501 | void audit_set_auditable(struct audit_context *ctx) | ||
502 | { | ||
503 | ctx->auditable = 1; | ||
504 | } | ||
505 | |||
355 | static inline struct audit_context *audit_get_context(struct task_struct *tsk, | 506 | static inline struct audit_context *audit_get_context(struct task_struct *tsk, |
356 | int return_valid, | 507 | int return_valid, |
357 | int return_code) | 508 | int return_code) |
@@ -365,12 +516,22 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, | |||
365 | 516 | ||
366 | if (context->in_syscall && !context->auditable) { | 517 | if (context->in_syscall && !context->auditable) { |
367 | enum audit_state state; | 518 | enum audit_state state; |
519 | |||
368 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); | 520 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); |
521 | if (state == AUDIT_RECORD_CONTEXT) { | ||
522 | context->auditable = 1; | ||
523 | goto get_context; | ||
524 | } | ||
525 | |||
526 | state = audit_filter_inodes(tsk, context); | ||
369 | if (state == AUDIT_RECORD_CONTEXT) | 527 | if (state == AUDIT_RECORD_CONTEXT) |
370 | context->auditable = 1; | 528 | context->auditable = 1; |
529 | |||
371 | } | 530 | } |
372 | 531 | ||
532 | get_context: | ||
373 | context->pid = tsk->pid; | 533 | context->pid = tsk->pid; |
534 | context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ | ||
374 | context->uid = tsk->uid; | 535 | context->uid = tsk->uid; |
375 | context->gid = tsk->gid; | 536 | context->gid = tsk->gid; |
376 | context->euid = tsk->euid; | 537 | context->euid = tsk->euid; |
@@ -413,7 +574,7 @@ static inline void audit_free_names(struct audit_context *context) | |||
413 | #endif | 574 | #endif |
414 | 575 | ||
415 | for (i = 0; i < context->name_count; i++) { | 576 | for (i = 0; i < context->name_count; i++) { |
416 | if (context->names[i].name) | 577 | if (context->names[i].name && context->names[i].name_put) |
417 | __putname(context->names[i].name); | 578 | __putname(context->names[i].name); |
418 | } | 579 | } |
419 | context->name_count = 0; | 580 | context->name_count = 0; |
@@ -513,6 +674,7 @@ static inline void audit_free_context(struct audit_context *context) | |||
513 | } | 674 | } |
514 | audit_free_names(context); | 675 | audit_free_names(context); |
515 | audit_free_aux(context); | 676 | audit_free_aux(context); |
677 | kfree(context->filterkey); | ||
516 | kfree(context); | 678 | kfree(context); |
517 | context = previous; | 679 | context = previous; |
518 | } while (context); | 680 | } while (context); |
@@ -544,8 +706,7 @@ static void audit_log_task_context(struct audit_buffer *ab) | |||
544 | return; | 706 | return; |
545 | 707 | ||
546 | error_path: | 708 | error_path: |
547 | if (ctx) | 709 | kfree(ctx); |
548 | kfree(ctx); | ||
549 | audit_panic("error in audit_log_task_context"); | 710 | audit_panic("error in audit_log_task_context"); |
550 | return; | 711 | return; |
551 | } | 712 | } |
@@ -606,7 +767,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
606 | tty = "(none)"; | 767 | tty = "(none)"; |
607 | audit_log_format(ab, | 768 | audit_log_format(ab, |
608 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | 769 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" |
609 | " pid=%d auid=%u uid=%u gid=%u" | 770 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" |
610 | " euid=%u suid=%u fsuid=%u" | 771 | " euid=%u suid=%u fsuid=%u" |
611 | " egid=%u sgid=%u fsgid=%u tty=%s", | 772 | " egid=%u sgid=%u fsgid=%u tty=%s", |
612 | context->argv[0], | 773 | context->argv[0], |
@@ -614,6 +775,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
614 | context->argv[2], | 775 | context->argv[2], |
615 | context->argv[3], | 776 | context->argv[3], |
616 | context->name_count, | 777 | context->name_count, |
778 | context->ppid, | ||
617 | context->pid, | 779 | context->pid, |
618 | context->loginuid, | 780 | context->loginuid, |
619 | context->uid, | 781 | context->uid, |
@@ -621,6 +783,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
621 | context->euid, context->suid, context->fsuid, | 783 | context->euid, context->suid, context->fsuid, |
622 | context->egid, context->sgid, context->fsgid, tty); | 784 | context->egid, context->sgid, context->fsgid, tty); |
623 | audit_log_task_info(ab, tsk); | 785 | audit_log_task_info(ab, tsk); |
786 | if (context->filterkey) { | ||
787 | audit_log_format(ab, " key="); | ||
788 | audit_log_untrustedstring(ab, context->filterkey); | ||
789 | } else | ||
790 | audit_log_format(ab, " key=(null)"); | ||
624 | audit_log_end(ab); | 791 | audit_log_end(ab); |
625 | 792 | ||
626 | for (aux = context->aux; aux; aux = aux->next) { | 793 | for (aux = context->aux; aux; aux = aux->next) { |
@@ -630,11 +797,48 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
630 | continue; /* audit_panic has been called */ | 797 | continue; /* audit_panic has been called */ |
631 | 798 | ||
632 | switch (aux->type) { | 799 | switch (aux->type) { |
800 | case AUDIT_MQ_OPEN: { | ||
801 | struct audit_aux_data_mq_open *axi = (void *)aux; | ||
802 | audit_log_format(ab, | ||
803 | "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " | ||
804 | "mq_msgsize=%ld mq_curmsgs=%ld", | ||
805 | axi->oflag, axi->mode, axi->attr.mq_flags, | ||
806 | axi->attr.mq_maxmsg, axi->attr.mq_msgsize, | ||
807 | axi->attr.mq_curmsgs); | ||
808 | break; } | ||
809 | |||
810 | case AUDIT_MQ_SENDRECV: { | ||
811 | struct audit_aux_data_mq_sendrecv *axi = (void *)aux; | ||
812 | audit_log_format(ab, | ||
813 | "mqdes=%d msg_len=%zd msg_prio=%u " | ||
814 | "abs_timeout_sec=%ld abs_timeout_nsec=%ld", | ||
815 | axi->mqdes, axi->msg_len, axi->msg_prio, | ||
816 | axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); | ||
817 | break; } | ||
818 | |||
819 | case AUDIT_MQ_NOTIFY: { | ||
820 | struct audit_aux_data_mq_notify *axi = (void *)aux; | ||
821 | audit_log_format(ab, | ||
822 | "mqdes=%d sigev_signo=%d", | ||
823 | axi->mqdes, | ||
824 | axi->notification.sigev_signo); | ||
825 | break; } | ||
826 | |||
827 | case AUDIT_MQ_GETSETATTR: { | ||
828 | struct audit_aux_data_mq_getsetattr *axi = (void *)aux; | ||
829 | audit_log_format(ab, | ||
830 | "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " | ||
831 | "mq_curmsgs=%ld ", | ||
832 | axi->mqdes, | ||
833 | axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, | ||
834 | axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); | ||
835 | break; } | ||
836 | |||
633 | case AUDIT_IPC: { | 837 | case AUDIT_IPC: { |
634 | struct audit_aux_data_ipcctl *axi = (void *)aux; | 838 | struct audit_aux_data_ipcctl *axi = (void *)aux; |
635 | audit_log_format(ab, | 839 | audit_log_format(ab, |
636 | " qbytes=%lx iuid=%u igid=%u mode=%x", | 840 | "ouid=%u ogid=%u mode=%x", |
637 | axi->qbytes, axi->uid, axi->gid, axi->mode); | 841 | axi->uid, axi->gid, axi->mode); |
638 | if (axi->osid != 0) { | 842 | if (axi->osid != 0) { |
639 | char *ctx = NULL; | 843 | char *ctx = NULL; |
640 | u32 len; | 844 | u32 len; |
@@ -652,19 +856,18 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
652 | case AUDIT_IPC_SET_PERM: { | 856 | case AUDIT_IPC_SET_PERM: { |
653 | struct audit_aux_data_ipcctl *axi = (void *)aux; | 857 | struct audit_aux_data_ipcctl *axi = (void *)aux; |
654 | audit_log_format(ab, | 858 | audit_log_format(ab, |
655 | " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", | 859 | "qbytes=%lx ouid=%u ogid=%u mode=%x", |
656 | axi->qbytes, axi->uid, axi->gid, axi->mode); | 860 | axi->qbytes, axi->uid, axi->gid, axi->mode); |
657 | if (axi->osid != 0) { | 861 | break; } |
658 | char *ctx = NULL; | 862 | |
659 | u32 len; | 863 | case AUDIT_EXECVE: { |
660 | if (selinux_ctxid_to_string( | 864 | struct audit_aux_data_execve *axi = (void *)aux; |
661 | axi->osid, &ctx, &len)) { | 865 | int i; |
662 | audit_log_format(ab, " osid=%u", | 866 | const char *p; |
663 | axi->osid); | 867 | for (i = 0, p = axi->mem; i < axi->argc; i++) { |
664 | call_panic = 1; | 868 | audit_log_format(ab, "a%d=", i); |
665 | } else | 869 | p = audit_log_untrustedstring(ab, p); |
666 | audit_log_format(ab, " obj=%s", ctx); | 870 | audit_log_format(ab, "\n"); |
667 | kfree(ctx); | ||
668 | } | 871 | } |
669 | break; } | 872 | break; } |
670 | 873 | ||
@@ -700,8 +903,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
700 | } | 903 | } |
701 | } | 904 | } |
702 | for (i = 0; i < context->name_count; i++) { | 905 | for (i = 0; i < context->name_count; i++) { |
703 | unsigned long ino = context->names[i].ino; | 906 | struct audit_names *n = &context->names[i]; |
704 | unsigned long pino = context->names[i].pino; | ||
705 | 907 | ||
706 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); | 908 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); |
707 | if (!ab) | 909 | if (!ab) |
@@ -709,33 +911,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
709 | 911 | ||
710 | audit_log_format(ab, "item=%d", i); | 912 | audit_log_format(ab, "item=%d", i); |
711 | 913 | ||
712 | audit_log_format(ab, " name="); | 914 | if (n->name) { |
713 | if (context->names[i].name) | 915 | switch(n->name_len) { |
714 | audit_log_untrustedstring(ab, context->names[i].name); | 916 | case AUDIT_NAME_FULL: |
715 | else | 917 | /* log the full path */ |
716 | audit_log_format(ab, "(null)"); | 918 | audit_log_format(ab, " name="); |
717 | 919 | audit_log_untrustedstring(ab, n->name); | |
718 | if (pino != (unsigned long)-1) | 920 | break; |
719 | audit_log_format(ab, " parent=%lu", pino); | 921 | case 0: |
720 | if (ino != (unsigned long)-1) | 922 | /* name was specified as a relative path and the |
721 | audit_log_format(ab, " inode=%lu", ino); | 923 | * directory component is the cwd */ |
722 | if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) | 924 | audit_log_d_path(ab, " name=", context->pwd, |
723 | audit_log_format(ab, " dev=%02x:%02x mode=%#o" | 925 | context->pwdmnt); |
724 | " ouid=%u ogid=%u rdev=%02x:%02x", | 926 | break; |
725 | MAJOR(context->names[i].dev), | 927 | default: |
726 | MINOR(context->names[i].dev), | 928 | /* log the name's directory component */ |
727 | context->names[i].mode, | 929 | audit_log_format(ab, " name="); |
728 | context->names[i].uid, | 930 | audit_log_n_untrustedstring(ab, n->name_len, |
729 | context->names[i].gid, | 931 | n->name); |
730 | MAJOR(context->names[i].rdev), | 932 | } |
731 | MINOR(context->names[i].rdev)); | 933 | } else |
732 | if (context->names[i].osid != 0) { | 934 | audit_log_format(ab, " name=(null)"); |
935 | |||
936 | if (n->ino != (unsigned long)-1) { | ||
937 | audit_log_format(ab, " inode=%lu" | ||
938 | " dev=%02x:%02x mode=%#o" | ||
939 | " ouid=%u ogid=%u rdev=%02x:%02x", | ||
940 | n->ino, | ||
941 | MAJOR(n->dev), | ||
942 | MINOR(n->dev), | ||
943 | n->mode, | ||
944 | n->uid, | ||
945 | n->gid, | ||
946 | MAJOR(n->rdev), | ||
947 | MINOR(n->rdev)); | ||
948 | } | ||
949 | if (n->osid != 0) { | ||
733 | char *ctx = NULL; | 950 | char *ctx = NULL; |
734 | u32 len; | 951 | u32 len; |
735 | if (selinux_ctxid_to_string( | 952 | if (selinux_ctxid_to_string( |
736 | context->names[i].osid, &ctx, &len)) { | 953 | n->osid, &ctx, &len)) { |
737 | audit_log_format(ab, " osid=%u", | 954 | audit_log_format(ab, " osid=%u", n->osid); |
738 | context->names[i].osid); | ||
739 | call_panic = 2; | 955 | call_panic = 2; |
740 | } else | 956 | } else |
741 | audit_log_format(ab, " obj=%s", ctx); | 957 | audit_log_format(ab, " obj=%s", ctx); |
@@ -897,6 +1113,8 @@ void audit_syscall_exit(int valid, long return_code) | |||
897 | } else { | 1113 | } else { |
898 | audit_free_names(context); | 1114 | audit_free_names(context); |
899 | audit_free_aux(context); | 1115 | audit_free_aux(context); |
1116 | kfree(context->filterkey); | ||
1117 | context->filterkey = NULL; | ||
900 | tsk->audit_context = context; | 1118 | tsk->audit_context = context; |
901 | } | 1119 | } |
902 | } | 1120 | } |
@@ -908,11 +1126,11 @@ void audit_syscall_exit(int valid, long return_code) | |||
908 | * Add a name to the list of audit names for this context. | 1126 | * Add a name to the list of audit names for this context. |
909 | * Called from fs/namei.c:getname(). | 1127 | * Called from fs/namei.c:getname(). |
910 | */ | 1128 | */ |
911 | void audit_getname(const char *name) | 1129 | void __audit_getname(const char *name) |
912 | { | 1130 | { |
913 | struct audit_context *context = current->audit_context; | 1131 | struct audit_context *context = current->audit_context; |
914 | 1132 | ||
915 | if (!context || IS_ERR(name) || !name) | 1133 | if (IS_ERR(name) || !name) |
916 | return; | 1134 | return; |
917 | 1135 | ||
918 | if (!context->in_syscall) { | 1136 | if (!context->in_syscall) { |
@@ -925,6 +1143,8 @@ void audit_getname(const char *name) | |||
925 | } | 1143 | } |
926 | BUG_ON(context->name_count >= AUDIT_NAMES); | 1144 | BUG_ON(context->name_count >= AUDIT_NAMES); |
927 | context->names[context->name_count].name = name; | 1145 | context->names[context->name_count].name = name; |
1146 | context->names[context->name_count].name_len = AUDIT_NAME_FULL; | ||
1147 | context->names[context->name_count].name_put = 1; | ||
928 | context->names[context->name_count].ino = (unsigned long)-1; | 1148 | context->names[context->name_count].ino = (unsigned long)-1; |
929 | ++context->name_count; | 1149 | ++context->name_count; |
930 | if (!context->pwd) { | 1150 | if (!context->pwd) { |
@@ -991,11 +1211,10 @@ static void audit_inode_context(int idx, const struct inode *inode) | |||
991 | * audit_inode - store the inode and device from a lookup | 1211 | * audit_inode - store the inode and device from a lookup |
992 | * @name: name being audited | 1212 | * @name: name being audited |
993 | * @inode: inode being audited | 1213 | * @inode: inode being audited |
994 | * @flags: lookup flags (as used in path_lookup()) | ||
995 | * | 1214 | * |
996 | * Called from fs/namei.c:path_lookup(). | 1215 | * Called from fs/namei.c:path_lookup(). |
997 | */ | 1216 | */ |
998 | void __audit_inode(const char *name, const struct inode *inode, unsigned flags) | 1217 | void __audit_inode(const char *name, const struct inode *inode) |
999 | { | 1218 | { |
1000 | int idx; | 1219 | int idx; |
1001 | struct audit_context *context = current->audit_context; | 1220 | struct audit_context *context = current->audit_context; |
@@ -1021,20 +1240,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags) | |||
1021 | ++context->ino_count; | 1240 | ++context->ino_count; |
1022 | #endif | 1241 | #endif |
1023 | } | 1242 | } |
1243 | context->names[idx].ino = inode->i_ino; | ||
1024 | context->names[idx].dev = inode->i_sb->s_dev; | 1244 | context->names[idx].dev = inode->i_sb->s_dev; |
1025 | context->names[idx].mode = inode->i_mode; | 1245 | context->names[idx].mode = inode->i_mode; |
1026 | context->names[idx].uid = inode->i_uid; | 1246 | context->names[idx].uid = inode->i_uid; |
1027 | context->names[idx].gid = inode->i_gid; | 1247 | context->names[idx].gid = inode->i_gid; |
1028 | context->names[idx].rdev = inode->i_rdev; | 1248 | context->names[idx].rdev = inode->i_rdev; |
1029 | audit_inode_context(idx, inode); | 1249 | audit_inode_context(idx, inode); |
1030 | if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && | ||
1031 | (strcmp(name, ".") != 0)) { | ||
1032 | context->names[idx].ino = (unsigned long)-1; | ||
1033 | context->names[idx].pino = inode->i_ino; | ||
1034 | } else { | ||
1035 | context->names[idx].ino = inode->i_ino; | ||
1036 | context->names[idx].pino = (unsigned long)-1; | ||
1037 | } | ||
1038 | } | 1250 | } |
1039 | 1251 | ||
1040 | /** | 1252 | /** |
@@ -1056,51 +1268,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode, | |||
1056 | { | 1268 | { |
1057 | int idx; | 1269 | int idx; |
1058 | struct audit_context *context = current->audit_context; | 1270 | struct audit_context *context = current->audit_context; |
1271 | const char *found_name = NULL; | ||
1272 | int dirlen = 0; | ||
1059 | 1273 | ||
1060 | if (!context->in_syscall) | 1274 | if (!context->in_syscall) |
1061 | return; | 1275 | return; |
1062 | 1276 | ||
1063 | /* determine matching parent */ | 1277 | /* determine matching parent */ |
1064 | if (dname) | 1278 | if (!dname) |
1065 | for (idx = 0; idx < context->name_count; idx++) | 1279 | goto update_context; |
1066 | if (context->names[idx].pino == pino) { | 1280 | for (idx = 0; idx < context->name_count; idx++) |
1067 | const char *n; | 1281 | if (context->names[idx].ino == pino) { |
1068 | const char *name = context->names[idx].name; | 1282 | const char *name = context->names[idx].name; |
1069 | int dlen = strlen(dname); | 1283 | |
1070 | int nlen = name ? strlen(name) : 0; | 1284 | if (!name) |
1071 | 1285 | continue; | |
1072 | if (nlen < dlen) | 1286 | |
1073 | continue; | 1287 | if (audit_compare_dname_path(dname, name, &dirlen) == 0) { |
1074 | 1288 | context->names[idx].name_len = dirlen; | |
1075 | /* disregard trailing slashes */ | 1289 | found_name = name; |
1076 | n = name + nlen - 1; | 1290 | break; |
1077 | while ((*n == '/') && (n > name)) | ||
1078 | n--; | ||
1079 | |||
1080 | /* find last path component */ | ||
1081 | n = n - dlen + 1; | ||
1082 | if (n < name) | ||
1083 | continue; | ||
1084 | else if (n > name) { | ||
1085 | if (*--n != '/') | ||
1086 | continue; | ||
1087 | else | ||
1088 | n++; | ||
1089 | } | ||
1090 | |||
1091 | if (strncmp(n, dname, dlen) == 0) | ||
1092 | goto update_context; | ||
1093 | } | 1291 | } |
1292 | } | ||
1094 | 1293 | ||
1095 | /* catch-all in case match not found */ | 1294 | update_context: |
1096 | idx = context->name_count++; | 1295 | idx = context->name_count++; |
1097 | context->names[idx].name = NULL; | ||
1098 | context->names[idx].pino = pino; | ||
1099 | #if AUDIT_DEBUG | 1296 | #if AUDIT_DEBUG |
1100 | context->ino_count++; | 1297 | context->ino_count++; |
1101 | #endif | 1298 | #endif |
1299 | /* Re-use the name belonging to the slot for a matching parent directory. | ||
1300 | * All names for this context are relinquished in audit_free_names() */ | ||
1301 | context->names[idx].name = found_name; | ||
1302 | context->names[idx].name_len = AUDIT_NAME_FULL; | ||
1303 | context->names[idx].name_put = 0; /* don't call __putname() */ | ||
1102 | 1304 | ||
1103 | update_context: | ||
1104 | if (inode) { | 1305 | if (inode) { |
1105 | context->names[idx].ino = inode->i_ino; | 1306 | context->names[idx].ino = inode->i_ino; |
1106 | context->names[idx].dev = inode->i_sb->s_dev; | 1307 | context->names[idx].dev = inode->i_sb->s_dev; |
@@ -1109,7 +1310,8 @@ update_context: | |||
1109 | context->names[idx].gid = inode->i_gid; | 1310 | context->names[idx].gid = inode->i_gid; |
1110 | context->names[idx].rdev = inode->i_rdev; | 1311 | context->names[idx].rdev = inode->i_rdev; |
1111 | audit_inode_context(idx, inode); | 1312 | audit_inode_context(idx, inode); |
1112 | } | 1313 | } else |
1314 | context->names[idx].ino = (unsigned long)-1; | ||
1113 | } | 1315 | } |
1114 | 1316 | ||
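The open-coded component matching that used to live in __audit_inode_child() is now delegated to audit_compare_dname_path(). A userspace approximation of what that helper appears to do — match dname against the last path component, disregard trailing slashes, and report the directory-prefix length through the third argument; the kernel helper's exact contract may differ:

#include <stdio.h>
#include <string.h>

/* Hypothetical approximation of audit_compare_dname_path(). */
static int compare_dname_path(const char *dname, const char *path, int *dirlen)
{
        size_t plen = strlen(path), dlen = strlen(dname);

        while (plen > 1 && path[plen - 1] == '/')   /* drop trailing '/' */
                plen--;
        if (plen < dlen)
                return 1;
        if (strncmp(path + plen - dlen, dname, dlen) != 0)
                return 1;
        if (plen > dlen && path[plen - dlen - 1] != '/')
                return 1;                           /* partial component */
        *dirlen = (int)(plen - dlen);
        return 0;
}

int main(void)
{
        int dirlen;

        if (compare_dname_path("passwd", "/etc/passwd", &dirlen) == 0)
                printf("match, directory prefix length %d\n", dirlen);
        return 0;
}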
1115 | /** | 1317 | /** |
@@ -1142,18 +1344,23 @@ void auditsc_get_stamp(struct audit_context *ctx, | |||
1142 | */ | 1344 | */ |
1143 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | 1345 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) |
1144 | { | 1346 | { |
1145 | if (task->audit_context) { | 1347 | struct audit_context *context = task->audit_context; |
1146 | struct audit_buffer *ab; | 1348 | |
1147 | 1349 | if (context) { | |
1148 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); | 1350 | /* Only log if audit is enabled */ |
1149 | if (ab) { | 1351 | if (context->in_syscall) { |
1150 | audit_log_format(ab, "login pid=%d uid=%u " | 1352 | struct audit_buffer *ab; |
1151 | "old auid=%u new auid=%u", | 1353 | |
1152 | task->pid, task->uid, | 1354 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); |
1153 | task->audit_context->loginuid, loginuid); | 1355 | if (ab) { |
1154 | audit_log_end(ab); | 1356 | audit_log_format(ab, "login pid=%d uid=%u " |
1357 | "old auid=%u new auid=%u", | ||
1358 | task->pid, task->uid, | ||
1359 | context->loginuid, loginuid); | ||
1360 | audit_log_end(ab); | ||
1361 | } | ||
1155 | } | 1362 | } |
1156 | task->audit_context->loginuid = loginuid; | 1363 | context->loginuid = loginuid; |
1157 | } | 1364 | } |
1158 | return 0; | 1365 | return 0; |
1159 | } | 1366 | } |
@@ -1170,16 +1377,193 @@ uid_t audit_get_loginuid(struct audit_context *ctx) | |||
1170 | } | 1377 | } |
1171 | 1378 | ||
1172 | /** | 1379 | /** |
1173 | * audit_ipc_obj - record audit data for ipc object | 1380 | * __audit_mq_open - record audit data for a POSIX MQ open |
1174 | * @ipcp: ipc permissions | 1381 | * @oflag: open flag |
1382 | * @mode: mode bits | ||
1383 | * @u_attr: queue attributes | ||
1175 | * | 1384 | * |
1176 | * Returns 0 for success or NULL context or < 0 on error. | 1385 | * Returns 0 for success or NULL context or < 0 on error. |
1177 | */ | 1386 | */ |
1178 | int audit_ipc_obj(struct kern_ipc_perm *ipcp) | 1387 | int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) |
1179 | { | 1388 | { |
1180 | struct audit_aux_data_ipcctl *ax; | 1389 | struct audit_aux_data_mq_open *ax; |
1390 | struct audit_context *context = current->audit_context; | ||
1391 | |||
1392 | if (!audit_enabled) | ||
1393 | return 0; | ||
1394 | |||
1395 | if (likely(!context)) | ||
1396 | return 0; | ||
1397 | |||
1398 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1399 | if (!ax) | ||
1400 | return -ENOMEM; | ||
1401 | |||
1402 | if (u_attr != NULL) { | ||
1403 | if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { | ||
1404 | kfree(ax); | ||
1405 | return -EFAULT; | ||
1406 | } | ||
1407 | } else | ||
1408 | memset(&ax->attr, 0, sizeof(ax->attr)); | ||
1409 | |||
1410 | ax->oflag = oflag; | ||
1411 | ax->mode = mode; | ||
1412 | |||
1413 | ax->d.type = AUDIT_MQ_OPEN; | ||
1414 | ax->d.next = context->aux; | ||
1415 | context->aux = (void *)ax; | ||
1416 | return 0; | ||
1417 | } | ||
1418 | |||
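All of the __audit_mq_*() helpers in this file share one shape: allocate an aux record, snapshot the syscall arguments into it (via copy_from_user() for user pointers), and push it onto the context's singly linked aux list for audit_log_exit() to consume later. A stripped-down userspace sketch of that chaining pattern, with invented field names standing in for the real aux structures:

#include <stdio.h>
#include <stdlib.h>

struct aux_data {                 /* stands in for audit_aux_data */
        int type;
        struct aux_data *next;
        int payload;              /* e.g. the recorded oflag/mqdes value */
};

struct context {                  /* stands in for audit_context */
        struct aux_data *aux;     /* head of the aux list */
};

static int record_aux(struct context *ctx, int type, int payload)
{
        struct aux_data *ax = malloc(sizeof(*ax));

        if (!ax)
                return -1;
        ax->type = type;
        ax->payload = payload;
        ax->next = ctx->aux;      /* push onto the list head */
        ctx->aux = ax;
        return 0;
}

int main(void)
{
        struct context ctx = { .aux = NULL };

        record_aux(&ctx, 1, 42);
        record_aux(&ctx, 2, 7);
        for (struct aux_data *ax = ctx.aux; ax; ) {
                struct aux_data *next = ax->next;
                printf("type=%d payload=%d\n", ax->type, ax->payload);
                free(ax);
                ax = next;
        }
        return 0;
}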
1419 | /** | ||
1420 | * __audit_mq_timedsend - record audit data for a POSIX MQ timed send | ||
1421 | * @mqdes: MQ descriptor | ||
1422 | * @msg_len: Message length | ||
1423 | * @msg_prio: Message priority | ||
1424 | * @u_abs_timeout: Message timeout in absolute time | ||
1425 | * | ||
1426 | * Returns 0 for success or NULL context or < 0 on error. | ||
1427 | */ | ||
1428 | int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, | ||
1429 | const struct timespec __user *u_abs_timeout) | ||
1430 | { | ||
1431 | struct audit_aux_data_mq_sendrecv *ax; | ||
1432 | struct audit_context *context = current->audit_context; | ||
1433 | |||
1434 | if (!audit_enabled) | ||
1435 | return 0; | ||
1436 | |||
1437 | if (likely(!context)) | ||
1438 | return 0; | ||
1439 | |||
1440 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1441 | if (!ax) | ||
1442 | return -ENOMEM; | ||
1443 | |||
1444 | if (u_abs_timeout != NULL) { | ||
1445 | if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { | ||
1446 | kfree(ax); | ||
1447 | return -EFAULT; | ||
1448 | } | ||
1449 | } else | ||
1450 | memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); | ||
1451 | |||
1452 | ax->mqdes = mqdes; | ||
1453 | ax->msg_len = msg_len; | ||
1454 | ax->msg_prio = msg_prio; | ||
1455 | |||
1456 | ax->d.type = AUDIT_MQ_SENDRECV; | ||
1457 | ax->d.next = context->aux; | ||
1458 | context->aux = (void *)ax; | ||
1459 | return 0; | ||
1460 | } | ||
1461 | |||
1462 | /** | ||
1463 | * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive | ||
1464 | * @mqdes: MQ descriptor | ||
1465 | * @msg_len: Message length | ||
1466 | * @u_msg_prio: Message priority | ||
1467 | * @u_abs_timeout: Message timeout in absolute time | ||
1468 | * | ||
1469 | * Returns 0 for success or NULL context or < 0 on error. | ||
1470 | */ | ||
1471 | int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, | ||
1472 | unsigned int __user *u_msg_prio, | ||
1473 | const struct timespec __user *u_abs_timeout) | ||
1474 | { | ||
1475 | struct audit_aux_data_mq_sendrecv *ax; | ||
1476 | struct audit_context *context = current->audit_context; | ||
1477 | |||
1478 | if (!audit_enabled) | ||
1479 | return 0; | ||
1480 | |||
1481 | if (likely(!context)) | ||
1482 | return 0; | ||
1483 | |||
1484 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1485 | if (!ax) | ||
1486 | return -ENOMEM; | ||
1487 | |||
1488 | if (u_msg_prio != NULL) { | ||
1489 | if (get_user(ax->msg_prio, u_msg_prio)) { | ||
1490 | kfree(ax); | ||
1491 | return -EFAULT; | ||
1492 | } | ||
1493 | } else | ||
1494 | ax->msg_prio = 0; | ||
1495 | |||
1496 | if (u_abs_timeout != NULL) { | ||
1497 | if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { | ||
1498 | kfree(ax); | ||
1499 | return -EFAULT; | ||
1500 | } | ||
1501 | } else | ||
1502 | memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); | ||
1503 | |||
1504 | ax->mqdes = mqdes; | ||
1505 | ax->msg_len = msg_len; | ||
1506 | |||
1507 | ax->d.type = AUDIT_MQ_SENDRECV; | ||
1508 | ax->d.next = context->aux; | ||
1509 | context->aux = (void *)ax; | ||
1510 | return 0; | ||
1511 | } | ||
1512 | |||
1513 | /** | ||
1514 | * __audit_mq_notify - record audit data for a POSIX MQ notify | ||
1515 | * @mqdes: MQ descriptor | ||
1516 | * @u_notification: Notification event | ||
1517 | * | ||
1518 | * Returns 0 for success or NULL context or < 0 on error. | ||
1519 | */ | ||
1520 | |||
1521 | int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) | ||
1522 | { | ||
1523 | struct audit_aux_data_mq_notify *ax; | ||
1524 | struct audit_context *context = current->audit_context; | ||
1525 | |||
1526 | if (!audit_enabled) | ||
1527 | return 0; | ||
1528 | |||
1529 | if (likely(!context)) | ||
1530 | return 0; | ||
1531 | |||
1532 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1533 | if (!ax) | ||
1534 | return -ENOMEM; | ||
1535 | |||
1536 | if (u_notification != NULL) { | ||
1537 | if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { | ||
1538 | kfree(ax); | ||
1539 | return -EFAULT; | ||
1540 | } | ||
1541 | } else | ||
1542 | memset(&ax->notification, 0, sizeof(ax->notification)); | ||
1543 | |||
1544 | ax->mqdes = mqdes; | ||
1545 | |||
1546 | ax->d.type = AUDIT_MQ_NOTIFY; | ||
1547 | ax->d.next = context->aux; | ||
1548 | context->aux = (void *)ax; | ||
1549 | return 0; | ||
1550 | } | ||
1551 | |||
1552 | /** | ||
1553 | * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute | ||
1554 | * @mqdes: MQ descriptor | ||
1555 | * @mqstat: MQ flags | ||
1556 | * | ||
1557 | * Returns 0 for success or NULL context or < 0 on error. | ||
1558 | */ | ||
1559 | int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) | ||
1560 | { | ||
1561 | struct audit_aux_data_mq_getsetattr *ax; | ||
1181 | struct audit_context *context = current->audit_context; | 1562 | struct audit_context *context = current->audit_context; |
1182 | 1563 | ||
1564 | if (!audit_enabled) | ||
1565 | return 0; | ||
1566 | |||
1183 | if (likely(!context)) | 1567 | if (likely(!context)) |
1184 | return 0; | 1568 | return 0; |
1185 | 1569 | ||
@@ -1187,6 +1571,30 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
1187 | if (!ax) | 1571 | if (!ax) |
1188 | return -ENOMEM; | 1572 | return -ENOMEM; |
1189 | 1573 | ||
1574 | ax->mqdes = mqdes; | ||
1575 | ax->mqstat = *mqstat; | ||
1576 | |||
1577 | ax->d.type = AUDIT_MQ_GETSETATTR; | ||
1578 | ax->d.next = context->aux; | ||
1579 | context->aux = (void *)ax; | ||
1580 | return 0; | ||
1581 | } | ||
1582 | |||
1583 | /** | ||
1584 | * audit_ipc_obj - record audit data for ipc object | ||
1585 | * @ipcp: ipc permissions | ||
1586 | * | ||
1587 | * Returns 0 for success or NULL context or < 0 on error. | ||
1588 | */ | ||
1589 | int __audit_ipc_obj(struct kern_ipc_perm *ipcp) | ||
1590 | { | ||
1591 | struct audit_aux_data_ipcctl *ax; | ||
1592 | struct audit_context *context = current->audit_context; | ||
1593 | |||
1594 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1595 | if (!ax) | ||
1596 | return -ENOMEM; | ||
1597 | |||
1190 | ax->uid = ipcp->uid; | 1598 | ax->uid = ipcp->uid; |
1191 | ax->gid = ipcp->gid; | 1599 | ax->gid = ipcp->gid; |
1192 | ax->mode = ipcp->mode; | 1600 | ax->mode = ipcp->mode; |
@@ -1207,14 +1615,11 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
1207 | * | 1615 | * |
1208 | * Returns 0 for success or NULL context or < 0 on error. | 1616 | * Returns 0 for success or NULL context or < 0 on error. |
1209 | */ | 1617 | */ |
1210 | int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) | 1618 | int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) |
1211 | { | 1619 | { |
1212 | struct audit_aux_data_ipcctl *ax; | 1620 | struct audit_aux_data_ipcctl *ax; |
1213 | struct audit_context *context = current->audit_context; | 1621 | struct audit_context *context = current->audit_context; |
1214 | 1622 | ||
1215 | if (likely(!context)) | ||
1216 | return 0; | ||
1217 | |||
1218 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | 1623 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); |
1219 | if (!ax) | 1624 | if (!ax) |
1220 | return -ENOMEM; | 1625 | return -ENOMEM; |
@@ -1223,7 +1628,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, | |||
1223 | ax->uid = uid; | 1628 | ax->uid = uid; |
1224 | ax->gid = gid; | 1629 | ax->gid = gid; |
1225 | ax->mode = mode; | 1630 | ax->mode = mode; |
1226 | selinux_get_ipc_sid(ipcp, &ax->osid); | ||
1227 | 1631 | ||
1228 | ax->d.type = AUDIT_IPC_SET_PERM; | 1632 | ax->d.type = AUDIT_IPC_SET_PERM; |
1229 | ax->d.next = context->aux; | 1633 | ax->d.next = context->aux; |
@@ -1231,6 +1635,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, | |||
1231 | return 0; | 1635 | return 0; |
1232 | } | 1636 | } |
1233 | 1637 | ||
1638 | int audit_bprm(struct linux_binprm *bprm) | ||
1639 | { | ||
1640 | struct audit_aux_data_execve *ax; | ||
1641 | struct audit_context *context = current->audit_context; | ||
1642 | unsigned long p, next; | ||
1643 | void *to; | ||
1644 | |||
1645 | if (likely(!audit_enabled || !context)) | ||
1646 | return 0; | ||
1647 | |||
1648 | ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, | ||
1649 | GFP_KERNEL); | ||
1650 | if (!ax) | ||
1651 | return -ENOMEM; | ||
1652 | |||
1653 | ax->argc = bprm->argc; | ||
1654 | ax->envc = bprm->envc; | ||
1655 | for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { | ||
1656 | struct page *page = bprm->page[p / PAGE_SIZE]; | ||
1657 | void *kaddr = kmap(page); | ||
1658 | next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); | ||
1659 | memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); | ||
1660 | to += next - p; | ||
1661 | kunmap(page); | ||
1662 | } | ||
1663 | |||
1664 | ax->d.type = AUDIT_EXECVE; | ||
1665 | ax->d.next = context->aux; | ||
1666 | context->aux = (void *)ax; | ||
1667 | return 0; | ||
1668 | } | ||
1669 | |||
1670 | |||
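audit_bprm() copies the argv/envp area out of the bprm pages into one contiguous buffer attached to the aux list. The page-walk arithmetic is easier to follow in a flat userspace sketch, where a plain byte array stands in for bprm->page[] and the starting offset is made up for the example:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define NPAGES    2UL

int main(void)
{
        static char pages[NPAGES * PAGE_SIZE];      /* stands in for bprm->page[] */
        char dest[NPAGES * PAGE_SIZE];
        unsigned long p = PAGE_SIZE + 100;          /* assumed bprm->p */
        unsigned long next;
        char *to = dest;

        strcpy(&pages[p], "arg0");                  /* fake argument data */

        /* Same walk as audit_bprm(): copy from p up to the end of the
         * argument area, one page-aligned chunk at a time. */
        for (; p < NPAGES * PAGE_SIZE; p = next) {
                next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
                memcpy(to, &pages[p], next - p);
                to += next - p;
        }
        printf("copied %ld bytes, first string: %s\n",
               (long)(to - dest), dest);
        return 0;
}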
1234 | /** | 1671 | /** |
1235 | * audit_socketcall - record audit data for sys_socketcall | 1672 | * audit_socketcall - record audit data for sys_socketcall |
1236 | * @nargs: number of args | 1673 | * @nargs: number of args |
@@ -1325,19 +1762,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) | |||
1325 | * If the audit subsystem is being terminated, record the task (pid) | 1762 | * If the audit subsystem is being terminated, record the task (pid) |
1326 | * and uid that is doing that. | 1763 | * and uid that is doing that. |
1327 | */ | 1764 | */ |
1328 | void audit_signal_info(int sig, struct task_struct *t) | 1765 | void __audit_signal_info(int sig, struct task_struct *t) |
1329 | { | 1766 | { |
1330 | extern pid_t audit_sig_pid; | 1767 | extern pid_t audit_sig_pid; |
1331 | extern uid_t audit_sig_uid; | 1768 | extern uid_t audit_sig_uid; |
1332 | 1769 | extern u32 audit_sig_sid; | |
1333 | if (unlikely(audit_pid && t->tgid == audit_pid)) { | 1770 | |
1334 | if (sig == SIGTERM || sig == SIGHUP) { | 1771 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { |
1335 | struct audit_context *ctx = current->audit_context; | 1772 | struct task_struct *tsk = current; |
1336 | audit_sig_pid = current->pid; | 1773 | struct audit_context *ctx = tsk->audit_context; |
1337 | if (ctx) | 1774 | audit_sig_pid = tsk->pid; |
1338 | audit_sig_uid = ctx->loginuid; | 1775 | if (ctx) |
1339 | else | 1776 | audit_sig_uid = ctx->loginuid; |
1340 | audit_sig_uid = current->uid; | 1777 | else |
1341 | } | 1778 | audit_sig_uid = tsk->uid; |
1779 | selinux_get_task_sid(tsk, &audit_sig_sid); | ||
1342 | } | 1780 | } |
1343 | } | 1781 | } |
diff --git a/kernel/capability.c b/kernel/capability.c index 1a4d8a40d3f9..c7685ad00a97 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
@@ -46,7 +46,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) | |||
46 | int ret = 0; | 46 | int ret = 0; |
47 | pid_t pid; | 47 | pid_t pid; |
48 | __u32 version; | 48 | __u32 version; |
49 | task_t *target; | 49 | struct task_struct *target; |
50 | struct __user_cap_data_struct data; | 50 | struct __user_cap_data_struct data; |
51 | 51 | ||
52 | if (get_user(version, &header->version)) | 52 | if (get_user(version, &header->version)) |
@@ -96,7 +96,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, | |||
96 | kernel_cap_t *inheritable, | 96 | kernel_cap_t *inheritable, |
97 | kernel_cap_t *permitted) | 97 | kernel_cap_t *permitted) |
98 | { | 98 | { |
99 | task_t *g, *target; | 99 | struct task_struct *g, *target; |
100 | int ret = -EPERM; | 100 | int ret = -EPERM; |
101 | int found = 0; | 101 | int found = 0; |
102 | 102 | ||
@@ -128,7 +128,7 @@ static inline int cap_set_all(kernel_cap_t *effective, | |||
128 | kernel_cap_t *inheritable, | 128 | kernel_cap_t *inheritable, |
129 | kernel_cap_t *permitted) | 129 | kernel_cap_t *permitted) |
130 | { | 130 | { |
131 | task_t *g, *target; | 131 | struct task_struct *g, *target; |
132 | int ret = -EPERM; | 132 | int ret = -EPERM; |
133 | int found = 0; | 133 | int found = 0; |
134 | 134 | ||
@@ -172,7 +172,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) | |||
172 | { | 172 | { |
173 | kernel_cap_t inheritable, permitted, effective; | 173 | kernel_cap_t inheritable, permitted, effective; |
174 | __u32 version; | 174 | __u32 version; |
175 | task_t *target; | 175 | struct task_struct *target; |
176 | int ret; | 176 | int ret; |
177 | pid_t pid; | 177 | pid_t pid; |
178 | 178 | ||
diff --git a/kernel/compat.c b/kernel/compat.c index c1601a84f8d8..126dee9530aa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/unistd.h> | 21 | #include <linux/unistd.h> |
22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include <linux/timex.h> | 23 | #include <linux/timex.h> |
24 | #include <linux/migrate.h> | ||
24 | 25 | ||
25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
26 | 27 | ||
@@ -729,17 +730,10 @@ void | |||
729 | sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | 730 | sigset_from_compat (sigset_t *set, compat_sigset_t *compat) |
730 | { | 731 | { |
731 | switch (_NSIG_WORDS) { | 732 | switch (_NSIG_WORDS) { |
732 | #if defined (__COMPAT_ENDIAN_SWAP__) | ||
733 | case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); | ||
734 | case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); | ||
735 | case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); | ||
736 | case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); | ||
737 | #else | ||
738 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); | 733 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); |
739 | case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); | 734 | case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); |
740 | case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); | 735 | case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); |
741 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); | 736 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); |
742 | #endif | ||
743 | } | 737 | } |
744 | } | 738 | } |
745 | 739 | ||
@@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) | |||
934 | 928 | ||
935 | return ret; | 929 | return ret; |
936 | } | 930 | } |
931 | |||
932 | #ifdef CONFIG_NUMA | ||
933 | asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, | ||
934 | compat_uptr_t __user *pages32, | ||
935 | const int __user *nodes, | ||
936 | int __user *status, | ||
937 | int flags) | ||
938 | { | ||
939 | const void __user * __user *pages; | ||
940 | int i; | ||
941 | |||
942 | pages = compat_alloc_user_space(nr_pages * sizeof(void *)); | ||
943 | for (i = 0; i < nr_pages; i++) { | ||
944 | compat_uptr_t p; | ||
945 | |||
946 | if (get_user(p, pages32 + i) || | ||
947 | put_user(compat_ptr(p), pages + i)) | ||
948 | return -EFAULT; | ||
949 | } | ||
950 | return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); | ||
951 | } | ||
952 | #endif | ||
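compat_sys_move_pages() has to widen the 32-bit pointers in pages32[] into native-sized pointers before calling sys_move_pages(); in the kernel that staging array lives in compat_alloc_user_space(). Outside the kernel the widening loop itself reduces to the following (addresses are made-up example values and are never dereferenced):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t compat_uptr_t;        /* 32-bit user pointer representation */

int main(void)
{
        compat_uptr_t pages32[2] = { 0x1000, 0x2000 };  /* example 32-bit addresses */
        void *pages[2];

        for (int i = 0; i < 2; i++)
                pages[i] = (void *)(uintptr_t)pages32[i];   /* like compat_ptr() */

        for (int i = 0; i < 2; i++)
                printf("pages[%d] = %p\n", i, pages[i]);
        return 0;
}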
diff --git a/kernel/configs.c b/kernel/configs.c index 009e1ebdcb88..f9e31974f4ad 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
@@ -23,7 +23,6 @@ | |||
23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/config.h> | ||
27 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
28 | #include <linux/module.h> | 27 | #include <linux/module.h> |
29 | #include <linux/proc_fs.h> | 28 | #include <linux/proc_fs.h> |
diff --git a/kernel/cpu.c b/kernel/cpu.c index fe2b8d0bfe4c..70fbf2e83766 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -13,12 +13,12 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/kthread.h> | 14 | #include <linux/kthread.h> |
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <asm/semaphore.h> | 16 | #include <linux/mutex.h> |
17 | 17 | ||
18 | /* This protects CPUs going up and down... */ | 18 | /* This protects CPUs going up and down... */ |
19 | static DECLARE_MUTEX(cpucontrol); | 19 | static DEFINE_MUTEX(cpucontrol); |
20 | 20 | ||
21 | static BLOCKING_NOTIFIER_HEAD(cpu_chain); | 21 | static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); |
22 | 22 | ||
23 | #ifdef CONFIG_HOTPLUG_CPU | 23 | #ifdef CONFIG_HOTPLUG_CPU |
24 | static struct task_struct *lock_cpu_hotplug_owner; | 24 | static struct task_struct *lock_cpu_hotplug_owner; |
@@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible) | |||
30 | 30 | ||
31 | if (lock_cpu_hotplug_owner != current) { | 31 | if (lock_cpu_hotplug_owner != current) { |
32 | if (interruptible) | 32 | if (interruptible) |
33 | ret = down_interruptible(&cpucontrol); | 33 | ret = mutex_lock_interruptible(&cpucontrol); |
34 | else | 34 | else |
35 | down(&cpucontrol); | 35 | mutex_lock(&cpucontrol); |
36 | } | 36 | } |
37 | 37 | ||
38 | /* | 38 | /* |
@@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void) | |||
56 | { | 56 | { |
57 | if (--lock_cpu_hotplug_depth == 0) { | 57 | if (--lock_cpu_hotplug_depth == 0) { |
58 | lock_cpu_hotplug_owner = NULL; | 58 | lock_cpu_hotplug_owner = NULL; |
59 | up(&cpucontrol); | 59 | mutex_unlock(&cpucontrol); |
60 | } | 60 | } |
61 | } | 61 | } |
62 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | 62 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); |
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); | |||
69 | #endif /* CONFIG_HOTPLUG_CPU */ | 69 | #endif /* CONFIG_HOTPLUG_CPU */ |
70 | 70 | ||
71 | /* Need to know about CPUs going up/down? */ | 71 | /* Need to know about CPUs going up/down? */ |
72 | int register_cpu_notifier(struct notifier_block *nb) | 72 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
73 | { | 73 | { |
74 | return blocking_notifier_chain_register(&cpu_chain, nb); | 74 | return blocking_notifier_chain_register(&cpu_chain, nb); |
75 | } | 75 | } |
76 | |||
77 | #ifdef CONFIG_HOTPLUG_CPU | ||
78 | |||
76 | EXPORT_SYMBOL(register_cpu_notifier); | 79 | EXPORT_SYMBOL(register_cpu_notifier); |
77 | 80 | ||
78 | void unregister_cpu_notifier(struct notifier_block *nb) | 81 | void unregister_cpu_notifier(struct notifier_block *nb) |
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb) | |||
81 | } | 84 | } |
82 | EXPORT_SYMBOL(unregister_cpu_notifier); | 85 | EXPORT_SYMBOL(unregister_cpu_notifier); |
83 | 86 | ||
84 | #ifdef CONFIG_HOTPLUG_CPU | ||
85 | static inline void check_for_tasks(int cpu) | 87 | static inline void check_for_tasks(int cpu) |
86 | { | 88 | { |
87 | struct task_struct *p; | 89 | struct task_struct *p; |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ab81fdd4572b..c232dc077438 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -18,7 +18,6 @@ | |||
18 | * distribution for more details. | 18 | * distribution for more details. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/config.h> | ||
22 | #include <linux/cpu.h> | 21 | #include <linux/cpu.h> |
23 | #include <linux/cpumask.h> | 22 | #include <linux/cpumask.h> |
24 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
@@ -41,6 +40,7 @@ | |||
41 | #include <linux/rcupdate.h> | 40 | #include <linux/rcupdate.h> |
42 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
43 | #include <linux/seq_file.h> | 42 | #include <linux/seq_file.h> |
43 | #include <linux/security.h> | ||
44 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
45 | #include <linux/smp_lock.h> | 45 | #include <linux/smp_lock.h> |
46 | #include <linux/spinlock.h> | 46 | #include <linux/spinlock.h> |
@@ -392,11 +392,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, | |||
392 | return 0; | 392 | return 0; |
393 | } | 393 | } |
394 | 394 | ||
395 | static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, | 395 | static int cpuset_get_sb(struct file_system_type *fs_type, |
396 | int flags, const char *unused_dev_name, | 396 | int flags, const char *unused_dev_name, |
397 | void *data) | 397 | void *data, struct vfsmount *mnt) |
398 | { | 398 | { |
399 | return get_sb_single(fs_type, flags, data, cpuset_fill_super); | 399 | return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); |
400 | } | 400 | } |
401 | 401 | ||
402 | static struct file_system_type cpuset_fs_type = { | 402 | static struct file_system_type cpuset_fs_type = { |
@@ -1063,7 +1063,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
1063 | } | 1063 | } |
1064 | 1064 | ||
1065 | /* | 1065 | /* |
1066 | * Frequency meter - How fast is some event occuring? | 1066 | * Frequency meter - How fast is some event occurring? |
1067 | * | 1067 | * |
1068 | * These routines manage a digitally filtered, constant time based, | 1068 | * These routines manage a digitally filtered, constant time based, |
1069 | * event frequency meter. There are four routines: | 1069 | * event frequency meter. There are four routines: |
@@ -1177,6 +1177,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
1177 | cpumask_t cpus; | 1177 | cpumask_t cpus; |
1178 | nodemask_t from, to; | 1178 | nodemask_t from, to; |
1179 | struct mm_struct *mm; | 1179 | struct mm_struct *mm; |
1180 | int retval; | ||
1180 | 1181 | ||
1181 | if (sscanf(pidbuf, "%d", &pid) != 1) | 1182 | if (sscanf(pidbuf, "%d", &pid) != 1) |
1182 | return -EIO; | 1183 | return -EIO; |
@@ -1205,6 +1206,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
1205 | get_task_struct(tsk); | 1206 | get_task_struct(tsk); |
1206 | } | 1207 | } |
1207 | 1208 | ||
1209 | retval = security_task_setscheduler(tsk, 0, NULL); | ||
1210 | if (retval) { | ||
1211 | put_task_struct(tsk); | ||
1212 | return retval; | ||
1213 | } | ||
1214 | |||
1208 | mutex_lock(&callback_mutex); | 1215 | mutex_lock(&callback_mutex); |
1209 | 1216 | ||
1210 | task_lock(tsk); | 1217 | task_lock(tsk); |
@@ -2434,31 +2441,43 @@ void __cpuset_memory_pressure_bump(void) | |||
2434 | */ | 2441 | */ |
2435 | static int proc_cpuset_show(struct seq_file *m, void *v) | 2442 | static int proc_cpuset_show(struct seq_file *m, void *v) |
2436 | { | 2443 | { |
2444 | struct pid *pid; | ||
2437 | struct task_struct *tsk; | 2445 | struct task_struct *tsk; |
2438 | char *buf; | 2446 | char *buf; |
2439 | int retval = 0; | 2447 | int retval; |
2440 | 2448 | ||
2449 | retval = -ENOMEM; | ||
2441 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | 2450 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
2442 | if (!buf) | 2451 | if (!buf) |
2443 | return -ENOMEM; | 2452 | goto out; |
2444 | 2453 | ||
2445 | tsk = m->private; | 2454 | retval = -ESRCH; |
2455 | pid = m->private; | ||
2456 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
2457 | if (!tsk) | ||
2458 | goto out_free; | ||
2459 | |||
2460 | retval = -EINVAL; | ||
2446 | mutex_lock(&manage_mutex); | 2461 | mutex_lock(&manage_mutex); |
2462 | |||
2447 | retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); | 2463 | retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); |
2448 | if (retval < 0) | 2464 | if (retval < 0) |
2449 | goto out; | 2465 | goto out_unlock; |
2450 | seq_puts(m, buf); | 2466 | seq_puts(m, buf); |
2451 | seq_putc(m, '\n'); | 2467 | seq_putc(m, '\n'); |
2452 | out: | 2468 | out_unlock: |
2453 | mutex_unlock(&manage_mutex); | 2469 | mutex_unlock(&manage_mutex); |
2470 | put_task_struct(tsk); | ||
2471 | out_free: | ||
2454 | kfree(buf); | 2472 | kfree(buf); |
2473 | out: | ||
2455 | return retval; | 2474 | return retval; |
2456 | } | 2475 | } |
2457 | 2476 | ||
2458 | static int cpuset_open(struct inode *inode, struct file *file) | 2477 | static int cpuset_open(struct inode *inode, struct file *file) |
2459 | { | 2478 | { |
2460 | struct task_struct *tsk = PROC_I(inode)->task; | 2479 | struct pid *pid = PROC_I(inode)->pid; |
2461 | return single_open(file, proc_cpuset_show, tsk); | 2480 | return single_open(file, proc_cpuset_show, pid); |
2462 | } | 2481 | } |
2463 | 2482 | ||
2464 | struct file_operations proc_cpuset_operations = { | 2483 | struct file_operations proc_cpuset_operations = { |
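The reworked proc_cpuset_show() above adopts the usual kernel error-unwinding style: preload the error code, then bail out through labels that release resources in reverse order of acquisition. A generic userspace sketch of that idiom, where malloc() and a flag stand in for the buffer and the task reference:

#include <stdio.h>
#include <stdlib.h>

static int do_show(int have_task)
{
        int retval;
        char *buf;

        retval = -12;                       /* -ENOMEM */
        buf = malloc(4096);
        if (!buf)
                goto out;

        retval = -3;                        /* -ESRCH: no such task */
        if (!have_task)
                goto out_free;

        retval = 0;                         /* success path */
        printf("success: buffer at %p\n", (void *)buf);

out_free:
        free(buf);
out:
        return retval;
}

int main(void)
{
        printf("ok run: %d\n", do_show(1));
        printf("missing task: %d\n", do_show(0));
        return 0;
}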
diff --git a/kernel/delayacct.c b/kernel/delayacct.c new file mode 100644 index 000000000000..f05392d64267 --- /dev/null +++ b/kernel/delayacct.c | |||
@@ -0,0 +1,178 @@ | |||
1 | /* delayacct.c - per-task delay accounting | ||
2 | * | ||
3 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License as published by | ||
7 | * the Free Software Foundation; either version 2 of the License, or | ||
8 | * (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it would be useful, but | ||
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
13 | * the GNU General Public License for more details. | ||
14 | */ | ||
15 | |||
16 | #include <linux/sched.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/time.h> | ||
19 | #include <linux/sysctl.h> | ||
20 | #include <linux/delayacct.h> | ||
21 | |||
22 | int delayacct_on __read_mostly; /* Delay accounting turned on/off */ | ||
23 | kmem_cache_t *delayacct_cache; | ||
24 | |||
25 | static int __init delayacct_setup_enable(char *str) | ||
26 | { | ||
27 | delayacct_on = 1; | ||
28 | return 1; | ||
29 | } | ||
30 | __setup("delayacct", delayacct_setup_enable); | ||
31 | |||
32 | void delayacct_init(void) | ||
33 | { | ||
34 | delayacct_cache = kmem_cache_create("delayacct_cache", | ||
35 | sizeof(struct task_delay_info), | ||
36 | 0, | ||
37 | SLAB_PANIC, | ||
38 | NULL, NULL); | ||
39 | delayacct_tsk_init(&init_task); | ||
40 | } | ||
41 | |||
42 | void __delayacct_tsk_init(struct task_struct *tsk) | ||
43 | { | ||
44 | spin_lock_init(&tsk->delays_lock); | ||
45 | /* No need to acquire tsk->delays_lock for allocation here unless | ||
46 | __delayacct_tsk_init is called after tsk is attached to tasklist | ||
47 | */ | ||
48 | tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); | ||
49 | if (tsk->delays) | ||
50 | spin_lock_init(&tsk->delays->lock); | ||
51 | } | ||
52 | |||
53 | void __delayacct_tsk_exit(struct task_struct *tsk) | ||
54 | { | ||
55 | struct task_delay_info *delays = tsk->delays; | ||
56 | spin_lock(&tsk->delays_lock); | ||
57 | tsk->delays = NULL; | ||
58 | spin_unlock(&tsk->delays_lock); | ||
59 | kmem_cache_free(delayacct_cache, delays); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * Start accounting for a delay statistic using | ||
64 | * its starting timestamp (@start) | ||
65 | */ | ||
66 | |||
67 | static inline void delayacct_start(struct timespec *start) | ||
68 | { | ||
69 | do_posix_clock_monotonic_gettime(start); | ||
70 | } | ||
71 | |||
72 | /* | ||
73 | * Finish delay accounting for a statistic using | ||
74 | * its timestamps (@start, @end), accumulator (@total) and @count | ||
75 | */ | ||
76 | |||
77 | static void delayacct_end(struct timespec *start, struct timespec *end, | ||
78 | u64 *total, u32 *count) | ||
79 | { | ||
80 | struct timespec ts; | ||
81 | s64 ns; | ||
82 | |||
83 | do_posix_clock_monotonic_gettime(end); | ||
84 | ts = timespec_sub(*end, *start); | ||
85 | ns = timespec_to_ns(&ts); | ||
86 | if (ns < 0) | ||
87 | return; | ||
88 | |||
89 | spin_lock(¤t->delays->lock); | ||
90 | *total += ns; | ||
91 | (*count)++; | ||
92 | spin_unlock(¤t->delays->lock); | ||
93 | } | ||
94 | |||
95 | void __delayacct_blkio_start(void) | ||
96 | { | ||
97 | delayacct_start(¤t->delays->blkio_start); | ||
98 | } | ||
99 | |||
100 | void __delayacct_blkio_end(void) | ||
101 | { | ||
102 | if (current->delays->flags & DELAYACCT_PF_SWAPIN) | ||
103 | /* Swapin block I/O */ | ||
104 | delayacct_end(¤t->delays->blkio_start, | ||
105 | ¤t->delays->blkio_end, | ||
106 | ¤t->delays->swapin_delay, | ||
107 | ¤t->delays->swapin_count); | ||
108 | else /* Other block I/O */ | ||
109 | delayacct_end(¤t->delays->blkio_start, | ||
110 | ¤t->delays->blkio_end, | ||
111 | ¤t->delays->blkio_delay, | ||
112 | ¤t->delays->blkio_count); | ||
113 | } | ||
114 | |||
115 | int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | ||
116 | { | ||
117 | s64 tmp; | ||
118 | struct timespec ts; | ||
119 | unsigned long t1,t2,t3; | ||
120 | |||
121 | spin_lock(&tsk->delays_lock); | ||
122 | |||
123 | /* Though tsk->delays is accessed later, early exit avoids | ||
124 | * unnecessary returning of other data | ||
125 | */ | ||
126 | if (!tsk->delays) | ||
127 | goto done; | ||
128 | |||
129 | tmp = (s64)d->cpu_run_real_total; | ||
130 | cputime_to_timespec(tsk->utime + tsk->stime, &ts); | ||
131 | tmp += timespec_to_ns(&ts); | ||
132 | d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; | ||
133 | |||
134 | /* | ||
135 | * No locking available for sched_info (and too expensive to add one) | ||
136 | * Mitigate by taking snapshot of values | ||
137 | */ | ||
138 | t1 = tsk->sched_info.pcnt; | ||
139 | t2 = tsk->sched_info.run_delay; | ||
140 | t3 = tsk->sched_info.cpu_time; | ||
141 | |||
142 | d->cpu_count += t1; | ||
143 | |||
144 | jiffies_to_timespec(t2, &ts); | ||
145 | tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); | ||
146 | d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; | ||
147 | |||
148 | tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; | ||
149 | d->cpu_run_virtual_total = | ||
150 | (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; | ||
151 | |||
152 | /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ | ||
153 | |||
154 | spin_lock(&tsk->delays->lock); | ||
155 | tmp = d->blkio_delay_total + tsk->delays->blkio_delay; | ||
156 | d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; | ||
157 | tmp = d->swapin_delay_total + tsk->delays->swapin_delay; | ||
158 | d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; | ||
159 | d->blkio_count += tsk->delays->blkio_count; | ||
160 | d->swapin_count += tsk->delays->swapin_count; | ||
161 | spin_unlock(&tsk->delays->lock); | ||
162 | |||
163 | done: | ||
164 | spin_unlock(&tsk->delays_lock); | ||
165 | return 0; | ||
166 | } | ||
167 | |||
168 | __u64 __delayacct_blkio_ticks(struct task_struct *tsk) | ||
169 | { | ||
170 | __u64 ret; | ||
171 | |||
172 | spin_lock(&tsk->delays->lock); | ||
173 | ret = nsec_to_clock_t(tsk->delays->blkio_delay + | ||
174 | tsk->delays->swapin_delay); | ||
175 | spin_unlock(&tsk->delays->lock); | ||
176 | return ret; | ||
177 | } | ||
178 | |||
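delayacct_start()/delayacct_end() above simply bracket a wait with monotonic timestamps and fold the difference into a running total plus a count. A userspace equivalent using clock_gettime(CLOCK_MONOTONIC); the spinlock is omitted since only one thread updates the counters here:

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

static uint64_t total_ns;   /* accumulated delay, like blkio_delay */
static uint32_t count;      /* number of delays, like blkio_count */

static void delay_end(const struct timespec *start)
{
        struct timespec end;
        int64_t ns;

        clock_gettime(CLOCK_MONOTONIC, &end);
        ns = (end.tv_sec - start->tv_sec) * 1000000000LL +
             (end.tv_nsec - start->tv_nsec);
        if (ns < 0)
                return;             /* clock anomaly: drop the sample */
        total_ns += (uint64_t)ns;
        count++;
}

int main(void)
{
        struct timespec start;

        clock_gettime(CLOCK_MONOTONIC, &start);     /* delayacct_start() */
        usleep(10000);                              /* stand-in for block I/O */
        delay_end(&start);
        printf("%u delays, %llu ns total\n", count,
               (unsigned long long)total_ns);
        return 0;
}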
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c01cead2cfd6..3c2eaea66b1e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
@@ -7,7 +7,6 @@ | |||
7 | * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) | 7 | * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/config.h> | ||
11 | #include <linux/init.h> | 10 | #include <linux/init.h> |
12 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
13 | #include <linux/kmod.h> | 12 | #include <linux/kmod.h> |
diff --git a/kernel/exit.c b/kernel/exit.c index e95b93282210..dba194a8d416 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -4,7 +4,6 @@ | |||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/config.h> | ||
8 | #include <linux/mm.h> | 7 | #include <linux/mm.h> |
9 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
10 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
@@ -26,6 +25,8 @@ | |||
26 | #include <linux/mount.h> | 25 | #include <linux/mount.h> |
27 | #include <linux/proc_fs.h> | 26 | #include <linux/proc_fs.h> |
28 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
28 | #include <linux/taskstats_kern.h> | ||
29 | #include <linux/delayacct.h> | ||
29 | #include <linux/cpuset.h> | 30 | #include <linux/cpuset.h> |
30 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
31 | #include <linux/signal.h> | 32 | #include <linux/signal.h> |
@@ -36,6 +37,7 @@ | |||
36 | #include <linux/compat.h> | 37 | #include <linux/compat.h> |
37 | #include <linux/pipe_fs_i.h> | 38 | #include <linux/pipe_fs_i.h> |
38 | #include <linux/audit.h> /* for audit_free() */ | 39 | #include <linux/audit.h> /* for audit_free() */ |
40 | #include <linux/resource.h> | ||
39 | 41 | ||
40 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
41 | #include <asm/unistd.h> | 43 | #include <asm/unistd.h> |
@@ -45,8 +47,6 @@ | |||
45 | extern void sem_exit (void); | 47 | extern void sem_exit (void); |
46 | extern struct task_struct *child_reaper; | 48 | extern struct task_struct *child_reaper; |
47 | 49 | ||
48 | int getrusage(struct task_struct *, int, struct rusage __user *); | ||
49 | |||
50 | static void exit_mm(struct task_struct * tsk); | 50 | static void exit_mm(struct task_struct * tsk); |
51 | 51 | ||
52 | static void __unhash_process(struct task_struct *p) | 52 | static void __unhash_process(struct task_struct *p) |
@@ -136,14 +136,10 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
136 | 136 | ||
137 | void release_task(struct task_struct * p) | 137 | void release_task(struct task_struct * p) |
138 | { | 138 | { |
139 | struct task_struct *leader; | ||
139 | int zap_leader; | 140 | int zap_leader; |
140 | task_t *leader; | ||
141 | struct dentry *proc_dentry; | ||
142 | |||
143 | repeat: | 141 | repeat: |
144 | atomic_dec(&p->user->processes); | 142 | atomic_dec(&p->user->processes); |
145 | spin_lock(&p->proc_lock); | ||
146 | proc_dentry = proc_pid_unhash(p); | ||
147 | write_lock_irq(&tasklist_lock); | 143 | write_lock_irq(&tasklist_lock); |
148 | ptrace_unlink(p); | 144 | ptrace_unlink(p); |
149 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); | 145 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); |
@@ -172,8 +168,7 @@ repeat: | |||
172 | 168 | ||
173 | sched_exit(p); | 169 | sched_exit(p); |
174 | write_unlock_irq(&tasklist_lock); | 170 | write_unlock_irq(&tasklist_lock); |
175 | spin_unlock(&p->proc_lock); | 171 | proc_flush_task(p); |
176 | proc_pid_flush(proc_dentry); | ||
177 | release_thread(p); | 172 | release_thread(p); |
178 | call_rcu(&p->rcu, delayed_put_task_struct); | 173 | call_rcu(&p->rcu, delayed_put_task_struct); |
179 | 174 | ||
@@ -216,7 +211,7 @@ out: | |||
216 | * | 211 | * |
217 | * "I ask you, have you ever known what it is to be an orphan?" | 212 | * "I ask you, have you ever known what it is to be an orphan?" |
218 | */ | 213 | */ |
219 | static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) | 214 | static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) |
220 | { | 215 | { |
221 | struct task_struct *p; | 216 | struct task_struct *p; |
222 | int ret = 1; | 217 | int ret = 1; |
@@ -579,7 +574,7 @@ static void exit_mm(struct task_struct * tsk) | |||
579 | down_read(&mm->mmap_sem); | 574 | down_read(&mm->mmap_sem); |
580 | } | 575 | } |
581 | atomic_inc(&mm->mm_count); | 576 | atomic_inc(&mm->mm_count); |
582 | if (mm != tsk->active_mm) BUG(); | 577 | BUG_ON(mm != tsk->active_mm); |
583 | /* more a memory barrier than a real lock */ | 578 | /* more a memory barrier than a real lock */ |
584 | task_lock(tsk); | 579 | task_lock(tsk); |
585 | tsk->mm = NULL; | 580 | tsk->mm = NULL; |
@@ -589,7 +584,8 @@ static void exit_mm(struct task_struct * tsk) | |||
589 | mmput(mm); | 584 | mmput(mm); |
590 | } | 585 | } |
591 | 586 | ||
592 | static inline void choose_new_parent(task_t *p, task_t *reaper) | 587 | static inline void |
588 | choose_new_parent(struct task_struct *p, struct task_struct *reaper) | ||
593 | { | 589 | { |
594 | /* | 590 | /* |
595 | * Make sure we're not reparenting to ourselves and that | 591 | * Make sure we're not reparenting to ourselves and that |
@@ -599,7 +595,8 @@ static inline void choose_new_parent(task_t *p, task_t *reaper) | |||
599 | p->real_parent = reaper; | 595 | p->real_parent = reaper; |
600 | } | 596 | } |
601 | 597 | ||
602 | static void reparent_thread(task_t *p, task_t *father, int traced) | 598 | static void |
599 | reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | ||
603 | { | 600 | { |
604 | /* We don't want people slaying init. */ | 601 | /* We don't want people slaying init. */ |
605 | if (p->exit_signal != -1) | 602 | if (p->exit_signal != -1) |
@@ -663,8 +660,8 @@ static void reparent_thread(task_t *p, task_t *father, int traced) | |||
663 | * group, and if no such member exists, give it to | 660 | * group, and if no such member exists, give it to |
664 | * the global child reaper process (ie "init") | 661 | * the global child reaper process (ie "init") |
665 | */ | 662 | */ |
666 | static void forget_original_parent(struct task_struct * father, | 663 | static void |
667 | struct list_head *to_release) | 664 | forget_original_parent(struct task_struct *father, struct list_head *to_release) |
668 | { | 665 | { |
669 | struct task_struct *p, *reaper = father; | 666 | struct task_struct *p, *reaper = father; |
670 | struct list_head *_p, *_n; | 667 | struct list_head *_p, *_n; |
@@ -687,7 +684,7 @@ static void forget_original_parent(struct task_struct * father, | |||
687 | */ | 684 | */ |
688 | list_for_each_safe(_p, _n, &father->children) { | 685 | list_for_each_safe(_p, _n, &father->children) { |
689 | int ptrace; | 686 | int ptrace; |
690 | p = list_entry(_p,struct task_struct,sibling); | 687 | p = list_entry(_p, struct task_struct, sibling); |
691 | 688 | ||
692 | ptrace = p->ptrace; | 689 | ptrace = p->ptrace; |
693 | 690 | ||
@@ -716,7 +713,7 @@ static void forget_original_parent(struct task_struct * father, | |||
716 | list_add(&p->ptrace_list, to_release); | 713 | list_add(&p->ptrace_list, to_release); |
717 | } | 714 | } |
718 | list_for_each_safe(_p, _n, &father->ptrace_children) { | 715 | list_for_each_safe(_p, _n, &father->ptrace_children) { |
719 | p = list_entry(_p,struct task_struct,ptrace_list); | 716 | p = list_entry(_p, struct task_struct, ptrace_list); |
720 | choose_new_parent(p, reaper); | 717 | choose_new_parent(p, reaper); |
721 | reparent_thread(p, father, 1); | 718 | reparent_thread(p, father, 1); |
722 | } | 719 | } |
@@ -836,7 +833,7 @@ static void exit_notify(struct task_struct *tsk) | |||
836 | 833 | ||
837 | list_for_each_safe(_p, _n, &ptrace_dead) { | 834 | list_for_each_safe(_p, _n, &ptrace_dead) { |
838 | list_del_init(_p); | 835 | list_del_init(_p); |
839 | t = list_entry(_p,struct task_struct,ptrace_list); | 836 | t = list_entry(_p, struct task_struct, ptrace_list); |
840 | release_task(t); | 837 | release_task(t); |
841 | } | 838 | } |
842 | 839 | ||
@@ -848,7 +845,9 @@ static void exit_notify(struct task_struct *tsk) | |||
848 | fastcall NORET_TYPE void do_exit(long code) | 845 | fastcall NORET_TYPE void do_exit(long code) |
849 | { | 846 | { |
850 | struct task_struct *tsk = current; | 847 | struct task_struct *tsk = current; |
848 | struct taskstats *tidstats; | ||
851 | int group_dead; | 849 | int group_dead; |
850 | unsigned int mycpu; | ||
852 | 851 | ||
853 | profile_task_exit(tsk); | 852 | profile_task_exit(tsk); |
854 | 853 | ||
@@ -881,19 +880,13 @@ fastcall NORET_TYPE void do_exit(long code) | |||
881 | 880 | ||
882 | tsk->flags |= PF_EXITING; | 881 | tsk->flags |= PF_EXITING; |
883 | 882 | ||
884 | /* | ||
885 | * Make sure we don't try to process any timer firings | ||
886 | * while we are already exiting. | ||
887 | */ | ||
888 | tsk->it_virt_expires = cputime_zero; | ||
889 | tsk->it_prof_expires = cputime_zero; | ||
890 | tsk->it_sched_expires = 0; | ||
891 | |||
892 | if (unlikely(in_atomic())) | 883 | if (unlikely(in_atomic())) |
893 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 884 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", |
894 | current->comm, current->pid, | 885 | current->comm, current->pid, |
895 | preempt_count()); | 886 | preempt_count()); |
896 | 887 | ||
888 | taskstats_exit_alloc(&tidstats, &mycpu); | ||
889 | |||
897 | acct_update_integrals(tsk); | 890 | acct_update_integrals(tsk); |
898 | if (tsk->mm) { | 891 | if (tsk->mm) { |
899 | update_hiwater_rss(tsk->mm); | 892 | update_hiwater_rss(tsk->mm); |
@@ -903,18 +896,24 @@ fastcall NORET_TYPE void do_exit(long code) | |||
903 | if (group_dead) { | 896 | if (group_dead) { |
904 | hrtimer_cancel(&tsk->signal->real_timer); | 897 | hrtimer_cancel(&tsk->signal->real_timer); |
905 | exit_itimers(tsk->signal); | 898 | exit_itimers(tsk->signal); |
906 | acct_process(code); | ||
907 | } | 899 | } |
900 | acct_collect(code, group_dead); | ||
908 | if (unlikely(tsk->robust_list)) | 901 | if (unlikely(tsk->robust_list)) |
909 | exit_robust_list(tsk); | 902 | exit_robust_list(tsk); |
910 | #ifdef CONFIG_COMPAT | 903 | #if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) |
911 | if (unlikely(tsk->compat_robust_list)) | 904 | if (unlikely(tsk->compat_robust_list)) |
912 | compat_exit_robust_list(tsk); | 905 | compat_exit_robust_list(tsk); |
913 | #endif | 906 | #endif |
914 | if (unlikely(tsk->audit_context)) | 907 | if (unlikely(tsk->audit_context)) |
915 | audit_free(tsk); | 908 | audit_free(tsk); |
909 | taskstats_exit_send(tsk, tidstats, group_dead, mycpu); | ||
910 | taskstats_exit_free(tidstats); | ||
911 | delayacct_tsk_exit(tsk); | ||
912 | |||
916 | exit_mm(tsk); | 913 | exit_mm(tsk); |
917 | 914 | ||
915 | if (group_dead) | ||
916 | acct_process(); | ||
918 | exit_sem(tsk); | 917 | exit_sem(tsk); |
919 | __exit_files(tsk); | 918 | __exit_files(tsk); |
920 | __exit_fs(tsk); | 919 | __exit_fs(tsk); |
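
The accounting calls added to do_exit() above (taskstats_exit_alloc()/taskstats_exit_send()/taskstats_exit_free() and delayacct_tsk_exit()) come from the newly included <linux/taskstats_kern.h> and <linux/delayacct.h>. A sketch of how such hooks are normally made free when the option is configured out is below; the prototypes are inferred from the call sites and the stub bodies are an assumption, not quoted from the header.

/*
 * Illustrative sketch only: prototypes inferred from the do_exit()
 * call sites above.  When CONFIG_TASKSTATS is off, the hooks are
 * expected to collapse into empty static inlines so the exit path
 * pays nothing.
 */
#ifdef CONFIG_TASKSTATS
extern void taskstats_exit_alloc(struct taskstats **tidstats, unsigned int *mycpu);
extern void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
				int group_dead, unsigned int cpu);
extern void taskstats_exit_free(struct taskstats *tidstats);
#else
static inline void taskstats_exit_alloc(struct taskstats **tidstats, unsigned int *mycpu)
{ *tidstats = NULL; }
static inline void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
				       int group_dead, unsigned int cpu)
{ }
static inline void taskstats_exit_free(struct taskstats *tidstats)
{ }
#endif
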
@@ -938,9 +937,17 @@ fastcall NORET_TYPE void do_exit(long code) | |||
938 | tsk->mempolicy = NULL; | 937 | tsk->mempolicy = NULL; |
939 | #endif | 938 | #endif |
940 | /* | 939 | /* |
941 | * If DEBUG_MUTEXES is on, make sure we are holding no locks: | 940 | * This must happen late, after the PID is not |
941 | * hashed anymore: | ||
942 | */ | 942 | */ |
943 | mutex_debug_check_no_locks_held(tsk); | 943 | if (unlikely(!list_empty(&tsk->pi_state_list))) |
944 | exit_pi_state_list(tsk); | ||
945 | if (unlikely(current->pi_state_cache)) | ||
946 | kfree(current->pi_state_cache); | ||
947 | /* | ||
948 | * Make sure we are holding no locks: | ||
949 | */ | ||
950 | debug_check_no_locks_held(tsk); | ||
944 | 951 | ||
945 | if (tsk->io_context) | 952 | if (tsk->io_context) |
946 | exit_io_context(); | 953 | exit_io_context(); |
@@ -1015,7 +1022,7 @@ asmlinkage void sys_exit_group(int error_code) | |||
1015 | do_group_exit((error_code & 0xff) << 8); | 1022 | do_group_exit((error_code & 0xff) << 8); |
1016 | } | 1023 | } |
1017 | 1024 | ||
1018 | static int eligible_child(pid_t pid, int options, task_t *p) | 1025 | static int eligible_child(pid_t pid, int options, struct task_struct *p) |
1019 | { | 1026 | { |
1020 | if (pid > 0) { | 1027 | if (pid > 0) { |
1021 | if (p->pid != pid) | 1028 | if (p->pid != pid) |
@@ -1056,12 +1063,13 @@ static int eligible_child(pid_t pid, int options, task_t *p) | |||
1056 | return 1; | 1063 | return 1; |
1057 | } | 1064 | } |
1058 | 1065 | ||
1059 | static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, | 1066 | static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, |
1060 | int why, int status, | 1067 | int why, int status, |
1061 | struct siginfo __user *infop, | 1068 | struct siginfo __user *infop, |
1062 | struct rusage __user *rusagep) | 1069 | struct rusage __user *rusagep) |
1063 | { | 1070 | { |
1064 | int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; | 1071 | int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; |
1072 | |||
1065 | put_task_struct(p); | 1073 | put_task_struct(p); |
1066 | if (!retval) | 1074 | if (!retval) |
1067 | retval = put_user(SIGCHLD, &infop->si_signo); | 1075 | retval = put_user(SIGCHLD, &infop->si_signo); |
@@ -1086,7 +1094,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, | |||
1086 | * the lock and this task is uninteresting. If we return nonzero, we have | 1094 | * the lock and this task is uninteresting. If we return nonzero, we have |
1087 | * released the lock and the system call should return. | 1095 | * released the lock and the system call should return. |
1088 | */ | 1096 | */ |
1089 | static int wait_task_zombie(task_t *p, int noreap, | 1097 | static int wait_task_zombie(struct task_struct *p, int noreap, |
1090 | struct siginfo __user *infop, | 1098 | struct siginfo __user *infop, |
1091 | int __user *stat_addr, struct rusage __user *ru) | 1099 | int __user *stat_addr, struct rusage __user *ru) |
1092 | { | 1100 | { |
@@ -1248,8 +1256,8 @@ static int wait_task_zombie(task_t *p, int noreap, | |||
1248 | * the lock and this task is uninteresting. If we return nonzero, we have | 1256 | * the lock and this task is uninteresting. If we return nonzero, we have |
1249 | * released the lock and the system call should return. | 1257 | * released the lock and the system call should return. |
1250 | */ | 1258 | */ |
1251 | static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, | 1259 | static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, |
1252 | struct siginfo __user *infop, | 1260 | int noreap, struct siginfo __user *infop, |
1253 | int __user *stat_addr, struct rusage __user *ru) | 1261 | int __user *stat_addr, struct rusage __user *ru) |
1254 | { | 1262 | { |
1255 | int retval, exit_code; | 1263 | int retval, exit_code; |
@@ -1363,7 +1371,7 @@ bail_ref: | |||
1363 | * the lock and this task is uninteresting. If we return nonzero, we have | 1371 | * the lock and this task is uninteresting. If we return nonzero, we have |
1364 | * released the lock and the system call should return. | 1372 | * released the lock and the system call should return. |
1365 | */ | 1373 | */ |
1366 | static int wait_task_continued(task_t *p, int noreap, | 1374 | static int wait_task_continued(struct task_struct *p, int noreap, |
1367 | struct siginfo __user *infop, | 1375 | struct siginfo __user *infop, |
1368 | int __user *stat_addr, struct rusage __user *ru) | 1376 | int __user *stat_addr, struct rusage __user *ru) |
1369 | { | 1377 | { |
@@ -1449,7 +1457,7 @@ repeat: | |||
1449 | int ret; | 1457 | int ret; |
1450 | 1458 | ||
1451 | list_for_each(_p,&tsk->children) { | 1459 | list_for_each(_p,&tsk->children) { |
1452 | p = list_entry(_p,struct task_struct,sibling); | 1460 | p = list_entry(_p, struct task_struct, sibling); |
1453 | 1461 | ||
1454 | ret = eligible_child(pid, options, p); | 1462 | ret = eligible_child(pid, options, p); |
1455 | if (!ret) | 1463 | if (!ret) |
@@ -1538,8 +1546,7 @@ check_continued: | |||
1538 | if (options & __WNOTHREAD) | 1546 | if (options & __WNOTHREAD) |
1539 | break; | 1547 | break; |
1540 | tsk = next_thread(tsk); | 1548 | tsk = next_thread(tsk); |
1541 | if (tsk->signal != current->signal) | 1549 | BUG_ON(tsk->signal != current->signal); |
1542 | BUG(); | ||
1543 | } while (tsk != current); | 1550 | } while (tsk != current); |
1544 | 1551 | ||
1545 | read_unlock(&tasklist_lock); | 1552 | read_unlock(&tasklist_lock); |
diff --git a/kernel/fork.c b/kernel/fork.c index ac8100e3088a..1b0f7b1e0881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -11,7 +11,6 @@ | |||
11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' | 11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/config.h> | ||
15 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
16 | #include <linux/init.h> | 15 | #include <linux/init.h> |
17 | #include <linux/unistd.h> | 16 | #include <linux/unistd.h> |
@@ -44,6 +43,8 @@ | |||
44 | #include <linux/rmap.h> | 43 | #include <linux/rmap.h> |
45 | #include <linux/acct.h> | 44 | #include <linux/acct.h> |
46 | #include <linux/cn_proc.h> | 45 | #include <linux/cn_proc.h> |
46 | #include <linux/delayacct.h> | ||
47 | #include <linux/taskstats_kern.h> | ||
47 | 48 | ||
48 | #include <asm/pgtable.h> | 49 | #include <asm/pgtable.h> |
49 | #include <asm/pgalloc.h> | 50 | #include <asm/pgalloc.h> |
@@ -62,9 +63,7 @@ int max_threads; /* tunable limit on nr_threads */ | |||
62 | 63 | ||
63 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; | 64 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; |
64 | 65 | ||
65 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ | 66 | __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ |
66 | |||
67 | EXPORT_SYMBOL(tasklist_lock); | ||
68 | 67 | ||
69 | int nr_processes(void) | 68 | int nr_processes(void) |
70 | { | 69 | { |
@@ -104,6 +103,7 @@ static kmem_cache_t *mm_cachep; | |||
104 | void free_task(struct task_struct *tsk) | 103 | void free_task(struct task_struct *tsk) |
105 | { | 104 | { |
106 | free_thread_info(tsk->thread_info); | 105 | free_thread_info(tsk->thread_info); |
106 | rt_mutex_debug_task_free(tsk); | ||
107 | free_task_struct(tsk); | 107 | free_task_struct(tsk); |
108 | } | 108 | } |
109 | EXPORT_SYMBOL(free_task); | 109 | EXPORT_SYMBOL(free_task); |
@@ -193,7 +193,10 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
193 | 193 | ||
194 | down_write(&oldmm->mmap_sem); | 194 | down_write(&oldmm->mmap_sem); |
195 | flush_cache_mm(oldmm); | 195 | flush_cache_mm(oldmm); |
196 | down_write(&mm->mmap_sem); | 196 | /* |
197 | * Not linked in yet - no deadlock potential: | ||
198 | */ | ||
199 | down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); | ||
197 | 200 | ||
198 | mm->locked_vm = 0; | 201 | mm->locked_vm = 0; |
199 | mm->mmap = NULL; | 202 | mm->mmap = NULL; |
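
The dup_mmap() hunk above is a first user of down_write_nested(): both mmap_sems belong to the same lock class, so without the SINGLE_DEPTH_NESTING annotation the lock validator would flag the second acquisition as a possible recursive deadlock, even though the child mm is not reachable by anyone else yet. A minimal sketch of the same annotation idiom, using a hypothetical structure of my own rather than anything from this patch:

/*
 * Hypothetical example, not from this patch.  Copying one object into a
 * brand-new object of the same type takes two same-class locks; the
 * second acquisition is annotated so lockdep treats it as a deliberate,
 * ordered nesting rather than recursion.
 */
struct buf {
	struct rw_semaphore	sem;
	size_t			len;
	char			data[128];
};

static void buf_clone(struct buf *dst, struct buf *src)
{
	down_write(&src->sem);
	/* dst is private to this caller, so no deadlock is possible: */
	down_write_nested(&dst->sem, SINGLE_DEPTH_NESTING);

	dst->len = src->len;
	memcpy(dst->data, src->data, src->len);

	up_write(&dst->sem);
	up_write(&src->sem);
}
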
@@ -368,6 +371,8 @@ void fastcall __mmdrop(struct mm_struct *mm) | |||
368 | */ | 371 | */ |
369 | void mmput(struct mm_struct *mm) | 372 | void mmput(struct mm_struct *mm) |
370 | { | 373 | { |
374 | might_sleep(); | ||
375 | |||
371 | if (atomic_dec_and_test(&mm->mm_users)) { | 376 | if (atomic_dec_and_test(&mm->mm_users)) { |
372 | exit_aio(mm); | 377 | exit_aio(mm); |
373 | exit_mmap(mm); | 378 | exit_mmap(mm); |
@@ -623,6 +628,7 @@ out: | |||
623 | /* | 628 | /* |
624 | * Allocate a new files structure and copy contents from the | 629 | * Allocate a new files structure and copy contents from the |
625 | * passed in files structure. | 630 | * passed in files structure. |
631 | * errorp will be valid only when the returned files_struct is NULL. | ||
626 | */ | 632 | */ |
627 | static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | 633 | static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) |
628 | { | 634 | { |
@@ -631,6 +637,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
631 | int open_files, size, i, expand; | 637 | int open_files, size, i, expand; |
632 | struct fdtable *old_fdt, *new_fdt; | 638 | struct fdtable *old_fdt, *new_fdt; |
633 | 639 | ||
640 | *errorp = -ENOMEM; | ||
634 | newf = alloc_files(); | 641 | newf = alloc_files(); |
635 | if (!newf) | 642 | if (!newf) |
636 | goto out; | 643 | goto out; |
@@ -744,7 +751,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |||
744 | * break this. | 751 | * break this. |
745 | */ | 752 | */ |
746 | tsk->files = NULL; | 753 | tsk->files = NULL; |
747 | error = -ENOMEM; | ||
748 | newf = dup_fd(oldf, &error); | 754 | newf = dup_fd(oldf, &error); |
749 | if (!newf) | 755 | if (!newf) |
750 | goto out; | 756 | goto out; |
@@ -814,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
814 | if (clone_flags & CLONE_THREAD) { | 820 | if (clone_flags & CLONE_THREAD) { |
815 | atomic_inc(¤t->signal->count); | 821 | atomic_inc(¤t->signal->count); |
816 | atomic_inc(¤t->signal->live); | 822 | atomic_inc(¤t->signal->live); |
823 | taskstats_tgid_alloc(current->signal); | ||
817 | return 0; | 824 | return 0; |
818 | } | 825 | } |
819 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); | 826 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); |
@@ -858,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
858 | INIT_LIST_HEAD(&sig->cpu_timers[0]); | 865 | INIT_LIST_HEAD(&sig->cpu_timers[0]); |
859 | INIT_LIST_HEAD(&sig->cpu_timers[1]); | 866 | INIT_LIST_HEAD(&sig->cpu_timers[1]); |
860 | INIT_LIST_HEAD(&sig->cpu_timers[2]); | 867 | INIT_LIST_HEAD(&sig->cpu_timers[2]); |
868 | taskstats_tgid_init(sig); | ||
861 | 869 | ||
862 | task_lock(current->group_leader); | 870 | task_lock(current->group_leader); |
863 | memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); | 871 | memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); |
@@ -871,6 +879,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
871 | tsk->it_prof_expires = | 879 | tsk->it_prof_expires = |
872 | secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | 880 | secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); |
873 | } | 881 | } |
882 | acct_init_pacct(&sig->pacct); | ||
874 | 883 | ||
875 | return 0; | 884 | return 0; |
876 | } | 885 | } |
@@ -878,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
878 | void __cleanup_signal(struct signal_struct *sig) | 887 | void __cleanup_signal(struct signal_struct *sig) |
879 | { | 888 | { |
880 | exit_thread_group_keys(sig); | 889 | exit_thread_group_keys(sig); |
890 | taskstats_tgid_free(sig); | ||
881 | kmem_cache_free(signal_cachep, sig); | 891 | kmem_cache_free(signal_cachep, sig); |
882 | } | 892 | } |
883 | 893 | ||
@@ -909,6 +919,15 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) | |||
909 | return current->pid; | 919 | return current->pid; |
910 | } | 920 | } |
911 | 921 | ||
922 | static inline void rt_mutex_init_task(struct task_struct *p) | ||
923 | { | ||
924 | #ifdef CONFIG_RT_MUTEXES | ||
925 | spin_lock_init(&p->pi_lock); | ||
926 | plist_head_init(&p->pi_waiters, &p->pi_lock); | ||
927 | p->pi_blocked_on = NULL; | ||
928 | #endif | ||
929 | } | ||
930 | |||
912 | /* | 931 | /* |
913 | * This creates a new process as a copy of the old one, | 932 | * This creates a new process as a copy of the old one, |
914 | * but does not actually start it yet. | 933 | * but does not actually start it yet. |
@@ -917,13 +936,13 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) | |||
917 | * parts of the process environment (as per the clone | 936 | * parts of the process environment (as per the clone |
918 | * flags). The actual kick-off is left to the caller. | 937 | * flags). The actual kick-off is left to the caller. |
919 | */ | 938 | */ |
920 | static task_t *copy_process(unsigned long clone_flags, | 939 | static struct task_struct *copy_process(unsigned long clone_flags, |
921 | unsigned long stack_start, | 940 | unsigned long stack_start, |
922 | struct pt_regs *regs, | 941 | struct pt_regs *regs, |
923 | unsigned long stack_size, | 942 | unsigned long stack_size, |
924 | int __user *parent_tidptr, | 943 | int __user *parent_tidptr, |
925 | int __user *child_tidptr, | 944 | int __user *child_tidptr, |
926 | int pid) | 945 | int pid) |
927 | { | 946 | { |
928 | int retval; | 947 | int retval; |
929 | struct task_struct *p = NULL; | 948 | struct task_struct *p = NULL; |
@@ -955,6 +974,10 @@ static task_t *copy_process(unsigned long clone_flags, | |||
955 | if (!p) | 974 | if (!p) |
956 | goto fork_out; | 975 | goto fork_out; |
957 | 976 | ||
977 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
978 | DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); | ||
979 | DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); | ||
980 | #endif | ||
958 | retval = -EAGAIN; | 981 | retval = -EAGAIN; |
959 | if (atomic_read(&p->user->processes) >= | 982 | if (atomic_read(&p->user->processes) >= |
960 | p->signal->rlim[RLIMIT_NPROC].rlim_cur) { | 983 | p->signal->rlim[RLIMIT_NPROC].rlim_cur) { |
@@ -982,6 +1005,7 @@ static task_t *copy_process(unsigned long clone_flags, | |||
982 | goto bad_fork_cleanup_put_domain; | 1005 | goto bad_fork_cleanup_put_domain; |
983 | 1006 | ||
984 | p->did_exec = 0; | 1007 | p->did_exec = 0; |
1008 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ | ||
985 | copy_flags(clone_flags, p); | 1009 | copy_flags(clone_flags, p); |
986 | p->pid = pid; | 1010 | p->pid = pid; |
987 | retval = -EFAULT; | 1011 | retval = -EFAULT; |
@@ -989,13 +1013,10 @@ static task_t *copy_process(unsigned long clone_flags, | |||
989 | if (put_user(p->pid, parent_tidptr)) | 1013 | if (put_user(p->pid, parent_tidptr)) |
990 | goto bad_fork_cleanup; | 1014 | goto bad_fork_cleanup; |
991 | 1015 | ||
992 | p->proc_dentry = NULL; | ||
993 | |||
994 | INIT_LIST_HEAD(&p->children); | 1016 | INIT_LIST_HEAD(&p->children); |
995 | INIT_LIST_HEAD(&p->sibling); | 1017 | INIT_LIST_HEAD(&p->sibling); |
996 | p->vfork_done = NULL; | 1018 | p->vfork_done = NULL; |
997 | spin_lock_init(&p->alloc_lock); | 1019 | spin_lock_init(&p->alloc_lock); |
998 | spin_lock_init(&p->proc_lock); | ||
999 | 1020 | ||
1000 | clear_tsk_thread_flag(p, TIF_SIGPENDING); | 1021 | clear_tsk_thread_flag(p, TIF_SIGPENDING); |
1001 | init_sigpending(&p->pending); | 1022 | init_sigpending(&p->pending); |
@@ -1032,6 +1053,28 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1032 | } | 1053 | } |
1033 | mpol_fix_fork_child_flag(p); | 1054 | mpol_fix_fork_child_flag(p); |
1034 | #endif | 1055 | #endif |
1056 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1057 | p->irq_events = 0; | ||
1058 | p->hardirqs_enabled = 0; | ||
1059 | p->hardirq_enable_ip = 0; | ||
1060 | p->hardirq_enable_event = 0; | ||
1061 | p->hardirq_disable_ip = _THIS_IP_; | ||
1062 | p->hardirq_disable_event = 0; | ||
1063 | p->softirqs_enabled = 1; | ||
1064 | p->softirq_enable_ip = _THIS_IP_; | ||
1065 | p->softirq_enable_event = 0; | ||
1066 | p->softirq_disable_ip = 0; | ||
1067 | p->softirq_disable_event = 0; | ||
1068 | p->hardirq_context = 0; | ||
1069 | p->softirq_context = 0; | ||
1070 | #endif | ||
1071 | #ifdef CONFIG_LOCKDEP | ||
1072 | p->lockdep_depth = 0; /* no locks held yet */ | ||
1073 | p->curr_chain_key = 0; | ||
1074 | p->lockdep_recursion = 0; | ||
1075 | #endif | ||
1076 | |||
1077 | rt_mutex_init_task(p); | ||
1035 | 1078 | ||
1036 | #ifdef CONFIG_DEBUG_MUTEXES | 1079 | #ifdef CONFIG_DEBUG_MUTEXES |
1037 | p->blocked_on = NULL; /* not blocked yet */ | 1080 | p->blocked_on = NULL; /* not blocked yet */ |
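
rt_mutex_init_task(), called just above and defined earlier in this file, prepares the task's priority-inheritance bookkeeping: pi_waiters is a priority-sorted list (a plist guarded by pi_lock) holding the top waiter of each rt_mutex the task owns, so the PI code can find the most urgent waiter without scanning. A rough analogue in ordinary C of why the list is kept sorted (this is not the kernel plist API):

/*
 * Rough analogue only.  Keeping waiters sorted by priority makes
 * "what should the lock owner be boosted to?" an O(1) look at the
 * head of the list.
 */
struct pi_waiter {
	int			prio;	/* lower value = more urgent */
	struct pi_waiter	*next;
};

static void pi_waiter_add(struct pi_waiter **head, struct pi_waiter *w)
{
	while (*head && (*head)->prio <= w->prio)
		head = &(*head)->next;
	w->next = *head;
	*head = w;
}

/* Effective priority of a lock owner: its own, unless the top waiter
 * is more urgent.  This is the boost an rt_mutex applies. */
static int boosted_prio(int normal_prio, const struct pi_waiter *top)
{
	return (top && top->prio < normal_prio) ? top->prio : normal_prio;
}
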
@@ -1075,6 +1118,9 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1075 | #ifdef CONFIG_COMPAT | 1118 | #ifdef CONFIG_COMPAT |
1076 | p->compat_robust_list = NULL; | 1119 | p->compat_robust_list = NULL; |
1077 | #endif | 1120 | #endif |
1121 | INIT_LIST_HEAD(&p->pi_state_list); | ||
1122 | p->pi_state_cache = NULL; | ||
1123 | |||
1078 | /* | 1124 | /* |
1079 | * sigaltstack should be cleared when sharing the same VM | 1125 | * sigaltstack should be cleared when sharing the same VM |
1080 | */ | 1126 | */ |
@@ -1155,18 +1201,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1155 | } | 1201 | } |
1156 | 1202 | ||
1157 | if (clone_flags & CLONE_THREAD) { | 1203 | if (clone_flags & CLONE_THREAD) { |
1158 | /* | ||
1159 | * Important: if an exit-all has been started then | ||
1160 | * do not create this new thread - the whole thread | ||
1161 | * group is supposed to exit anyway. | ||
1162 | */ | ||
1163 | if (current->signal->flags & SIGNAL_GROUP_EXIT) { | ||
1164 | spin_unlock(¤t->sighand->siglock); | ||
1165 | write_unlock_irq(&tasklist_lock); | ||
1166 | retval = -EAGAIN; | ||
1167 | goto bad_fork_cleanup_namespace; | ||
1168 | } | ||
1169 | |||
1170 | p->group_leader = current->group_leader; | 1204 | p->group_leader = current->group_leader; |
1171 | list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); | 1205 | list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); |
1172 | 1206 | ||
@@ -1264,9 +1298,9 @@ struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | |||
1264 | return regs; | 1298 | return regs; |
1265 | } | 1299 | } |
1266 | 1300 | ||
1267 | task_t * __devinit fork_idle(int cpu) | 1301 | struct task_struct * __devinit fork_idle(int cpu) |
1268 | { | 1302 | { |
1269 | task_t *task; | 1303 | struct task_struct *task; |
1270 | struct pt_regs regs; | 1304 | struct pt_regs regs; |
1271 | 1305 | ||
1272 | task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); | 1306 | task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); |
diff --git a/kernel/futex.c b/kernel/futex.c index 5699c512057b..cf0c8e21d1ab 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -12,6 +12,10 @@ | |||
12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved | 12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved |
13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. | 13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. |
14 | * | 14 | * |
15 | * PI-futex support started by Ingo Molnar and Thomas Gleixner | ||
16 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
17 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
18 | * | ||
15 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 19 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
16 | * enough at me, Linus for the original (flawed) idea, Matthew | 20 | * enough at me, Linus for the original (flawed) idea, Matthew |
17 | * Kirkwood for proof-of-concept implementation. | 21 | * Kirkwood for proof-of-concept implementation. |
@@ -46,6 +50,8 @@ | |||
46 | #include <linux/signal.h> | 50 | #include <linux/signal.h> |
47 | #include <asm/futex.h> | 51 | #include <asm/futex.h> |
48 | 52 | ||
53 | #include "rtmutex_common.h" | ||
54 | |||
49 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 55 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
50 | 56 | ||
51 | /* | 57 | /* |
@@ -63,7 +69,7 @@ union futex_key { | |||
63 | int offset; | 69 | int offset; |
64 | } shared; | 70 | } shared; |
65 | struct { | 71 | struct { |
66 | unsigned long uaddr; | 72 | unsigned long address; |
67 | struct mm_struct *mm; | 73 | struct mm_struct *mm; |
68 | int offset; | 74 | int offset; |
69 | } private; | 75 | } private; |
@@ -75,6 +81,27 @@ union futex_key { | |||
75 | }; | 81 | }; |
76 | 82 | ||
77 | /* | 83 | /* |
84 | * Priority Inheritance state: | ||
85 | */ | ||
86 | struct futex_pi_state { | ||
87 | /* | ||
88 | * list of 'owned' pi_state instances - these have to be | ||
89 | * cleaned up in do_exit() if the task exits prematurely: | ||
90 | */ | ||
91 | struct list_head list; | ||
92 | |||
93 | /* | ||
94 | * The PI object: | ||
95 | */ | ||
96 | struct rt_mutex pi_mutex; | ||
97 | |||
98 | struct task_struct *owner; | ||
99 | atomic_t refcount; | ||
100 | |||
101 | union futex_key key; | ||
102 | }; | ||
103 | |||
104 | /* | ||
78 | * We use this hashed waitqueue instead of a normal wait_queue_t, so | 105 | * We use this hashed waitqueue instead of a normal wait_queue_t, so |
79 | * we can wake only the relevant ones (hashed queues may be shared). | 106 | * we can wake only the relevant ones (hashed queues may be shared). |
80 | * | 107 | * |
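
struct futex_pi_state ties a contended PI futex to an in-kernel rt_mutex and to the owning task, so the owner can be boosted and, via the list anchored in the task, cleaned up at exit. For context, the userspace side of the protocol this serves keeps the owner's TID in the futex word and only enters the kernel on contention. A hedged sketch follows; it assumes a linux/futex.h new enough to define FUTEX_LOCK_PI, FUTEX_UNLOCK_PI and FUTEX_WAITERS, uses raw syscalls, and omits error handling and retries.

/* Hedged sketch of the userspace fast paths behind FUTEX_LOCK_PI and
 * FUTEX_UNLOCK_PI.  The futex word is 0 when free, otherwise the owner
 * TID (possibly with FUTEX_WAITERS set by the kernel). */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

static long sys_futex(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void pi_lock(uint32_t *futex)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* Uncontended: 0 -> TID with one cmpxchg, no syscall. */
	if (__sync_bool_compare_and_swap(futex, 0, tid))
		return;
	/* Contended: the kernel queues us, boosts the owner, and makes
	 * us the owner once the lock is released. */
	sys_futex(futex, FUTEX_LOCK_PI, 0);
}

static void pi_unlock(uint32_t *futex)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* No waiters: TID -> 0, no syscall. */
	if (__sync_bool_compare_and_swap(futex, tid, 0))
		return;
	/* FUTEX_WAITERS is set: let the kernel hand the lock over
	 * (this is what wake_futex_pi() below implements). */
	sys_futex(futex, FUTEX_UNLOCK_PI, 0);
}
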
@@ -87,15 +114,19 @@ struct futex_q { | |||
87 | struct list_head list; | 114 | struct list_head list; |
88 | wait_queue_head_t waiters; | 115 | wait_queue_head_t waiters; |
89 | 116 | ||
90 | /* Which hash list lock to use. */ | 117 | /* Which hash list lock to use: */ |
91 | spinlock_t *lock_ptr; | 118 | spinlock_t *lock_ptr; |
92 | 119 | ||
93 | /* Key which the futex is hashed on. */ | 120 | /* Key which the futex is hashed on: */ |
94 | union futex_key key; | 121 | union futex_key key; |
95 | 122 | ||
96 | /* For fd, sigio sent using these. */ | 123 | /* For fd, sigio sent using these: */ |
97 | int fd; | 124 | int fd; |
98 | struct file *filp; | 125 | struct file *filp; |
126 | |||
127 | /* Optional priority inheritance state: */ | ||
128 | struct futex_pi_state *pi_state; | ||
129 | struct task_struct *task; | ||
99 | }; | 130 | }; |
100 | 131 | ||
101 | /* | 132 | /* |
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
144 | * | 175 | * |
145 | * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. | 176 | * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks. |
146 | */ | 177 | */ |
147 | static int get_futex_key(unsigned long uaddr, union futex_key *key) | 178 | static int get_futex_key(u32 __user *uaddr, union futex_key *key) |
148 | { | 179 | { |
180 | unsigned long address = (unsigned long)uaddr; | ||
149 | struct mm_struct *mm = current->mm; | 181 | struct mm_struct *mm = current->mm; |
150 | struct vm_area_struct *vma; | 182 | struct vm_area_struct *vma; |
151 | struct page *page; | 183 | struct page *page; |
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
154 | /* | 186 | /* |
155 | * The futex address must be "naturally" aligned. | 187 | * The futex address must be "naturally" aligned. |
156 | */ | 188 | */ |
157 | key->both.offset = uaddr % PAGE_SIZE; | 189 | key->both.offset = address % PAGE_SIZE; |
158 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) | 190 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) |
159 | return -EINVAL; | 191 | return -EINVAL; |
160 | uaddr -= key->both.offset; | 192 | address -= key->both.offset; |
161 | 193 | ||
162 | /* | 194 | /* |
163 | * The futex is hashed differently depending on whether | 195 | * The futex is hashed differently depending on whether |
164 | * it's in a shared or private mapping. So check vma first. | 196 | * it's in a shared or private mapping. So check vma first. |
165 | */ | 197 | */ |
166 | vma = find_extend_vma(mm, uaddr); | 198 | vma = find_extend_vma(mm, address); |
167 | if (unlikely(!vma)) | 199 | if (unlikely(!vma)) |
168 | return -EFAULT; | 200 | return -EFAULT; |
169 | 201 | ||
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
184 | */ | 216 | */ |
185 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { | 217 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { |
186 | key->private.mm = mm; | 218 | key->private.mm = mm; |
187 | key->private.uaddr = uaddr; | 219 | key->private.address = address; |
188 | return 0; | 220 | return 0; |
189 | } | 221 | } |
190 | 222 | ||
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
194 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | 226 | key->shared.inode = vma->vm_file->f_dentry->d_inode; |
195 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ |
196 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { |
197 | key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) | 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) |
198 | + vma->vm_pgoff); | 230 | + vma->vm_pgoff); |
199 | return 0; | 231 | return 0; |
200 | } | 232 | } |
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
205 | * from swap. But that's a lot of code to duplicate here | 237 | * from swap. But that's a lot of code to duplicate here |
206 | * for a rare case, so we simply fetch the page. | 238 | * for a rare case, so we simply fetch the page. |
207 | */ | 239 | */ |
208 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); | 240 | err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); |
209 | if (err >= 0) { | 241 | if (err >= 0) { |
210 | key->shared.pgoff = | 242 | key->shared.pgoff = |
211 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 243 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
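
get_futex_key() is now typed on the user pointer (u32 __user *), but the keying logic is unchanged: a private mapping is keyed by (mm, virtual address), a shared one by (inode, page offset), which is what lets two processes with different virtual addresses wait and wake on the same futex word. A small, hedged demonstration with the plain FUTEX_WAIT/FUTEX_WAKE operations and a MAP_SHARED word across fork(); error handling is trimmed for brevity.

/* Minimal demo: because the word lives in a MAP_SHARED mapping, parent
 * and child hash to the same futex key even though each has its own mm. */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

static long sys_futex(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

int main(void)
{
	uint32_t *word = mmap(NULL, sizeof(*word), PROT_READ | PROT_WRITE,
			      MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	*word = 0;
	if (fork() == 0) {
		while (*word == 0)		/* sleep until the word changes */
			sys_futex(word, FUTEX_WAIT, 0);
		_exit(0);
	}

	sleep(1);
	*word = 1;				/* publish the new value ...   */
	sys_futex(word, FUTEX_WAKE, 1);		/* ... and wake one waiter     */
	wait(NULL);
	printf("child woken via shared futex\n");
	return 0;
}
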
@@ -246,18 +278,250 @@ static void drop_key_refs(union futex_key *key) | |||
246 | } | 278 | } |
247 | } | 279 | } |
248 | 280 | ||
249 | static inline int get_futex_value_locked(int *dest, int __user *from) | 281 | static inline int get_futex_value_locked(u32 *dest, u32 __user *from) |
250 | { | 282 | { |
251 | int ret; | 283 | int ret; |
252 | 284 | ||
253 | inc_preempt_count(); | 285 | inc_preempt_count(); |
254 | ret = __copy_from_user_inatomic(dest, from, sizeof(int)); | 286 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); |
255 | dec_preempt_count(); | 287 | dec_preempt_count(); |
256 | 288 | ||
257 | return ret ? -EFAULT : 0; | 289 | return ret ? -EFAULT : 0; |
258 | } | 290 | } |
259 | 291 | ||
260 | /* | 292 | /* |
293 | * Fault handling. Called with current->mm->mmap_sem held. | ||
294 | */ | ||
295 | static int futex_handle_fault(unsigned long address, int attempt) | ||
296 | { | ||
297 | struct vm_area_struct * vma; | ||
298 | struct mm_struct *mm = current->mm; | ||
299 | |||
300 | if (attempt >= 2 || !(vma = find_vma(mm, address)) || | ||
301 | vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) | ||
302 | return -EFAULT; | ||
303 | |||
304 | switch (handle_mm_fault(mm, vma, address, 1)) { | ||
305 | case VM_FAULT_MINOR: | ||
306 | current->min_flt++; | ||
307 | break; | ||
308 | case VM_FAULT_MAJOR: | ||
309 | current->maj_flt++; | ||
310 | break; | ||
311 | default: | ||
312 | return -EFAULT; | ||
313 | } | ||
314 | return 0; | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * PI code: | ||
319 | */ | ||
320 | static int refill_pi_state_cache(void) | ||
321 | { | ||
322 | struct futex_pi_state *pi_state; | ||
323 | |||
324 | if (likely(current->pi_state_cache)) | ||
325 | return 0; | ||
326 | |||
327 | pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); | ||
328 | |||
329 | if (!pi_state) | ||
330 | return -ENOMEM; | ||
331 | |||
332 | memset(pi_state, 0, sizeof(*pi_state)); | ||
333 | INIT_LIST_HEAD(&pi_state->list); | ||
334 | /* pi_mutex gets initialized later */ | ||
335 | pi_state->owner = NULL; | ||
336 | atomic_set(&pi_state->refcount, 1); | ||
337 | |||
338 | current->pi_state_cache = pi_state; | ||
339 | |||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | static struct futex_pi_state * alloc_pi_state(void) | ||
344 | { | ||
345 | struct futex_pi_state *pi_state = current->pi_state_cache; | ||
346 | |||
347 | WARN_ON(!pi_state); | ||
348 | current->pi_state_cache = NULL; | ||
349 | |||
350 | return pi_state; | ||
351 | } | ||
352 | |||
353 | static void free_pi_state(struct futex_pi_state *pi_state) | ||
354 | { | ||
355 | if (!atomic_dec_and_test(&pi_state->refcount)) | ||
356 | return; | ||
357 | |||
358 | /* | ||
359 | * If pi_state->owner is NULL, the owner is most probably dying | ||
360 | * and has cleaned up the pi_state already | ||
361 | */ | ||
362 | if (pi_state->owner) { | ||
363 | spin_lock_irq(&pi_state->owner->pi_lock); | ||
364 | list_del_init(&pi_state->list); | ||
365 | spin_unlock_irq(&pi_state->owner->pi_lock); | ||
366 | |||
367 | rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); | ||
368 | } | ||
369 | |||
370 | if (current->pi_state_cache) | ||
371 | kfree(pi_state); | ||
372 | else { | ||
373 | /* | ||
374 | * pi_state->list is already empty. | ||
375 | * clear pi_state->owner. | ||
376 | * refcount is at 0 - put it back to 1. | ||
377 | */ | ||
378 | pi_state->owner = NULL; | ||
379 | atomic_set(&pi_state->refcount, 1); | ||
380 | current->pi_state_cache = pi_state; | ||
381 | } | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * Look up the task based on what TID userspace gave us. | ||
386 | * We don't trust it. | ||
387 | */ | ||
388 | static struct task_struct * futex_find_get_task(pid_t pid) | ||
389 | { | ||
390 | struct task_struct *p; | ||
391 | |||
392 | read_lock(&tasklist_lock); | ||
393 | p = find_task_by_pid(pid); | ||
394 | if (!p) | ||
395 | goto out_unlock; | ||
396 | if ((current->euid != p->euid) && (current->euid != p->uid)) { | ||
397 | p = NULL; | ||
398 | goto out_unlock; | ||
399 | } | ||
400 | if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { | ||
401 | p = NULL; | ||
402 | goto out_unlock; | ||
403 | } | ||
404 | get_task_struct(p); | ||
405 | out_unlock: | ||
406 | read_unlock(&tasklist_lock); | ||
407 | |||
408 | return p; | ||
409 | } | ||
410 | |||
411 | /* | ||
412 | * This task is holding PI mutexes at exit time => bad. | ||
413 | * Kernel cleans up PI-state, but userspace is likely hosed. | ||
414 | * (Robust-futex cleanup is separate and might save the day for userspace.) | ||
415 | */ | ||
416 | void exit_pi_state_list(struct task_struct *curr) | ||
417 | { | ||
418 | struct futex_hash_bucket *hb; | ||
419 | struct list_head *next, *head = &curr->pi_state_list; | ||
420 | struct futex_pi_state *pi_state; | ||
421 | union futex_key key; | ||
422 | |||
423 | /* | ||
424 | * We are a ZOMBIE and nobody can enqueue itself on | ||
425 | * pi_state_list anymore, but we have to be careful | ||
426 | * versus waiters unqueueing themselves. | ||
427 | */ | ||
428 | spin_lock_irq(&curr->pi_lock); | ||
429 | while (!list_empty(head)) { | ||
430 | |||
431 | next = head->next; | ||
432 | pi_state = list_entry(next, struct futex_pi_state, list); | ||
433 | key = pi_state->key; | ||
434 | spin_unlock_irq(&curr->pi_lock); | ||
435 | |||
436 | hb = hash_futex(&key); | ||
437 | spin_lock(&hb->lock); | ||
438 | |||
439 | spin_lock_irq(&curr->pi_lock); | ||
440 | if (head->next != next) { | ||
441 | spin_unlock(&hb->lock); | ||
442 | continue; | ||
443 | } | ||
444 | |||
445 | list_del_init(&pi_state->list); | ||
446 | |||
447 | WARN_ON(pi_state->owner != curr); | ||
448 | |||
449 | pi_state->owner = NULL; | ||
450 | spin_unlock_irq(&curr->pi_lock); | ||
451 | |||
452 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
453 | |||
454 | spin_unlock(&hb->lock); | ||
455 | |||
456 | spin_lock_irq(&curr->pi_lock); | ||
457 | } | ||
458 | spin_unlock_irq(&curr->pi_lock); | ||
459 | } | ||
460 | |||
461 | static int | ||
462 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) | ||
463 | { | ||
464 | struct futex_pi_state *pi_state = NULL; | ||
465 | struct futex_q *this, *next; | ||
466 | struct list_head *head; | ||
467 | struct task_struct *p; | ||
468 | pid_t pid; | ||
469 | |||
470 | head = &hb->chain; | ||
471 | |||
472 | list_for_each_entry_safe(this, next, head, list) { | ||
473 | if (match_futex (&this->key, &me->key)) { | ||
474 | /* | ||
475 | * Another waiter already exists - bump up | ||
476 | * the refcount and return its pi_state: | ||
477 | */ | ||
478 | pi_state = this->pi_state; | ||
479 | /* | ||
480 | * Userspace might have messed up non-PI and PI futexes. | ||
481 | */ | ||
482 | if (unlikely(!pi_state)) | ||
483 | return -EINVAL; | ||
484 | |||
485 | atomic_inc(&pi_state->refcount); | ||
486 | me->pi_state = pi_state; | ||
487 | |||
488 | return 0; | ||
489 | } | ||
490 | } | ||
491 | |||
492 | /* | ||
493 | * We are the first waiter - try to look up the real owner and | ||
494 | * attach the new pi_state to it: | ||
495 | */ | ||
496 | pid = uval & FUTEX_TID_MASK; | ||
497 | p = futex_find_get_task(pid); | ||
498 | if (!p) | ||
499 | return -ESRCH; | ||
500 | |||
501 | pi_state = alloc_pi_state(); | ||
502 | |||
503 | /* | ||
504 | * Initialize the pi_mutex in locked state and make 'p' | ||
505 | * the owner of it: | ||
506 | */ | ||
507 | rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); | ||
508 | |||
509 | /* Store the key for possible exit cleanups: */ | ||
510 | pi_state->key = me->key; | ||
511 | |||
512 | spin_lock_irq(&p->pi_lock); | ||
513 | list_add(&pi_state->list, &p->pi_state_list); | ||
514 | pi_state->owner = p; | ||
515 | spin_unlock_irq(&p->pi_lock); | ||
516 | |||
517 | put_task_struct(p); | ||
518 | |||
519 | me->pi_state = pi_state; | ||
520 | |||
521 | return 0; | ||
522 | } | ||
523 | |||
524 | /* | ||
261 | * The hash bucket lock must be held when this is called. | 525 | * The hash bucket lock must be held when this is called. |
262 | * Afterwards, the futex_q must not be accessed. | 526 | * Afterwards, the futex_q must not be accessed. |
263 | */ | 527 | */ |
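
One detail worth noting in the block above: refill_pi_state_cache() allocates with GFP_KERNEL, so it has to run before the hash-bucket spinlock is taken; alloc_pi_state() then only hands out the cached object, and free_pi_state() parks an unused object back in the cache. The same "allocate before you lock, cache it if unused" pattern in ordinary C (an analogue of my own, not kernel code):

/* Userspace analogue of the pi_state_cache trick: do the allocation
 * (which may sleep or fail) before taking the lock; if the object turns
 * out not to be needed, keep it cached for the next attempt. */
#include <pthread.h>
#include <stdlib.h>

struct state { int users; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct state *shared_state;		/* like the first waiter's pi_state */
static __thread struct state *state_cache;	/* like current->pi_state_cache     */

static int refill_state_cache(void)
{
	if (state_cache)
		return 0;
	state_cache = calloc(1, sizeof(*state_cache));
	return state_cache ? 0 : -1;
}

static int attach_state(void)
{
	if (refill_state_cache())		/* done outside the lock */
		return -1;

	pthread_mutex_lock(&lock);
	if (!shared_state) {			/* first user: consume the cache */
		shared_state = state_cache;
		state_cache = NULL;
	}
	shared_state->users++;			/* later users just take a reference */
	pthread_mutex_unlock(&lock);
	return 0;
}
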
@@ -284,16 +548,96 @@ static void wake_futex(struct futex_q *q) | |||
284 | q->lock_ptr = NULL; | 548 | q->lock_ptr = NULL; |
285 | } | 549 | } |
286 | 550 | ||
551 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | ||
552 | { | ||
553 | struct task_struct *new_owner; | ||
554 | struct futex_pi_state *pi_state = this->pi_state; | ||
555 | u32 curval, newval; | ||
556 | |||
557 | if (!pi_state) | ||
558 | return -EINVAL; | ||
559 | |||
560 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | ||
561 | |||
562 | /* | ||
563 | * This happens when we have stolen the lock and the original | ||
564 | * pending owner did not enqueue itself back on the rt_mutex. | ||
565 | * That's not a tragedy. It just means a lock waiter is | ||
566 | * still in flight. We make the futex_q waiter the pending owner. | ||
567 | */ | ||
568 | if (!new_owner) | ||
569 | new_owner = this->task; | ||
570 | |||
571 | /* | ||
572 | * We pass it to the next owner. (The WAITERS bit is always | ||
573 | * kept enabled while there is PI state around. We must also | ||
574 | * preserve the owner died bit.) | ||
575 | */ | ||
576 | newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; | ||
577 | |||
578 | inc_preempt_count(); | ||
579 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
580 | dec_preempt_count(); | ||
581 | |||
582 | if (curval == -EFAULT) | ||
583 | return -EFAULT; | ||
584 | if (curval != uval) | ||
585 | return -EINVAL; | ||
586 | |||
587 | list_del_init(&pi_state->owner->pi_state_list); | ||
588 | list_add(&pi_state->list, &new_owner->pi_state_list); | ||
589 | pi_state->owner = new_owner; | ||
590 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
591 | |||
592 | return 0; | ||
593 | } | ||
594 | |||
595 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | ||
596 | { | ||
597 | u32 oldval; | ||
598 | |||
599 | /* | ||
600 | * There is no waiter, so we unlock the futex. The owner died | ||
601 | * bit need not be preserved here. We are the owner: | ||
602 | */ | ||
603 | inc_preempt_count(); | ||
604 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); | ||
605 | dec_preempt_count(); | ||
606 | |||
607 | if (oldval == -EFAULT) | ||
608 | return oldval; | ||
609 | if (oldval != uval) | ||
610 | return -EAGAIN; | ||
611 | |||
612 | return 0; | ||
613 | } | ||
614 | |||
615 | /* | ||
616 | * Express the locking dependencies for lockdep: | ||
617 | */ | ||
618 | static inline void | ||
619 | double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) | ||
620 | { | ||
621 | if (hb1 <= hb2) { | ||
622 | spin_lock(&hb1->lock); | ||
623 | if (hb1 < hb2) | ||
624 | spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); | ||
625 | } else { /* hb1 > hb2 */ | ||
626 | spin_lock(&hb2->lock); | ||
627 | spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); | ||
628 | } | ||
629 | } | ||
630 | |||
287 | /* | 631 | /* |
288 | * Wake up all waiters hashed on the physical page that is mapped | 632 | * Wake up all waiters hashed on the physical page that is mapped |
289 | * to this virtual address: | 633 | * to this virtual address: |
290 | */ | 634 | */ |
291 | static int futex_wake(unsigned long uaddr, int nr_wake) | 635 | static int futex_wake(u32 __user *uaddr, int nr_wake) |
292 | { | 636 | { |
293 | union futex_key key; | 637 | struct futex_hash_bucket *hb; |
294 | struct futex_hash_bucket *bh; | ||
295 | struct list_head *head; | ||
296 | struct futex_q *this, *next; | 638 | struct futex_q *this, *next; |
639 | struct list_head *head; | ||
640 | union futex_key key; | ||
297 | int ret; | 641 | int ret; |
298 | 642 | ||
299 | down_read(¤t->mm->mmap_sem); | 643 | down_read(¤t->mm->mmap_sem); |
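
double_lock_hb() above replaces the open-coded bucket-lock ordering that futex_wake_op() and futex_requeue() used to carry: the lower-addressed bucket is always locked first, so two CPUs operating on the same pair of buckets in opposite directions cannot deadlock, and the second lock is taken with spin_lock_nested() so lockdep accepts two locks of the same class. The same discipline in a runnable userspace form (a pthread analogue, not kernel code):

/* Address-ordered locking of two same-kind locks, as double_lock_hb()
 * does for the two futex hash buckets. */
#include <pthread.h>
#include <stdint.h>

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {				/* same bucket: lock it once */
		pthread_mutex_lock(a);
		return;
	}
	if ((uintptr_t)a > (uintptr_t)b) {	/* canonical order: lower address first */
		pthread_mutex_t *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}
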
@@ -302,19 +646,23 @@ static int futex_wake(unsigned long uaddr, int nr_wake) | |||
302 | if (unlikely(ret != 0)) | 646 | if (unlikely(ret != 0)) |
303 | goto out; | 647 | goto out; |
304 | 648 | ||
305 | bh = hash_futex(&key); | 649 | hb = hash_futex(&key); |
306 | spin_lock(&bh->lock); | 650 | spin_lock(&hb->lock); |
307 | head = &bh->chain; | 651 | head = &hb->chain; |
308 | 652 | ||
309 | list_for_each_entry_safe(this, next, head, list) { | 653 | list_for_each_entry_safe(this, next, head, list) { |
310 | if (match_futex (&this->key, &key)) { | 654 | if (match_futex (&this->key, &key)) { |
655 | if (this->pi_state) { | ||
656 | ret = -EINVAL; | ||
657 | break; | ||
658 | } | ||
311 | wake_futex(this); | 659 | wake_futex(this); |
312 | if (++ret >= nr_wake) | 660 | if (++ret >= nr_wake) |
313 | break; | 661 | break; |
314 | } | 662 | } |
315 | } | 663 | } |
316 | 664 | ||
317 | spin_unlock(&bh->lock); | 665 | spin_unlock(&hb->lock); |
318 | out: | 666 | out: |
319 | up_read(¤t->mm->mmap_sem); | 667 | up_read(¤t->mm->mmap_sem); |
320 | return ret; | 668 | return ret; |
@@ -324,10 +672,12 @@ out: | |||
324 | * Wake up all waiters hashed on the physical page that is mapped | 672 | * Wake up all waiters hashed on the physical page that is mapped |
325 | * to this virtual address: | 673 | * to this virtual address: |
326 | */ | 674 | */ |
327 | static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) | 675 | static int |
676 | futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, | ||
677 | int nr_wake, int nr_wake2, int op) | ||
328 | { | 678 | { |
329 | union futex_key key1, key2; | 679 | union futex_key key1, key2; |
330 | struct futex_hash_bucket *bh1, *bh2; | 680 | struct futex_hash_bucket *hb1, *hb2; |
331 | struct list_head *head; | 681 | struct list_head *head; |
332 | struct futex_q *this, *next; | 682 | struct futex_q *this, *next; |
333 | int ret, op_ret, attempt = 0; | 683 | int ret, op_ret, attempt = 0; |
@@ -342,27 +692,25 @@ retryfull: | |||
342 | if (unlikely(ret != 0)) | 692 | if (unlikely(ret != 0)) |
343 | goto out; | 693 | goto out; |
344 | 694 | ||
345 | bh1 = hash_futex(&key1); | 695 | hb1 = hash_futex(&key1); |
346 | bh2 = hash_futex(&key2); | 696 | hb2 = hash_futex(&key2); |
347 | 697 | ||
348 | retry: | 698 | retry: |
349 | if (bh1 < bh2) | 699 | double_lock_hb(hb1, hb2); |
350 | spin_lock(&bh1->lock); | ||
351 | spin_lock(&bh2->lock); | ||
352 | if (bh1 > bh2) | ||
353 | spin_lock(&bh1->lock); | ||
354 | 700 | ||
355 | op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); | 701 | op_ret = futex_atomic_op_inuser(op, uaddr2); |
356 | if (unlikely(op_ret < 0)) { | 702 | if (unlikely(op_ret < 0)) { |
357 | int dummy; | 703 | u32 dummy; |
358 | 704 | ||
359 | spin_unlock(&bh1->lock); | 705 | spin_unlock(&hb1->lock); |
360 | if (bh1 != bh2) | 706 | if (hb1 != hb2) |
361 | spin_unlock(&bh2->lock); | 707 | spin_unlock(&hb2->lock); |
362 | 708 | ||
363 | #ifndef CONFIG_MMU | 709 | #ifndef CONFIG_MMU |
364 | /* we don't get EFAULT from MMU faults if we don't have an MMU, | 710 | /* |
365 | * but we might get them from range checking */ | 711 | * we don't get EFAULT from MMU faults if we don't have an MMU, |
712 | * but we might get them from range checking | ||
713 | */ | ||
366 | ret = op_ret; | 714 | ret = op_ret; |
367 | goto out; | 715 | goto out; |
368 | #endif | 716 | #endif |
@@ -372,47 +720,34 @@ retry: | |||
372 | goto out; | 720 | goto out; |
373 | } | 721 | } |
374 | 722 | ||
375 | /* futex_atomic_op_inuser needs to both read and write | 723 | /* |
724 | * futex_atomic_op_inuser needs to both read and write | ||
376 | * *(int __user *)uaddr2, but we can't modify it | 725 | * *(int __user *)uaddr2, but we can't modify it |
377 | * non-atomically. Therefore, if get_user below is not | 726 | * non-atomically. Therefore, if get_user below is not |
378 | * enough, we need to handle the fault ourselves, while | 727 | * enough, we need to handle the fault ourselves, while |
379 | * still holding the mmap_sem. */ | 728 | * still holding the mmap_sem. |
729 | */ | ||
380 | if (attempt++) { | 730 | if (attempt++) { |
381 | struct vm_area_struct * vma; | 731 | if (futex_handle_fault((unsigned long)uaddr2, |
382 | struct mm_struct *mm = current->mm; | 732 | attempt)) |
383 | |||
384 | ret = -EFAULT; | ||
385 | if (attempt >= 2 || | ||
386 | !(vma = find_vma(mm, uaddr2)) || | ||
387 | vma->vm_start > uaddr2 || | ||
388 | !(vma->vm_flags & VM_WRITE)) | ||
389 | goto out; | ||
390 | |||
391 | switch (handle_mm_fault(mm, vma, uaddr2, 1)) { | ||
392 | case VM_FAULT_MINOR: | ||
393 | current->min_flt++; | ||
394 | break; | ||
395 | case VM_FAULT_MAJOR: | ||
396 | current->maj_flt++; | ||
397 | break; | ||
398 | default: | ||
399 | goto out; | 733 | goto out; |
400 | } | ||
401 | goto retry; | 734 | goto retry; |
402 | } | 735 | } |
403 | 736 | ||
404 | /* If we would have faulted, release mmap_sem, | 737 | /* |
405 | * fault it in and start all over again. */ | 738 | * If we would have faulted, release mmap_sem, |
739 | * fault it in and start all over again. | ||
740 | */ | ||
406 | up_read(¤t->mm->mmap_sem); | 741 | up_read(¤t->mm->mmap_sem); |
407 | 742 | ||
408 | ret = get_user(dummy, (int __user *)uaddr2); | 743 | ret = get_user(dummy, uaddr2); |
409 | if (ret) | 744 | if (ret) |
410 | return ret; | 745 | return ret; |
411 | 746 | ||
412 | goto retryfull; | 747 | goto retryfull; |
413 | } | 748 | } |
414 | 749 | ||
415 | head = &bh1->chain; | 750 | head = &hb1->chain; |
416 | 751 | ||
417 | list_for_each_entry_safe(this, next, head, list) { | 752 | list_for_each_entry_safe(this, next, head, list) { |
418 | if (match_futex (&this->key, &key1)) { | 753 | if (match_futex (&this->key, &key1)) { |
@@ -423,7 +758,7 @@ retry: | |||
423 | } | 758 | } |
424 | 759 | ||
425 | if (op_ret > 0) { | 760 | if (op_ret > 0) { |
426 | head = &bh2->chain; | 761 | head = &hb2->chain; |
427 | 762 | ||
428 | op_ret = 0; | 763 | op_ret = 0; |
429 | list_for_each_entry_safe(this, next, head, list) { | 764 | list_for_each_entry_safe(this, next, head, list) { |
@@ -436,9 +771,9 @@ retry: | |||
436 | ret += op_ret; | 771 | ret += op_ret; |
437 | } | 772 | } |
438 | 773 | ||
439 | spin_unlock(&bh1->lock); | 774 | spin_unlock(&hb1->lock); |
440 | if (bh1 != bh2) | 775 | if (hb1 != hb2) |
441 | spin_unlock(&bh2->lock); | 776 | spin_unlock(&hb2->lock); |
442 | out: | 777 | out: |
443 | up_read(¤t->mm->mmap_sem); | 778 | up_read(¤t->mm->mmap_sem); |
444 | return ret; | 779 | return ret; |
@@ -448,11 +783,11 @@ out: | |||
448 | * Requeue all waiters hashed on one physical page to another | 783 | * Requeue all waiters hashed on one physical page to another |
449 | * physical page. | 784 | * physical page. |
450 | */ | 785 | */ |
451 | static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | 786 | static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, |
452 | int nr_wake, int nr_requeue, int *valp) | 787 | int nr_wake, int nr_requeue, u32 *cmpval) |
453 | { | 788 | { |
454 | union futex_key key1, key2; | 789 | union futex_key key1, key2; |
455 | struct futex_hash_bucket *bh1, *bh2; | 790 | struct futex_hash_bucket *hb1, *hb2; |
456 | struct list_head *head1; | 791 | struct list_head *head1; |
457 | struct futex_q *this, *next; | 792 | struct futex_q *this, *next; |
458 | int ret, drop_count = 0; | 793 | int ret, drop_count = 0; |
@@ -467,68 +802,68 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | |||
467 | if (unlikely(ret != 0)) | 802 | if (unlikely(ret != 0)) |
468 | goto out; | 803 | goto out; |
469 | 804 | ||
470 | bh1 = hash_futex(&key1); | 805 | hb1 = hash_futex(&key1); |
471 | bh2 = hash_futex(&key2); | 806 | hb2 = hash_futex(&key2); |
472 | 807 | ||
473 | if (bh1 < bh2) | 808 | double_lock_hb(hb1, hb2); |
474 | spin_lock(&bh1->lock); | ||
475 | spin_lock(&bh2->lock); | ||
476 | if (bh1 > bh2) | ||
477 | spin_lock(&bh1->lock); | ||
478 | 809 | ||
479 | if (likely(valp != NULL)) { | 810 | if (likely(cmpval != NULL)) { |
480 | int curval; | 811 | u32 curval; |
481 | 812 | ||
482 | ret = get_futex_value_locked(&curval, (int __user *)uaddr1); | 813 | ret = get_futex_value_locked(&curval, uaddr1); |
483 | 814 | ||
484 | if (unlikely(ret)) { | 815 | if (unlikely(ret)) { |
485 | spin_unlock(&bh1->lock); | 816 | spin_unlock(&hb1->lock); |
486 | if (bh1 != bh2) | 817 | if (hb1 != hb2) |
487 | spin_unlock(&bh2->lock); | 818 | spin_unlock(&hb2->lock); |
488 | 819 | ||
489 | /* If we would have faulted, release mmap_sem, fault | 820 | /* |
821 | * If we would have faulted, release mmap_sem, fault | ||
490 | * it in and start all over again. | 822 | * it in and start all over again. |
491 | */ | 823 | */ |
492 | up_read(¤t->mm->mmap_sem); | 824 | up_read(¤t->mm->mmap_sem); |
493 | 825 | ||
494 | ret = get_user(curval, (int __user *)uaddr1); | 826 | ret = get_user(curval, uaddr1); |
495 | 827 | ||
496 | if (!ret) | 828 | if (!ret) |
497 | goto retry; | 829 | goto retry; |
498 | 830 | ||
499 | return ret; | 831 | return ret; |
500 | } | 832 | } |
501 | if (curval != *valp) { | 833 | if (curval != *cmpval) { |
502 | ret = -EAGAIN; | 834 | ret = -EAGAIN; |
503 | goto out_unlock; | 835 | goto out_unlock; |
504 | } | 836 | } |
505 | } | 837 | } |
506 | 838 | ||
507 | head1 = &bh1->chain; | 839 | head1 = &hb1->chain; |
508 | list_for_each_entry_safe(this, next, head1, list) { | 840 | list_for_each_entry_safe(this, next, head1, list) { |
509 | if (!match_futex (&this->key, &key1)) | 841 | if (!match_futex (&this->key, &key1)) |
510 | continue; | 842 | continue; |
511 | if (++ret <= nr_wake) { | 843 | if (++ret <= nr_wake) { |
512 | wake_futex(this); | 844 | wake_futex(this); |
513 | } else { | 845 | } else { |
514 | list_move_tail(&this->list, &bh2->chain); | 846 | /* |
515 | this->lock_ptr = &bh2->lock; | 847 | * If key1 and key2 hash to the same bucket, no need to |
848 | * requeue. | ||
849 | */ | ||
850 | if (likely(head1 != &hb2->chain)) { | ||
851 | list_move_tail(&this->list, &hb2->chain); | ||
852 | this->lock_ptr = &hb2->lock; | ||
853 | } | ||
516 | this->key = key2; | 854 | this->key = key2; |
517 | get_key_refs(&key2); | 855 | get_key_refs(&key2); |
518 | drop_count++; | 856 | drop_count++; |
519 | 857 | ||
520 | if (ret - nr_wake >= nr_requeue) | 858 | if (ret - nr_wake >= nr_requeue) |
521 | break; | 859 | break; |
522 | /* Make sure to stop if key1 == key2 */ | ||
523 | if (head1 == &bh2->chain && head1 != &next->list) | ||
524 | head1 = &this->list; | ||
525 | } | 860 | } |
526 | } | 861 | } |
527 | 862 | ||
528 | out_unlock: | 863 | out_unlock: |
529 | spin_unlock(&bh1->lock); | 864 | spin_unlock(&hb1->lock); |
530 | if (bh1 != bh2) | 865 | if (hb1 != hb2) |
531 | spin_unlock(&bh2->lock); | 866 | spin_unlock(&hb2->lock); |
532 | 867 | ||
533 | /* drop_key_refs() must be called outside the spinlocks. */ | 868 | /* drop_key_refs() must be called outside the spinlocks. */ |
534 | while (--drop_count >= 0) | 869 | while (--drop_count >= 0) |
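
futex_requeue() is the service routine for FUTEX_CMP_REQUEUE: it wakes at most nr_wake waiters, moves up to nr_requeue of the rest onto the second futex, and fails with -EAGAIN if *uaddr1 no longer matches cmpval so the caller can retry. Condition-variable implementations use this to move a broadcast's sleepers onto the mutex word instead of waking them all at once. A hedged sketch of the userspace call, assuming the FUTEX_CMP_REQUEUE definition from linux/futex.h and omitting error handling:

/* Hedged sketch: "broadcast" by waking one waiter on the condvar word
 * and requeueing everyone else onto the mutex word, provided the
 * condvar word still holds the value we saw (val3, i.e. cmpval above).
 * A return of -EAGAIN means the word changed under us and the caller
 * should retry. */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <limits.h>

static long cond_broadcast(uint32_t *cond_word, uint32_t *mutex_word,
			   uint32_t seen_value)
{
	return syscall(SYS_futex, cond_word, FUTEX_CMP_REQUEUE,
		       1,				/* nr_wake: wake a single waiter  */
		       (void *)(unsigned long)INT_MAX,	/* nr_requeue: all the rest       */
		       mutex_word,			/* new wait word for the requeued */
		       seen_value);			/* must still equal *cond_word    */
}
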
@@ -543,7 +878,7 @@ out: | |||
543 | static inline struct futex_hash_bucket * | 878 | static inline struct futex_hash_bucket * |
544 | queue_lock(struct futex_q *q, int fd, struct file *filp) | 879 | queue_lock(struct futex_q *q, int fd, struct file *filp) |
545 | { | 880 | { |
546 | struct futex_hash_bucket *bh; | 881 | struct futex_hash_bucket *hb; |
547 | 882 | ||
548 | q->fd = fd; | 883 | q->fd = fd; |
549 | q->filp = filp; | 884 | q->filp = filp; |
@@ -551,23 +886,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) | |||
551 | init_waitqueue_head(&q->waiters); | 886 | init_waitqueue_head(&q->waiters); |
552 | 887 | ||
553 | get_key_refs(&q->key); | 888 | get_key_refs(&q->key); |
554 | bh = hash_futex(&q->key); | 889 | hb = hash_futex(&q->key); |
555 | q->lock_ptr = &bh->lock; | 890 | q->lock_ptr = &hb->lock; |
556 | 891 | ||
557 | spin_lock(&bh->lock); | 892 | spin_lock(&hb->lock); |
558 | return bh; | 893 | return hb; |
559 | } | 894 | } |
560 | 895 | ||
561 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) | 896 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) |
562 | { | 897 | { |
563 | list_add_tail(&q->list, &bh->chain); | 898 | list_add_tail(&q->list, &hb->chain); |
564 | spin_unlock(&bh->lock); | 899 | q->task = current; |
900 | spin_unlock(&hb->lock); | ||
565 | } | 901 | } |
566 | 902 | ||
567 | static inline void | 903 | static inline void |
568 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | 904 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
569 | { | 905 | { |
570 | spin_unlock(&bh->lock); | 906 | spin_unlock(&hb->lock); |
571 | drop_key_refs(&q->key); | 907 | drop_key_refs(&q->key); |
572 | } | 908 | } |
573 | 909 | ||
@@ -579,16 +915,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | |||
579 | /* The key must be already stored in q->key. */ | 915 | /* The key must be already stored in q->key. */ |
580 | static void queue_me(struct futex_q *q, int fd, struct file *filp) | 916 | static void queue_me(struct futex_q *q, int fd, struct file *filp) |
581 | { | 917 | { |
582 | struct futex_hash_bucket *bh; | 918 | struct futex_hash_bucket *hb; |
583 | bh = queue_lock(q, fd, filp); | 919 | |
584 | __queue_me(q, bh); | 920 | hb = queue_lock(q, fd, filp); |
921 | __queue_me(q, hb); | ||
585 | } | 922 | } |
586 | 923 | ||
587 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ | 924 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ |
588 | static int unqueue_me(struct futex_q *q) | 925 | static int unqueue_me(struct futex_q *q) |
589 | { | 926 | { |
590 | int ret = 0; | ||
591 | spinlock_t *lock_ptr; | 927 | spinlock_t *lock_ptr; |
928 | int ret = 0; | ||
592 | 929 | ||
593 | /* In the common case we don't take the spinlock, which is nice. */ | 930 | /* In the common case we don't take the spinlock, which is nice. */ |
594 | retry: | 931 | retry: |
@@ -614,6 +951,9 @@ static int unqueue_me(struct futex_q *q) | |||
614 | } | 951 | } |
615 | WARN_ON(list_empty(&q->list)); | 952 | WARN_ON(list_empty(&q->list)); |
616 | list_del(&q->list); | 953 | list_del(&q->list); |
954 | |||
955 | BUG_ON(q->pi_state); | ||
956 | |||
617 | spin_unlock(lock_ptr); | 957 | spin_unlock(lock_ptr); |
618 | ret = 1; | 958 | ret = 1; |
619 | } | 959 | } |
@@ -622,21 +962,42 @@ static int unqueue_me(struct futex_q *q) | |||
622 | return ret; | 962 | return ret; |
623 | } | 963 | } |
624 | 964 | ||
625 | static int futex_wait(unsigned long uaddr, int val, unsigned long time) | 965 | /* |
966 | * PI futexes can not be requeued and must remove themself from the | ||
967 | * hash bucket. The hash bucket lock is held on entry and dropped here. | ||
968 | */ | ||
969 | static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) | ||
626 | { | 970 | { |
627 | DECLARE_WAITQUEUE(wait, current); | 971 | WARN_ON(list_empty(&q->list)); |
628 | int ret, curval; | 972 | list_del(&q->list); |
973 | |||
974 | BUG_ON(!q->pi_state); | ||
975 | free_pi_state(q->pi_state); | ||
976 | q->pi_state = NULL; | ||
977 | |||
978 | spin_unlock(&hb->lock); | ||
979 | |||
980 | drop_key_refs(&q->key); | ||
981 | } | ||
982 | |||
983 | static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) | ||
984 | { | ||
985 | struct task_struct *curr = current; | ||
986 | DECLARE_WAITQUEUE(wait, curr); | ||
987 | struct futex_hash_bucket *hb; | ||
629 | struct futex_q q; | 988 | struct futex_q q; |
630 | struct futex_hash_bucket *bh; | 989 | u32 uval; |
990 | int ret; | ||
631 | 991 | ||
992 | q.pi_state = NULL; | ||
632 | retry: | 993 | retry: |
633 | down_read(¤t->mm->mmap_sem); | 994 | down_read(&curr->mm->mmap_sem); |
634 | 995 | ||
635 | ret = get_futex_key(uaddr, &q.key); | 996 | ret = get_futex_key(uaddr, &q.key); |
636 | if (unlikely(ret != 0)) | 997 | if (unlikely(ret != 0)) |
637 | goto out_release_sem; | 998 | goto out_release_sem; |
638 | 999 | ||
639 | bh = queue_lock(&q, -1, NULL); | 1000 | hb = queue_lock(&q, -1, NULL); |
640 | 1001 | ||
641 | /* | 1002 | /* |
642 | * Access the page AFTER the futex is queued. | 1003 | * Access the page AFTER the futex is queued. |
@@ -658,37 +1019,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
658 | * We hold the mmap semaphore, so the mapping cannot have changed | 1019 | * We hold the mmap semaphore, so the mapping cannot have changed |
659 | * since we looked it up in get_futex_key. | 1020 | * since we looked it up in get_futex_key. |
660 | */ | 1021 | */ |
661 | 1022 | ret = get_futex_value_locked(&uval, uaddr); | |
662 | ret = get_futex_value_locked(&curval, (int __user *)uaddr); | ||
663 | 1023 | ||
664 | if (unlikely(ret)) { | 1024 | if (unlikely(ret)) { |
665 | queue_unlock(&q, bh); | 1025 | queue_unlock(&q, hb); |
666 | 1026 | ||
667 | /* If we would have faulted, release mmap_sem, fault it in and | 1027 | /* |
1028 | * If we would have faulted, release mmap_sem, fault it in and | ||
668 | * start all over again. | 1029 | * start all over again. |
669 | */ | 1030 | */ |
670 | up_read(¤t->mm->mmap_sem); | 1031 | up_read(&curr->mm->mmap_sem); |
671 | 1032 | ||
672 | ret = get_user(curval, (int __user *)uaddr); | 1033 | ret = get_user(uval, uaddr); |
673 | 1034 | ||
674 | if (!ret) | 1035 | if (!ret) |
675 | goto retry; | 1036 | goto retry; |
676 | return ret; | 1037 | return ret; |
677 | } | 1038 | } |
678 | if (curval != val) { | 1039 | ret = -EWOULDBLOCK; |
679 | ret = -EWOULDBLOCK; | 1040 | if (uval != val) |
680 | queue_unlock(&q, bh); | 1041 | goto out_unlock_release_sem; |
681 | goto out_release_sem; | ||
682 | } | ||
683 | 1042 | ||
684 | /* Only actually queue if *uaddr contained val. */ | 1043 | /* Only actually queue if *uaddr contained val. */ |
685 | __queue_me(&q, bh); | 1044 | __queue_me(&q, hb); |
686 | 1045 | ||
687 | /* | 1046 | /* |
688 | * Now the futex is queued and we have checked the data, we | 1047 | * Now the futex is queued and we have checked the data, we |
689 | * don't want to hold mmap_sem while we sleep. | 1048 | * don't want to hold mmap_sem while we sleep. |
690 | */ | 1049 | */ |
691 | up_read(¤t->mm->mmap_sem); | 1050 | up_read(&curr->mm->mmap_sem); |
692 | 1051 | ||
693 | /* | 1052 | /* |
694 | * There might have been scheduling since the queue_me(), as we | 1053 | * There might have been scheduling since the queue_me(), as we |
@@ -720,12 +1079,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
720 | return 0; | 1079 | return 0; |
721 | if (time == 0) | 1080 | if (time == 0) |
722 | return -ETIMEDOUT; | 1081 | return -ETIMEDOUT; |
723 | /* We expect signal_pending(current), but another thread may | 1082 | /* |
724 | * have handled it for us already. */ | 1083 | * We expect signal_pending(current), but another thread may |
1084 | * have handled it for us already. | ||
1085 | */ | ||
725 | return -EINTR; | 1086 | return -EINTR; |
726 | 1087 | ||
1088 | out_unlock_release_sem: | ||
1089 | queue_unlock(&q, hb); | ||
1090 | |||
727 | out_release_sem: | 1091 | out_release_sem: |
1092 | up_read(&curr->mm->mmap_sem); | ||
1093 | return ret; | ||
1094 | } | ||
1095 | |||
1096 | /* | ||
1097 | * Userspace tried a 0 -> TID atomic transition of the futex value | ||
1098 | * and failed. The kernel side here does the whole locking operation: | ||
1099 | * if there are waiters then it will block, it does PI, etc. (Due to | ||
1100 | * races the kernel might see a 0 value of the futex too.) | ||
1101 | */ | ||
1102 | static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, | ||
1103 | struct hrtimer_sleeper *to) | ||
1104 | { | ||
1105 | struct task_struct *curr = current; | ||
1106 | struct futex_hash_bucket *hb; | ||
1107 | u32 uval, newval, curval; | ||
1108 | struct futex_q q; | ||
1109 | int ret, attempt = 0; | ||
1110 | |||
1111 | if (refill_pi_state_cache()) | ||
1112 | return -ENOMEM; | ||
1113 | |||
1114 | q.pi_state = NULL; | ||
1115 | retry: | ||
1116 | down_read(&curr->mm->mmap_sem); | ||
1117 | |||
1118 | ret = get_futex_key(uaddr, &q.key); | ||
1119 | if (unlikely(ret != 0)) | ||
1120 | goto out_release_sem; | ||
1121 | |||
1122 | hb = queue_lock(&q, -1, NULL); | ||
1123 | |||
1124 | retry_locked: | ||
1125 | /* | ||
1126 | * To avoid races, we attempt to take the lock here again | ||
1127 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
1128 | * the locks. It will most likely not succeed. | ||
1129 | */ | ||
1130 | newval = current->pid; | ||
1131 | |||
1132 | inc_preempt_count(); | ||
1133 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); | ||
1134 | dec_preempt_count(); | ||
1135 | |||
1136 | if (unlikely(curval == -EFAULT)) | ||
1137 | goto uaddr_faulted; | ||
1138 | |||
1139 | /* We own the lock already */ | ||
1140 | if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { | ||
1141 | if (!detect && 0) | ||
1142 | force_sig(SIGKILL, current); | ||
1143 | ret = -EDEADLK; | ||
1144 | goto out_unlock_release_sem; | ||
1145 | } | ||
1146 | |||
1147 | /* | ||
1148 | * Surprise - we got the lock. Just return | ||
1149 | * to userspace: | ||
1150 | */ | ||
1151 | if (unlikely(!curval)) | ||
1152 | goto out_unlock_release_sem; | ||
1153 | |||
1154 | uval = curval; | ||
1155 | newval = uval | FUTEX_WAITERS; | ||
1156 | |||
1157 | inc_preempt_count(); | ||
1158 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
1159 | dec_preempt_count(); | ||
1160 | |||
1161 | if (unlikely(curval == -EFAULT)) | ||
1162 | goto uaddr_faulted; | ||
1163 | if (unlikely(curval != uval)) | ||
1164 | goto retry_locked; | ||
1165 | |||
1166 | /* | ||
1167 | * We don't have the lock. Look up the PI state (or create it if | ||
1168 | * we are the first waiter): | ||
1169 | */ | ||
1170 | ret = lookup_pi_state(uval, hb, &q); | ||
1171 | |||
1172 | if (unlikely(ret)) { | ||
1173 | /* | ||
1174 | * There were no waiters and the owner task lookup | ||
1175 | * failed. When the OWNER_DIED bit is set, then we | ||
1176 | * know that this is a robust futex and we actually | ||
1177 | * take the lock. This is safe as we are protected by | ||
1178 | * the hash bucket lock. We also set the waiters bit | ||
1179 | * unconditionally here, to simplify glibc handling of | ||
1180 | * multiple tasks racing to acquire the lock and | ||
1181 | * cleanup the problems which were left by the dead | ||
1182 | * owner. | ||
1183 | */ | ||
1184 | if (curval & FUTEX_OWNER_DIED) { | ||
1185 | uval = newval; | ||
1186 | newval = current->pid | | ||
1187 | FUTEX_OWNER_DIED | FUTEX_WAITERS; | ||
1188 | |||
1189 | inc_preempt_count(); | ||
1190 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
1191 | uval, newval); | ||
1192 | dec_preempt_count(); | ||
1193 | |||
1194 | if (unlikely(curval == -EFAULT)) | ||
1195 | goto uaddr_faulted; | ||
1196 | if (unlikely(curval != uval)) | ||
1197 | goto retry_locked; | ||
1198 | ret = 0; | ||
1199 | } | ||
1200 | goto out_unlock_release_sem; | ||
1201 | } | ||
1202 | |||
1203 | /* | ||
1204 | * Only actually queue now that the atomic ops are done: | ||
1205 | */ | ||
1206 | __queue_me(&q, hb); | ||
1207 | |||
1208 | /* | ||
1209 | * Now the futex is queued and we have checked the data, we | ||
1210 | * don't want to hold mmap_sem while we sleep. | ||
1211 | */ | ||
1212 | up_read(&curr->mm->mmap_sem); | ||
1213 | |||
1214 | WARN_ON(!q.pi_state); | ||
1215 | /* | ||
1216 | * Block on the PI mutex: | ||
1217 | */ | ||
1218 | if (!trylock) | ||
1219 | ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); | ||
1220 | else { | ||
1221 | ret = rt_mutex_trylock(&q.pi_state->pi_mutex); | ||
1222 | /* Fixup the trylock return value: */ | ||
1223 | ret = ret ? 0 : -EWOULDBLOCK; | ||
1224 | } | ||
1225 | |||
1226 | down_read(&curr->mm->mmap_sem); | ||
1227 | spin_lock(q.lock_ptr); | ||
1228 | |||
1229 | /* | ||
1230 | * Got the lock. We might not be the anticipated owner if we | ||
1231 | * did a lock-steal - fix up the PI-state in that case. | ||
1232 | */ | ||
1233 | if (!ret && q.pi_state->owner != curr) { | ||
1234 | u32 newtid = current->pid | FUTEX_WAITERS; | ||
1235 | |||
1236 | /* Owner died? */ | ||
1237 | if (q.pi_state->owner != NULL) { | ||
1238 | spin_lock_irq(&q.pi_state->owner->pi_lock); | ||
1239 | list_del_init(&q.pi_state->list); | ||
1240 | spin_unlock_irq(&q.pi_state->owner->pi_lock); | ||
1241 | } else | ||
1242 | newtid |= FUTEX_OWNER_DIED; | ||
1243 | |||
1244 | q.pi_state->owner = current; | ||
1245 | |||
1246 | spin_lock_irq(¤t->pi_lock); | ||
1247 | list_add(&q.pi_state->list, ¤t->pi_state_list); | ||
1248 | spin_unlock_irq(¤t->pi_lock); | ||
1249 | |||
1250 | /* Unqueue and drop the lock */ | ||
1251 | unqueue_me_pi(&q, hb); | ||
1252 | up_read(&curr->mm->mmap_sem); | ||
1253 | /* | ||
1254 | * We own it, so we have to replace the pending owner | ||
1255 | * TID. This must be atomic as we have to preserve the | ||
1256 | * owner died bit here. | ||
1257 | */ | ||
1258 | ret = get_user(uval, uaddr); | ||
1259 | while (!ret) { | ||
1260 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | ||
1261 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
1262 | uval, newval); | ||
1263 | if (curval == -EFAULT) | ||
1264 | ret = -EFAULT; | ||
1265 | if (curval == uval) | ||
1266 | break; | ||
1267 | uval = curval; | ||
1268 | } | ||
1269 | } else { | ||
1270 | /* | ||
1271 | * Catch the rare case, where the lock was released | ||
1272 | * when we were on the way back before we locked | ||
1273 | * the hash bucket. | ||
1274 | */ | ||
1275 | if (ret && q.pi_state->owner == curr) { | ||
1276 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
1277 | ret = 0; | ||
1278 | } | ||
1279 | /* Unqueue and drop the lock */ | ||
1280 | unqueue_me_pi(&q, hb); | ||
1281 | up_read(&curr->mm->mmap_sem); | ||
1282 | } | ||
1283 | |||
1284 | if (!detect && ret == -EDEADLK && 0) | ||
1285 | force_sig(SIGKILL, current); | ||
1286 | |||
1287 | return ret; | ||
1288 | |||
1289 | out_unlock_release_sem: | ||
1290 | queue_unlock(&q, hb); | ||
1291 | |||
1292 | out_release_sem: | ||
1293 | up_read(&curr->mm->mmap_sem); | ||
1294 | return ret; | ||
1295 | |||
1296 | uaddr_faulted: | ||
1297 | /* | ||
1298 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
1299 | * non-atomically. Therefore, if get_user below is not | ||
1300 | * enough, we need to handle the fault ourselves, while | ||
1301 | * still holding the mmap_sem. | ||
1302 | */ | ||
1303 | if (attempt++) { | ||
1304 | if (futex_handle_fault((unsigned long)uaddr, attempt)) | ||
1305 | goto out_unlock_release_sem; | ||
1306 | |||
1307 | goto retry_locked; | ||
1308 | } | ||
1309 | |||
1310 | queue_unlock(&q, hb); | ||
1311 | up_read(&curr->mm->mmap_sem); | ||
1312 | |||
1313 | ret = get_user(uval, uaddr); | ||
1314 | if (!ret && (uval != -EFAULT)) | ||
1315 | goto retry; | ||
1316 | |||
1317 | return ret; | ||
1318 | } | ||
1319 | |||
1320 | /* | ||
1321 | * Restart handler | ||
1322 | */ | ||
1323 | static long futex_lock_pi_restart(struct restart_block *restart) | ||
1324 | { | ||
1325 | struct hrtimer_sleeper timeout, *to = NULL; | ||
1326 | int ret; | ||
1327 | |||
1328 | restart->fn = do_no_restart_syscall; | ||
1329 | |||
1330 | if (restart->arg2 || restart->arg3) { | ||
1331 | to = &timeout; | ||
1332 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | ||
1333 | hrtimer_init_sleeper(to, current); | ||
1334 | to->timer.expires.tv64 = ((u64)restart->arg1 << 32) | | ||
1335 | (u64) restart->arg0; | ||
1336 | } | ||
1337 | |||
1338 | pr_debug("lock_pi restart: %p, %d (%d)\n", | ||
1339 | (u32 __user *)restart->arg0, current->pid); | ||
1340 | |||
1341 | ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, | ||
1342 | 0, to); | ||
1343 | |||
1344 | if (ret != -EINTR) | ||
1345 | return ret; | ||
1346 | |||
1347 | restart->fn = futex_lock_pi_restart; | ||
1348 | |||
1349 | /* The other values are filled in */ | ||
1350 | return -ERESTART_RESTARTBLOCK; | ||
1351 | } | ||
1352 | |||
1353 | /* | ||
1354 | * Called from the syscall entry below. | ||
1355 | */ | ||
1356 | static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | ||
1357 | long nsec, int trylock) | ||
1358 | { | ||
1359 | struct hrtimer_sleeper timeout, *to = NULL; | ||
1360 | struct restart_block *restart; | ||
1361 | int ret; | ||
1362 | |||
1363 | if (sec != MAX_SCHEDULE_TIMEOUT) { | ||
1364 | to = &timeout; | ||
1365 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | ||
1366 | hrtimer_init_sleeper(to, current); | ||
1367 | to->timer.expires = ktime_set(sec, nsec); | ||
1368 | } | ||
1369 | |||
1370 | ret = do_futex_lock_pi(uaddr, detect, trylock, to); | ||
1371 | |||
1372 | if (ret != -EINTR) | ||
1373 | return ret; | ||
1374 | |||
1375 | pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid); | ||
1376 | |||
1377 | restart = ¤t_thread_info()->restart_block; | ||
1378 | restart->fn = futex_lock_pi_restart; | ||
1379 | restart->arg0 = (unsigned long) uaddr; | ||
1380 | restart->arg1 = detect; | ||
1381 | if (to) { | ||
1382 | restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; | ||
1383 | restart->arg3 = to->timer.expires.tv64 >> 32; | ||
1384 | } else | ||
1385 | restart->arg2 = restart->arg3 = 0; | ||
1386 | |||
1387 | return -ERESTART_RESTARTBLOCK; | ||
1388 | } | ||
1389 | |||
1390 | /* | ||
1391 | * Userspace attempted a TID -> 0 atomic transition, and failed. | ||
1392 | * This is the in-kernel slowpath: we look up the PI state (if any), | ||
1393 | * and do the rt-mutex unlock. | ||
1394 | */ | ||
1395 | static int futex_unlock_pi(u32 __user *uaddr) | ||
1396 | { | ||
1397 | struct futex_hash_bucket *hb; | ||
1398 | struct futex_q *this, *next; | ||
1399 | u32 uval; | ||
1400 | struct list_head *head; | ||
1401 | union futex_key key; | ||
1402 | int ret, attempt = 0; | ||
1403 | |||
1404 | retry: | ||
1405 | if (get_user(uval, uaddr)) | ||
1406 | return -EFAULT; | ||
1407 | /* | ||
1408 | * We release only a lock we actually own: | ||
1409 | */ | ||
1410 | if ((uval & FUTEX_TID_MASK) != current->pid) | ||
1411 | return -EPERM; | ||
1412 | /* | ||
1413 | * First take all the futex related locks: | ||
1414 | */ | ||
1415 | down_read(¤t->mm->mmap_sem); | ||
1416 | |||
1417 | ret = get_futex_key(uaddr, &key); | ||
1418 | if (unlikely(ret != 0)) | ||
1419 | goto out; | ||
1420 | |||
1421 | hb = hash_futex(&key); | ||
1422 | spin_lock(&hb->lock); | ||
1423 | |||
1424 | retry_locked: | ||
1425 | /* | ||
1426 | * To avoid races, try to do the TID -> 0 atomic transition | ||
1427 | * again. If it succeeds then we can return without waking | ||
1428 | * anyone else up: | ||
1429 | */ | ||
1430 | inc_preempt_count(); | ||
1431 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); | ||
1432 | dec_preempt_count(); | ||
1433 | |||
1434 | if (unlikely(uval == -EFAULT)) | ||
1435 | goto pi_faulted; | ||
1436 | /* | ||
1437 | * Rare case: we managed to release the lock atomically, | ||
1438 | * no need to wake anyone else up: | ||
1439 | */ | ||
1440 | if (unlikely(uval == current->pid)) | ||
1441 | goto out_unlock; | ||
1442 | |||
1443 | /* | ||
1444 | * Ok, other tasks may need to be woken up - check waiters | ||
1445 | * and do the wakeup if necessary: | ||
1446 | */ | ||
1447 | head = &hb->chain; | ||
1448 | |||
1449 | list_for_each_entry_safe(this, next, head, list) { | ||
1450 | if (!match_futex (&this->key, &key)) | ||
1451 | continue; | ||
1452 | ret = wake_futex_pi(uaddr, uval, this); | ||
1453 | /* | ||
1454 | * The atomic access to the futex value | ||
1455 | * generated a pagefault, so retry the | ||
1456 | * user-access and the wakeup: | ||
1457 | */ | ||
1458 | if (ret == -EFAULT) | ||
1459 | goto pi_faulted; | ||
1460 | goto out_unlock; | ||
1461 | } | ||
1462 | /* | ||
1463 | * No waiters - kernel unlocks the futex: | ||
1464 | */ | ||
1465 | ret = unlock_futex_pi(uaddr, uval); | ||
1466 | if (ret == -EFAULT) | ||
1467 | goto pi_faulted; | ||
1468 | |||
1469 | out_unlock: | ||
1470 | spin_unlock(&hb->lock); | ||
1471 | out: | ||
728 | up_read(¤t->mm->mmap_sem); | 1472 | up_read(¤t->mm->mmap_sem); |
1473 | |||
1474 | return ret; | ||
1475 | |||
1476 | pi_faulted: | ||
1477 | /* | ||
1478 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
1479 | * non-atomically. Therefore, if get_user below is not | ||
1480 | * enough, we need to handle the fault ourselves, while | ||
1481 | * still holding the mmap_sem. | ||
1482 | */ | ||
1483 | if (attempt++) { | ||
1484 | if (futex_handle_fault((unsigned long)uaddr, attempt)) | ||
1485 | goto out_unlock; | ||
1486 | |||
1487 | goto retry_locked; | ||
1488 | } | ||
1489 | |||
1490 | spin_unlock(&hb->lock); | ||
1491 | up_read(¤t->mm->mmap_sem); | ||
1492 | |||
1493 | ret = get_user(uval, uaddr); | ||
1494 | if (!ret && (uval != -EFAULT)) | ||
1495 | goto retry; | ||
1496 | |||
729 | return ret; | 1497 | return ret; |
730 | } | 1498 | } |
731 | 1499 | ||
@@ -735,6 +1503,7 @@ static int futex_close(struct inode *inode, struct file *filp) | |||
735 | 1503 | ||
736 | unqueue_me(q); | 1504 | unqueue_me(q); |
737 | kfree(q); | 1505 | kfree(q); |
1506 | |||
738 | return 0; | 1507 | return 0; |
739 | } | 1508 | } |
740 | 1509 | ||
@@ -766,7 +1535,7 @@ static struct file_operations futex_fops = { | |||
766 | * Signal allows caller to avoid the race which would occur if they | 1535 | * Signal allows caller to avoid the race which would occur if they |
767 | * set the sigio stuff up afterwards. | 1536 | * set the sigio stuff up afterwards. |
768 | */ | 1537 | */ |
769 | static int futex_fd(unsigned long uaddr, int signal) | 1538 | static int futex_fd(u32 __user *uaddr, int signal) |
770 | { | 1539 | { |
771 | struct futex_q *q; | 1540 | struct futex_q *q; |
772 | struct file *filp; | 1541 | struct file *filp; |
@@ -803,6 +1572,7 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
803 | err = -ENOMEM; | 1572 | err = -ENOMEM; |
804 | goto error; | 1573 | goto error; |
805 | } | 1574 | } |
1575 | q->pi_state = NULL; | ||
806 | 1576 | ||
807 | down_read(¤t->mm->mmap_sem); | 1577 | down_read(¤t->mm->mmap_sem); |
808 | err = get_futex_key(uaddr, &q->key); | 1578 | err = get_futex_key(uaddr, &q->key); |
@@ -840,7 +1610,7 @@ error: | |||
840 | * Implementation: user-space maintains a per-thread list of locks it | 1610 | * Implementation: user-space maintains a per-thread list of locks it |
841 | * is holding. Upon do_exit(), the kernel carefully walks this list, | 1611 | * is holding. Upon do_exit(), the kernel carefully walks this list, |
842 | * and marks all locks that are owned by this thread with the | 1612 | * and marks all locks that are owned by this thread with the |
843 | * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is | 1613 | * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is |
844 | * always manipulated with the lock held, so the list is private and | 1614 | * always manipulated with the lock held, so the list is private and |
845 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' | 1615 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' |
846 | * field, to allow the kernel to clean up if the thread dies after | 1616 | * field, to allow the kernel to clean up if the thread dies after |
@@ -915,7 +1685,7 @@ err_unlock: | |||
915 | */ | 1685 | */ |
916 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) | 1686 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) |
917 | { | 1687 | { |
918 | u32 uval; | 1688 | u32 uval, nval; |
919 | 1689 | ||
920 | retry: | 1690 | retry: |
921 | if (get_user(uval, uaddr)) | 1691 | if (get_user(uval, uaddr)) |
@@ -932,12 +1702,16 @@ retry: | |||
932 | * thread-death.) The rest of the cleanup is done in | 1702 | * thread-death.) The rest of the cleanup is done in |
933 | * userspace. | 1703 | * userspace. |
934 | */ | 1704 | */ |
935 | if (futex_atomic_cmpxchg_inatomic(uaddr, uval, | 1705 | nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, |
936 | uval | FUTEX_OWNER_DIED) != uval) | 1706 | uval | FUTEX_OWNER_DIED); |
1707 | if (nval == -EFAULT) | ||
1708 | return -1; | ||
1709 | |||
1710 | if (nval != uval) | ||
937 | goto retry; | 1711 | goto retry; |
938 | 1712 | ||
939 | if (uval & FUTEX_WAITERS) | 1713 | if (uval & FUTEX_WAITERS) |
940 | futex_wake((unsigned long)uaddr, 1); | 1714 | futex_wake(uaddr, 1); |
941 | } | 1715 | } |
942 | return 0; | 1716 | return 0; |
943 | } | 1717 | } |
@@ -978,7 +1752,7 @@ void exit_robust_list(struct task_struct *curr) | |||
978 | while (entry != &head->list) { | 1752 | while (entry != &head->list) { |
979 | /* | 1753 | /* |
980 | * A pending lock might already be on the list, so | 1754 | * A pending lock might already be on the list, so |
981 | * dont process it twice: | 1755 | * don't process it twice: |
982 | */ | 1756 | */ |
983 | if (entry != pending) | 1757 | if (entry != pending) |
984 | if (handle_futex_death((void *)entry + futex_offset, | 1758 | if (handle_futex_death((void *)entry + futex_offset, |
@@ -999,8 +1773,8 @@ void exit_robust_list(struct task_struct *curr) | |||
999 | } | 1773 | } |
1000 | } | 1774 | } |
1001 | 1775 | ||
1002 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | 1776 | long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, |
1003 | unsigned long uaddr2, int val2, int val3) | 1777 | u32 __user *uaddr2, u32 val2, u32 val3) |
1004 | { | 1778 | { |
1005 | int ret; | 1779 | int ret; |
1006 | 1780 | ||
@@ -1024,6 +1798,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
1024 | case FUTEX_WAKE_OP: | 1798 | case FUTEX_WAKE_OP: |
1025 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); | 1799 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); |
1026 | break; | 1800 | break; |
1801 | case FUTEX_LOCK_PI: | ||
1802 | ret = futex_lock_pi(uaddr, val, timeout, val2, 0); | ||
1803 | break; | ||
1804 | case FUTEX_UNLOCK_PI: | ||
1805 | ret = futex_unlock_pi(uaddr); | ||
1806 | break; | ||
1807 | case FUTEX_TRYLOCK_PI: | ||
1808 | ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); | ||
1809 | break; | ||
1027 | default: | 1810 | default: |
1028 | ret = -ENOSYS; | 1811 | ret = -ENOSYS; |
1029 | } | 1812 | } |
@@ -1031,36 +1814,40 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
1031 | } | 1814 | } |
1032 | 1815 | ||
1033 | 1816 | ||
1034 | asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, | 1817 | asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, |
1035 | struct timespec __user *utime, u32 __user *uaddr2, | 1818 | struct timespec __user *utime, u32 __user *uaddr2, |
1036 | int val3) | 1819 | u32 val3) |
1037 | { | 1820 | { |
1038 | struct timespec t; | 1821 | struct timespec t; |
1039 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 1822 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
1040 | int val2 = 0; | 1823 | u32 val2 = 0; |
1041 | 1824 | ||
1042 | if (utime && (op == FUTEX_WAIT)) { | 1825 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { |
1043 | if (copy_from_user(&t, utime, sizeof(t)) != 0) | 1826 | if (copy_from_user(&t, utime, sizeof(t)) != 0) |
1044 | return -EFAULT; | 1827 | return -EFAULT; |
1045 | if (!timespec_valid(&t)) | 1828 | if (!timespec_valid(&t)) |
1046 | return -EINVAL; | 1829 | return -EINVAL; |
1047 | timeout = timespec_to_jiffies(&t) + 1; | 1830 | if (op == FUTEX_WAIT) |
1831 | timeout = timespec_to_jiffies(&t) + 1; | ||
1832 | else { | ||
1833 | timeout = t.tv_sec; | ||
1834 | val2 = t.tv_nsec; | ||
1835 | } | ||
1048 | } | 1836 | } |
1049 | /* | 1837 | /* |
1050 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. | 1838 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. |
1051 | */ | 1839 | */ |
1052 | if (op >= FUTEX_REQUEUE) | 1840 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) |
1053 | val2 = (int) (unsigned long) utime; | 1841 | val2 = (u32) (unsigned long) utime; |
1054 | 1842 | ||
1055 | return do_futex((unsigned long)uaddr, op, val, timeout, | 1843 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); |
1056 | (unsigned long)uaddr2, val2, val3); | ||
1057 | } | 1844 | } |
1058 | 1845 | ||
1059 | static struct super_block * | 1846 | static int futexfs_get_sb(struct file_system_type *fs_type, |
1060 | futexfs_get_sb(struct file_system_type *fs_type, | 1847 | int flags, const char *dev_name, void *data, |
1061 | int flags, const char *dev_name, void *data) | 1848 | struct vfsmount *mnt) |
1062 | { | 1849 | { |
1063 | return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); | 1850 | return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); |
1064 | } | 1851 | } |
1065 | 1852 | ||
1066 | static struct file_system_type futex_fs_type = { | 1853 | static struct file_system_type futex_fs_type = { |
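
Editor's note: the futex.c hunks above add the kernel half of the PI-futex protocol — userspace does a 0 -> TID cmpxchg in the fast path and only enters the kernel (FUTEX_LOCK_PI / FUTEX_UNLOCK_PI) on contention. Below is a minimal userspace sketch of that protocol; it is not the glibc implementation, and pi_lock()/pi_unlock() plus the opcode values are illustrative, assumed to match the patched <linux/futex.h>.

/* Hedged sketch of the PI-futex fast and slow paths (illustrative only). */
#define _GNU_SOURCE
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

#define FUTEX_LOCK_PI	6	/* assumed values, see the futex.h change in this series */
#define FUTEX_UNLOCK_PI	7

static void pi_lock(uint32_t *futex)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: uncontended 0 -> TID transition, no syscall. */
	if (__sync_val_compare_and_swap(futex, 0, tid) == 0)
		return;
	/* Contended: futex_lock_pi() queues us, sets FUTEX_WAITERS and
	 * blocks on the rt-mutex until *futex carries our TID. */
	syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(uint32_t *futex)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: TID -> 0 only succeeds while no waiter bit is set. */
	if (__sync_val_compare_and_swap(futex, tid, 0) == tid)
		return;
	/* Waiters present: futex_unlock_pi() hands the lock to the top
	 * waiter and fixes up the user-space value. */
	syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}
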
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 1ab6a0ea3d14..d1d92b441fb7 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, | |||
129 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 129 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
130 | int val2 = 0; | 130 | int val2 = 0; |
131 | 131 | ||
132 | if (utime && (op == FUTEX_WAIT)) { | 132 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { |
133 | if (get_compat_timespec(&t, utime)) | 133 | if (get_compat_timespec(&t, utime)) |
134 | return -EFAULT; | 134 | return -EFAULT; |
135 | if (!timespec_valid(&t)) | 135 | if (!timespec_valid(&t)) |
136 | return -EINVAL; | 136 | return -EINVAL; |
137 | timeout = timespec_to_jiffies(&t) + 1; | 137 | if (op == FUTEX_WAIT) |
138 | timeout = timespec_to_jiffies(&t) + 1; | ||
139 | else { | ||
140 | timeout = t.tv_sec; | ||
141 | val2 = t.tv_nsec; | ||
142 | } | ||
138 | } | 143 | } |
139 | if (op >= FUTEX_REQUEUE) | 144 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) |
140 | val2 = (int) (unsigned long) utime; | 145 | val2 = (int) (unsigned long) utime; |
141 | 146 | ||
142 | return do_futex((unsigned long)uaddr, op, val, timeout, | 147 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); |
143 | (unsigned long)uaddr2, val2, val3); | ||
144 | } | 148 | } |
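
Editor's note: both sys_futex() and compat_sys_futex() now split the utime argument — FUTEX_WAIT keeps its relative, jiffies-converted timeout, while FUTEX_LOCK_PI passes tv_sec/tv_nsec through as an absolute CLOCK_REALTIME expiry for the hrtimer. A hedged caller-side sketch follows; pi_lock_timed() is a hypothetical helper.

/* Illustrative only: issuing a timed PI lock after this change. */
#define _GNU_SOURCE
#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

#define FUTEX_LOCK_PI	6	/* assumed value from the patched <linux/futex.h> */

static long pi_lock_timed(uint32_t *futex, time_t rel_sec)
{
	struct timespec deadline;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += rel_sec;	/* absolute expiry, not a relative delay */

	/* tv_sec/tv_nsec become the 'timeout'/'val2' pair in do_futex(). */
	return syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, &deadline, NULL, 0);
}
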
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 01fa2ae98a85..d17766d40dab 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = | |||
98 | 98 | ||
99 | /** | 99 | /** |
100 | * ktime_get_ts - get the monotonic clock in timespec format | 100 | * ktime_get_ts - get the monotonic clock in timespec format |
101 | * | ||
102 | * @ts: pointer to timespec variable | 101 | * @ts: pointer to timespec variable |
103 | * | 102 | * |
104 | * The function calculates the monotonic clock from the realtime | 103 | * The function calculates the monotonic clock from the realtime |
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
238 | # ifndef CONFIG_KTIME_SCALAR | 237 | # ifndef CONFIG_KTIME_SCALAR |
239 | /** | 238 | /** |
240 | * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable | 239 | * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable |
241 | * | ||
242 | * @kt: addend | 240 | * @kt: addend |
243 | * @nsec: the scalar nsec value to add | 241 | * @nsec: the scalar nsec value to add |
244 | * | 242 | * |
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
299 | 297 | ||
300 | /** | 298 | /** |
301 | * hrtimer_forward - forward the timer expiry | 299 | * hrtimer_forward - forward the timer expiry |
302 | * | ||
303 | * @timer: hrtimer to forward | 300 | * @timer: hrtimer to forward |
304 | * @now: forward past this time | 301 | * @now: forward past this time |
305 | * @interval: the interval to forward | 302 | * @interval: the interval to forward |
@@ -393,7 +390,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
393 | if (base->first == &timer->node) | 390 | if (base->first == &timer->node) |
394 | base->first = rb_next(&timer->node); | 391 | base->first = rb_next(&timer->node); |
395 | rb_erase(&timer->node, &base->active); | 392 | rb_erase(&timer->node, &base->active); |
396 | timer->node.rb_parent = HRTIMER_INACTIVE; | 393 | rb_set_parent(&timer->node, &timer->node); |
397 | } | 394 | } |
398 | 395 | ||
399 | /* | 396 | /* |
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
411 | 408 | ||
412 | /** | 409 | /** |
413 | * hrtimer_start - (re)start an relative timer on the current CPU | 410 | * hrtimer_start - (re)start an relative timer on the current CPU |
414 | * | ||
415 | * @timer: the timer to be added | 411 | * @timer: the timer to be added |
416 | * @tim: expiry time | 412 | * @tim: expiry time |
417 | * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) | 413 | * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) |
@@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); | |||
460 | 456 | ||
461 | /** | 457 | /** |
462 | * hrtimer_try_to_cancel - try to deactivate a timer | 458 | * hrtimer_try_to_cancel - try to deactivate a timer |
463 | * | ||
464 | * @timer: hrtimer to stop | 459 | * @timer: hrtimer to stop |
465 | * | 460 | * |
466 | * Returns: | 461 | * Returns: |
467 | * 0 when the timer was not active | 462 | * 0 when the timer was not active |
468 | * 1 when the timer was active | 463 | * 1 when the timer was active |
469 | * -1 when the timer is currently executing the callback function and | 464 | * -1 when the timer is currently executing the callback function and |
470 | * can not be stopped | 465 | * cannot be stopped |
471 | */ | 466 | */ |
472 | int hrtimer_try_to_cancel(struct hrtimer *timer) | 467 | int hrtimer_try_to_cancel(struct hrtimer *timer) |
473 | { | 468 | { |
@@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); | |||
489 | 484 | ||
490 | /** | 485 | /** |
491 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. | 486 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. |
492 | * | ||
493 | * @timer: the timer to be cancelled | 487 | * @timer: the timer to be cancelled |
494 | * | 488 | * |
495 | * Returns: | 489 | * Returns: |
@@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); | |||
510 | 504 | ||
511 | /** | 505 | /** |
512 | * hrtimer_get_remaining - get remaining time for the timer | 506 | * hrtimer_get_remaining - get remaining time for the timer |
513 | * | ||
514 | * @timer: the timer to read | 507 | * @timer: the timer to read |
515 | */ | 508 | */ |
516 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | 509 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) |
@@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void) | |||
564 | 557 | ||
565 | /** | 558 | /** |
566 | * hrtimer_init - initialize a timer to the given clock | 559 | * hrtimer_init - initialize a timer to the given clock |
567 | * | ||
568 | * @timer: the timer to be initialized | 560 | * @timer: the timer to be initialized |
569 | * @clock_id: the clock to be used | 561 | * @clock_id: the clock to be used |
570 | * @mode: timer mode abs/rel | 562 | * @mode: timer mode abs/rel |
@@ -576,19 +568,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
576 | 568 | ||
577 | memset(timer, 0, sizeof(struct hrtimer)); | 569 | memset(timer, 0, sizeof(struct hrtimer)); |
578 | 570 | ||
579 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | 571 | bases = __raw_get_cpu_var(hrtimer_bases); |
580 | 572 | ||
581 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) | 573 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) |
582 | clock_id = CLOCK_MONOTONIC; | 574 | clock_id = CLOCK_MONOTONIC; |
583 | 575 | ||
584 | timer->base = &bases[clock_id]; | 576 | timer->base = &bases[clock_id]; |
585 | timer->node.rb_parent = HRTIMER_INACTIVE; | 577 | rb_set_parent(&timer->node, &timer->node); |
586 | } | 578 | } |
587 | EXPORT_SYMBOL_GPL(hrtimer_init); | 579 | EXPORT_SYMBOL_GPL(hrtimer_init); |
588 | 580 | ||
589 | /** | 581 | /** |
590 | * hrtimer_get_res - get the timer resolution for a clock | 582 | * hrtimer_get_res - get the timer resolution for a clock |
591 | * | ||
592 | * @which_clock: which clock to query | 583 | * @which_clock: which clock to query |
593 | * @tp: pointer to timespec variable to store the resolution | 584 | * @tp: pointer to timespec variable to store the resolution |
594 | * | 585 | * |
@@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
599 | { | 590 | { |
600 | struct hrtimer_base *bases; | 591 | struct hrtimer_base *bases; |
601 | 592 | ||
602 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | 593 | bases = __raw_get_cpu_var(hrtimer_bases); |
603 | *tp = ktime_to_timespec(bases[which_clock].resolution); | 594 | *tp = ktime_to_timespec(bases[which_clock].resolution); |
604 | 595 | ||
605 | return 0; | 596 | return 0; |
@@ -678,7 +669,7 @@ static int hrtimer_wakeup(struct hrtimer *timer) | |||
678 | return HRTIMER_NORESTART; | 669 | return HRTIMER_NORESTART; |
679 | } | 670 | } |
680 | 671 | ||
681 | void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task) | 672 | void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) |
682 | { | 673 | { |
683 | sl->timer.function = hrtimer_wakeup; | 674 | sl->timer.function = hrtimer_wakeup; |
684 | sl->task = task; | 675 | sl->task = task; |
@@ -791,8 +782,10 @@ static void __devinit init_hrtimers_cpu(int cpu) | |||
791 | struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); | 782 | struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); |
792 | int i; | 783 | int i; |
793 | 784 | ||
794 | for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) | 785 | for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { |
795 | spin_lock_init(&base->lock); | 786 | spin_lock_init(&base->lock); |
787 | lockdep_set_class(&base->lock, &base->lock_key); | ||
788 | } | ||
796 | } | 789 | } |
797 | 790 | ||
798 | #ifdef CONFIG_HOTPLUG_CPU | 791 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -842,7 +835,7 @@ static void migrate_hrtimers(int cpu) | |||
842 | } | 835 | } |
843 | #endif /* CONFIG_HOTPLUG_CPU */ | 836 | #endif /* CONFIG_HOTPLUG_CPU */ |
844 | 837 | ||
845 | static int hrtimer_cpu_notify(struct notifier_block *self, | 838 | static int __devinit hrtimer_cpu_notify(struct notifier_block *self, |
846 | unsigned long action, void *hcpu) | 839 | unsigned long action, void *hcpu) |
847 | { | 840 | { |
848 | long cpu = (long)hcpu; | 841 | long cpu = (long)hcpu; |
@@ -866,7 +859,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self, | |||
866 | return NOTIFY_OK; | 859 | return NOTIFY_OK; |
867 | } | 860 | } |
868 | 861 | ||
869 | static struct notifier_block hrtimers_nb = { | 862 | static struct notifier_block __devinitdata hrtimers_nb = { |
870 | .notifier_call = hrtimer_cpu_notify, | 863 | .notifier_call = hrtimer_cpu_notify, |
871 | }; | 864 | }; |
872 | 865 | ||
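
Editor's note: beyond the kerneldoc cleanups, hrtimer_init_sleeper() now takes a struct task_struct * and the per-cpu bases get a lockdep class. The sleeper is what do_futex_lock_pi() hands to rt_mutex_timed_lock(); below is a rough in-kernel sketch of the sleep-until-deadline pattern, kernel context assumed, signal handling omitted — not a standalone driver.

/* Hedged sketch of the hrtimer_sleeper pattern used by the PI-futex code. */
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/errno.h>

static int sleep_until(unsigned long sec, unsigned long nsec)
{
	struct hrtimer_sleeper to;

	hrtimer_init(&to.timer, CLOCK_REALTIME, HRTIMER_ABS);
	hrtimer_init_sleeper(&to, current);	/* new struct task_struct * argument */
	to.timer.expires = ktime_set(sec, nsec);

	set_current_state(TASK_INTERRUPTIBLE);
	hrtimer_start(&to.timer, to.timer.expires, HRTIMER_ABS);
	if (to.task)
		schedule();		/* hrtimer_wakeup() clears to.task on expiry */
	hrtimer_cancel(&to.timer);
	__set_current_state(TASK_RUNNING);

	return to.task ? -EINTR : 0;	/* 0 means the timer actually fired */
}
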
diff --git a/kernel/intermodule.c b/kernel/intermodule.c deleted file mode 100644 index 55b1e5b85db9..000000000000 --- a/kernel/intermodule.c +++ /dev/null | |||
@@ -1,184 +0,0 @@ | |||
1 | /* Deprecated, do not use. Moved from module.c to here. --RR */ | ||
2 | |||
3 | /* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */ | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/kmod.h> | ||
6 | #include <linux/spinlock.h> | ||
7 | #include <linux/list.h> | ||
8 | #include <linux/slab.h> | ||
9 | |||
10 | /* inter_module functions are always available, even when the kernel is | ||
11 | * compiled without modules. Consumers of inter_module_xxx routines | ||
12 | * will always work, even when both are built into the kernel, this | ||
13 | * approach removes lots of #ifdefs in mainline code. | ||
14 | */ | ||
15 | |||
16 | static struct list_head ime_list = LIST_HEAD_INIT(ime_list); | ||
17 | static DEFINE_SPINLOCK(ime_lock); | ||
18 | static int kmalloc_failed; | ||
19 | |||
20 | struct inter_module_entry { | ||
21 | struct list_head list; | ||
22 | const char *im_name; | ||
23 | struct module *owner; | ||
24 | const void *userdata; | ||
25 | }; | ||
26 | |||
27 | /** | ||
28 | * inter_module_register - register a new set of inter module data. | ||
29 | * @im_name: an arbitrary string to identify the data, must be unique | ||
30 | * @owner: module that is registering the data, always use THIS_MODULE | ||
31 | * @userdata: pointer to arbitrary userdata to be registered | ||
32 | * | ||
33 | * Description: Check that the im_name has not already been registered, | ||
34 | * complain if it has. For new data, add it to the inter_module_entry | ||
35 | * list. | ||
36 | */ | ||
37 | void inter_module_register(const char *im_name, struct module *owner, const void *userdata) | ||
38 | { | ||
39 | struct list_head *tmp; | ||
40 | struct inter_module_entry *ime, *ime_new; | ||
41 | |||
42 | if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { | ||
43 | /* Overloaded kernel, not fatal */ | ||
44 | printk(KERN_ERR | ||
45 | "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", | ||
46 | im_name); | ||
47 | kmalloc_failed = 1; | ||
48 | return; | ||
49 | } | ||
50 | ime_new->im_name = im_name; | ||
51 | ime_new->owner = owner; | ||
52 | ime_new->userdata = userdata; | ||
53 | |||
54 | spin_lock(&ime_lock); | ||
55 | list_for_each(tmp, &ime_list) { | ||
56 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
57 | if (strcmp(ime->im_name, im_name) == 0) { | ||
58 | spin_unlock(&ime_lock); | ||
59 | kfree(ime_new); | ||
60 | /* Program logic error, fatal */ | ||
61 | printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name); | ||
62 | BUG(); | ||
63 | } | ||
64 | } | ||
65 | list_add(&(ime_new->list), &ime_list); | ||
66 | spin_unlock(&ime_lock); | ||
67 | } | ||
68 | |||
69 | /** | ||
70 | * inter_module_unregister - unregister a set of inter module data. | ||
71 | * @im_name: an arbitrary string to identify the data, must be unique | ||
72 | * | ||
73 | * Description: Check that the im_name has been registered, complain if | ||
74 | * it has not. For existing data, remove it from the | ||
75 | * inter_module_entry list. | ||
76 | */ | ||
77 | void inter_module_unregister(const char *im_name) | ||
78 | { | ||
79 | struct list_head *tmp; | ||
80 | struct inter_module_entry *ime; | ||
81 | |||
82 | spin_lock(&ime_lock); | ||
83 | list_for_each(tmp, &ime_list) { | ||
84 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
85 | if (strcmp(ime->im_name, im_name) == 0) { | ||
86 | list_del(&(ime->list)); | ||
87 | spin_unlock(&ime_lock); | ||
88 | kfree(ime); | ||
89 | return; | ||
90 | } | ||
91 | } | ||
92 | spin_unlock(&ime_lock); | ||
93 | if (kmalloc_failed) { | ||
94 | printk(KERN_ERR | ||
95 | "inter_module_unregister: no entry for '%s', " | ||
96 | "probably caused by previous kmalloc failure\n", | ||
97 | im_name); | ||
98 | return; | ||
99 | } | ||
100 | else { | ||
101 | /* Program logic error, fatal */ | ||
102 | printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name); | ||
103 | BUG(); | ||
104 | } | ||
105 | } | ||
106 | |||
107 | /** | ||
108 | * inter_module_get - return arbitrary userdata from another module. | ||
109 | * @im_name: an arbitrary string to identify the data, must be unique | ||
110 | * | ||
111 | * Description: If the im_name has not been registered, return NULL. | ||
112 | * Try to increment the use count on the owning module, if that fails | ||
113 | * then return NULL. Otherwise return the userdata. | ||
114 | */ | ||
115 | static const void *inter_module_get(const char *im_name) | ||
116 | { | ||
117 | struct list_head *tmp; | ||
118 | struct inter_module_entry *ime; | ||
119 | const void *result = NULL; | ||
120 | |||
121 | spin_lock(&ime_lock); | ||
122 | list_for_each(tmp, &ime_list) { | ||
123 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
124 | if (strcmp(ime->im_name, im_name) == 0) { | ||
125 | if (try_module_get(ime->owner)) | ||
126 | result = ime->userdata; | ||
127 | break; | ||
128 | } | ||
129 | } | ||
130 | spin_unlock(&ime_lock); | ||
131 | return(result); | ||
132 | } | ||
133 | |||
134 | /** | ||
135 | * inter_module_get_request - im get with automatic request_module. | ||
136 | * @im_name: an arbitrary string to identify the data, must be unique | ||
137 | * @modname: module that is expected to register im_name | ||
138 | * | ||
139 | * Description: If inter_module_get fails, do request_module then retry. | ||
140 | */ | ||
141 | const void *inter_module_get_request(const char *im_name, const char *modname) | ||
142 | { | ||
143 | const void *result = inter_module_get(im_name); | ||
144 | if (!result) { | ||
145 | request_module("%s", modname); | ||
146 | result = inter_module_get(im_name); | ||
147 | } | ||
148 | return(result); | ||
149 | } | ||
150 | |||
151 | /** | ||
152 | * inter_module_put - release use of data from another module. | ||
153 | * @im_name: an arbitrary string to identify the data, must be unique | ||
154 | * | ||
155 | * Description: If the im_name has not been registered, complain, | ||
156 | * otherwise decrement the use count on the owning module. | ||
157 | */ | ||
158 | void inter_module_put(const char *im_name) | ||
159 | { | ||
160 | struct list_head *tmp; | ||
161 | struct inter_module_entry *ime; | ||
162 | |||
163 | spin_lock(&ime_lock); | ||
164 | list_for_each(tmp, &ime_list) { | ||
165 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
166 | if (strcmp(ime->im_name, im_name) == 0) { | ||
167 | if (ime->owner) | ||
168 | module_put(ime->owner); | ||
169 | spin_unlock(&ime_lock); | ||
170 | return; | ||
171 | } | ||
172 | } | ||
173 | spin_unlock(&ime_lock); | ||
174 | printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name); | ||
175 | BUG(); | ||
176 | } | ||
177 | |||
178 | EXPORT_SYMBOL(inter_module_register); | ||
179 | EXPORT_SYMBOL(inter_module_unregister); | ||
180 | EXPORT_SYMBOL(inter_module_get_request); | ||
181 | EXPORT_SYMBOL(inter_module_put); | ||
182 | |||
183 | MODULE_LICENSE("GPL"); | ||
184 | |||
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 9f77f50d8143..1dab0ac3f797 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | 1 | ||
2 | obj-y := handle.o manage.o spurious.o | 2 | obj-y := handle.o manage.o spurious.o resend.o chip.o |
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
4 | obj-$(CONFIG_PROC_FS) += proc.o | 4 | obj-$(CONFIG_PROC_FS) += proc.o |
5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 3467097ca61a..533068cfb607 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
@@ -11,12 +11,14 @@ | |||
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | #include <linux/delay.h> | 12 | #include <linux/delay.h> |
13 | 13 | ||
14 | #include "internals.h" | ||
15 | |||
14 | /* | 16 | /* |
15 | * Autodetection depends on the fact that any interrupt that | 17 | * Autodetection depends on the fact that any interrupt that |
16 | * comes in on to an unassigned handler will get stuck with | 18 | * comes in on to an unassigned handler will get stuck with |
17 | * "IRQ_WAITING" cleared and the interrupt disabled. | 19 | * "IRQ_WAITING" cleared and the interrupt disabled. |
18 | */ | 20 | */ |
19 | static DECLARE_MUTEX(probe_sem); | 21 | static DEFINE_MUTEX(probing_active); |
20 | 22 | ||
21 | /** | 23 | /** |
22 | * probe_irq_on - begin an interrupt autodetect | 24 | * probe_irq_on - begin an interrupt autodetect |
@@ -27,11 +29,11 @@ static DECLARE_MUTEX(probe_sem); | |||
27 | */ | 29 | */ |
28 | unsigned long probe_irq_on(void) | 30 | unsigned long probe_irq_on(void) |
29 | { | 31 | { |
30 | unsigned long val; | 32 | struct irq_desc *desc; |
31 | irq_desc_t *desc; | 33 | unsigned long mask; |
32 | unsigned int i; | 34 | unsigned int i; |
33 | 35 | ||
34 | down(&probe_sem); | 36 | mutex_lock(&probing_active); |
35 | /* | 37 | /* |
36 | * something may have generated an irq long ago and we want to | 38 | * something may have generated an irq long ago and we want to |
37 | * flush such a longstanding irq before considering it as spurious. | 39 | * flush such a longstanding irq before considering it as spurious. |
@@ -40,8 +42,21 @@ unsigned long probe_irq_on(void) | |||
40 | desc = irq_desc + i; | 42 | desc = irq_desc + i; |
41 | 43 | ||
42 | spin_lock_irq(&desc->lock); | 44 | spin_lock_irq(&desc->lock); |
43 | if (!irq_desc[i].action) | 45 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { |
44 | irq_desc[i].handler->startup(i); | 46 | /* |
47 | * An old-style architecture might still have | ||
48 | * the handle_bad_irq handler there: | ||
49 | */ | ||
50 | compat_irq_chip_set_default_handler(desc); | ||
51 | |||
52 | /* | ||
53 | * Some chips need to know about probing in | ||
54 | * progress: | ||
55 | */ | ||
56 | if (desc->chip->set_type) | ||
57 | desc->chip->set_type(i, IRQ_TYPE_PROBE); | ||
58 | desc->chip->startup(i); | ||
59 | } | ||
45 | spin_unlock_irq(&desc->lock); | 60 | spin_unlock_irq(&desc->lock); |
46 | } | 61 | } |
47 | 62 | ||
@@ -57,9 +72,9 @@ unsigned long probe_irq_on(void) | |||
57 | desc = irq_desc + i; | 72 | desc = irq_desc + i; |
58 | 73 | ||
59 | spin_lock_irq(&desc->lock); | 74 | spin_lock_irq(&desc->lock); |
60 | if (!desc->action) { | 75 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { |
61 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; | 76 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; |
62 | if (desc->handler->startup(i)) | 77 | if (desc->chip->startup(i)) |
63 | desc->status |= IRQ_PENDING; | 78 | desc->status |= IRQ_PENDING; |
64 | } | 79 | } |
65 | spin_unlock_irq(&desc->lock); | 80 | spin_unlock_irq(&desc->lock); |
@@ -73,11 +88,11 @@ unsigned long probe_irq_on(void) | |||
73 | /* | 88 | /* |
74 | * Now filter out any obviously spurious interrupts | 89 | * Now filter out any obviously spurious interrupts |
75 | */ | 90 | */ |
76 | val = 0; | 91 | mask = 0; |
77 | for (i = 0; i < NR_IRQS; i++) { | 92 | for (i = 0; i < NR_IRQS; i++) { |
78 | irq_desc_t *desc = irq_desc + i; | ||
79 | unsigned int status; | 93 | unsigned int status; |
80 | 94 | ||
95 | desc = irq_desc + i; | ||
81 | spin_lock_irq(&desc->lock); | 96 | spin_lock_irq(&desc->lock); |
82 | status = desc->status; | 97 | status = desc->status; |
83 | 98 | ||
@@ -85,17 +100,16 @@ unsigned long probe_irq_on(void) | |||
85 | /* It triggered already - consider it spurious. */ | 100 | /* It triggered already - consider it spurious. */ |
86 | if (!(status & IRQ_WAITING)) { | 101 | if (!(status & IRQ_WAITING)) { |
87 | desc->status = status & ~IRQ_AUTODETECT; | 102 | desc->status = status & ~IRQ_AUTODETECT; |
88 | desc->handler->shutdown(i); | 103 | desc->chip->shutdown(i); |
89 | } else | 104 | } else |
90 | if (i < 32) | 105 | if (i < 32) |
91 | val |= 1 << i; | 106 | mask |= 1 << i; |
92 | } | 107 | } |
93 | spin_unlock_irq(&desc->lock); | 108 | spin_unlock_irq(&desc->lock); |
94 | } | 109 | } |
95 | 110 | ||
96 | return val; | 111 | return mask; |
97 | } | 112 | } |
98 | |||
99 | EXPORT_SYMBOL(probe_irq_on); | 113 | EXPORT_SYMBOL(probe_irq_on); |
100 | 114 | ||
101 | /** | 115 | /** |
@@ -117,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val) | |||
117 | 131 | ||
118 | mask = 0; | 132 | mask = 0; |
119 | for (i = 0; i < NR_IRQS; i++) { | 133 | for (i = 0; i < NR_IRQS; i++) { |
120 | irq_desc_t *desc = irq_desc + i; | 134 | struct irq_desc *desc = irq_desc + i; |
121 | unsigned int status; | 135 | unsigned int status; |
122 | 136 | ||
123 | spin_lock_irq(&desc->lock); | 137 | spin_lock_irq(&desc->lock); |
@@ -128,11 +142,11 @@ unsigned int probe_irq_mask(unsigned long val) | |||
128 | mask |= 1 << i; | 142 | mask |= 1 << i; |
129 | 143 | ||
130 | desc->status = status & ~IRQ_AUTODETECT; | 144 | desc->status = status & ~IRQ_AUTODETECT; |
131 | desc->handler->shutdown(i); | 145 | desc->chip->shutdown(i); |
132 | } | 146 | } |
133 | spin_unlock_irq(&desc->lock); | 147 | spin_unlock_irq(&desc->lock); |
134 | } | 148 | } |
135 | up(&probe_sem); | 149 | mutex_unlock(&probing_active); |
136 | 150 | ||
137 | return mask & val; | 151 | return mask & val; |
138 | } | 152 | } |
@@ -160,7 +174,7 @@ int probe_irq_off(unsigned long val) | |||
160 | int i, irq_found = 0, nr_irqs = 0; | 174 | int i, irq_found = 0, nr_irqs = 0; |
161 | 175 | ||
162 | for (i = 0; i < NR_IRQS; i++) { | 176 | for (i = 0; i < NR_IRQS; i++) { |
163 | irq_desc_t *desc = irq_desc + i; | 177 | struct irq_desc *desc = irq_desc + i; |
164 | unsigned int status; | 178 | unsigned int status; |
165 | 179 | ||
166 | spin_lock_irq(&desc->lock); | 180 | spin_lock_irq(&desc->lock); |
@@ -173,16 +187,16 @@ int probe_irq_off(unsigned long val) | |||
173 | nr_irqs++; | 187 | nr_irqs++; |
174 | } | 188 | } |
175 | desc->status = status & ~IRQ_AUTODETECT; | 189 | desc->status = status & ~IRQ_AUTODETECT; |
176 | desc->handler->shutdown(i); | 190 | desc->chip->shutdown(i); |
177 | } | 191 | } |
178 | spin_unlock_irq(&desc->lock); | 192 | spin_unlock_irq(&desc->lock); |
179 | } | 193 | } |
180 | up(&probe_sem); | 194 | mutex_unlock(&probing_active); |
181 | 195 | ||
182 | if (nr_irqs > 1) | 196 | if (nr_irqs > 1) |
183 | irq_found = -irq_found; | 197 | irq_found = -irq_found; |
198 | |||
184 | return irq_found; | 199 | return irq_found; |
185 | } | 200 | } |
186 | |||
187 | EXPORT_SYMBOL(probe_irq_off); | 201 | EXPORT_SYMBOL(probe_irq_off); |
188 | 202 | ||
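
Editor's note: the autoprobe rework is mostly the handler -> chip rename plus a mutex replacing the old semaphore; the driver-facing contract of probe_irq_on()/probe_irq_off() does not change. A hedged reminder of that contract, kernel context assumed — trigger_device_irq() is a placeholder for whatever makes the hardware raise its line.

/* Illustrative driver-side autoprobe sequence (not part of this patch). */
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/errno.h>

extern void trigger_device_irq(void);	/* hypothetical board-specific helper */

static int autoprobe_my_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();		/* serialized by probing_active */
	trigger_device_irq();		/* make the device assert its interrupt */
	mdelay(10);
	irq = probe_irq_off(mask);	/* >0: the IRQ, 0: none seen, <0: several fired */

	return irq > 0 ? irq : -ENODEV;
}
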
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c new file mode 100644 index 000000000000..9336f2e89e40 --- /dev/null +++ b/kernel/irq/chip.c | |||
@@ -0,0 +1,537 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/chip.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
5 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
6 | * | ||
7 | * This file contains the core interrupt handling code, for irq-chip | ||
8 | * based architectures. | ||
9 | * | ||
10 | * Detailed information is available in Documentation/DocBook/genericirq | ||
11 | */ | ||
12 | |||
13 | #include <linux/irq.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/interrupt.h> | ||
16 | #include <linux/kernel_stat.h> | ||
17 | |||
18 | #include "internals.h" | ||
19 | |||
20 | /** | ||
21 | * set_irq_chip - set the irq chip for an irq | ||
22 | * @irq: irq number | ||
23 | * @chip: pointer to irq chip description structure | ||
24 | */ | ||
25 | int set_irq_chip(unsigned int irq, struct irq_chip *chip) | ||
26 | { | ||
27 | struct irq_desc *desc; | ||
28 | unsigned long flags; | ||
29 | |||
30 | if (irq >= NR_IRQS) { | ||
31 | printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); | ||
32 | WARN_ON(1); | ||
33 | return -EINVAL; | ||
34 | } | ||
35 | |||
36 | if (!chip) | ||
37 | chip = &no_irq_chip; | ||
38 | |||
39 | desc = irq_desc + irq; | ||
40 | spin_lock_irqsave(&desc->lock, flags); | ||
41 | irq_chip_set_defaults(chip); | ||
42 | desc->chip = chip; | ||
43 | /* | ||
44 | * For compatibility only: | ||
45 | */ | ||
46 | desc->chip = chip; | ||
47 | spin_unlock_irqrestore(&desc->lock, flags); | ||
48 | |||
49 | return 0; | ||
50 | } | ||
51 | EXPORT_SYMBOL(set_irq_chip); | ||
52 | |||
53 | /** | ||
54 | * set_irq_type - set the irq type for an irq | ||
55 | * @irq: irq number | ||
56 | * @type: interrupt type - see include/linux/interrupt.h | ||
57 | */ | ||
58 | int set_irq_type(unsigned int irq, unsigned int type) | ||
59 | { | ||
60 | struct irq_desc *desc; | ||
61 | unsigned long flags; | ||
62 | int ret = -ENXIO; | ||
63 | |||
64 | if (irq >= NR_IRQS) { | ||
65 | printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); | ||
66 | return -ENODEV; | ||
67 | } | ||
68 | |||
69 | desc = irq_desc + irq; | ||
70 | if (desc->chip->set_type) { | ||
71 | spin_lock_irqsave(&desc->lock, flags); | ||
72 | ret = desc->chip->set_type(irq, type); | ||
73 | spin_unlock_irqrestore(&desc->lock, flags); | ||
74 | } | ||
75 | return ret; | ||
76 | } | ||
77 | EXPORT_SYMBOL(set_irq_type); | ||
78 | |||
79 | /** | ||
80 | * set_irq_data - set irq type data for an irq | ||
81 | * @irq: Interrupt number | ||
82 | * @data: Pointer to interrupt specific data | ||
83 | * | ||
84 | * Set the hardware irq controller data for an irq | ||
85 | */ | ||
86 | int set_irq_data(unsigned int irq, void *data) | ||
87 | { | ||
88 | struct irq_desc *desc; | ||
89 | unsigned long flags; | ||
90 | |||
91 | if (irq >= NR_IRQS) { | ||
92 | printk(KERN_ERR | ||
93 | "Trying to install controller data for IRQ%d\n", irq); | ||
94 | return -EINVAL; | ||
95 | } | ||
96 | |||
97 | desc = irq_desc + irq; | ||
98 | spin_lock_irqsave(&desc->lock, flags); | ||
99 | desc->handler_data = data; | ||
100 | spin_unlock_irqrestore(&desc->lock, flags); | ||
101 | return 0; | ||
102 | } | ||
103 | EXPORT_SYMBOL(set_irq_data); | ||
104 | |||
105 | /** | ||
106 | * set_irq_chip_data - set irq chip data for an irq | ||
107 | * @irq: Interrupt number | ||
108 | * @data: Pointer to chip specific data | ||
109 | * | ||
110 | * Set the hardware irq chip data for an irq | ||
111 | */ | ||
112 | int set_irq_chip_data(unsigned int irq, void *data) | ||
113 | { | ||
114 | struct irq_desc *desc = irq_desc + irq; | ||
115 | unsigned long flags; | ||
116 | |||
117 | if (irq >= NR_IRQS || !desc->chip) { | ||
118 | printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); | ||
119 | return -EINVAL; | ||
120 | } | ||
121 | |||
122 | spin_lock_irqsave(&desc->lock, flags); | ||
123 | desc->chip_data = data; | ||
124 | spin_unlock_irqrestore(&desc->lock, flags); | ||
125 | |||
126 | return 0; | ||
127 | } | ||
128 | EXPORT_SYMBOL(set_irq_chip_data); | ||
129 | |||
130 | /* | ||
131 | * default enable function | ||
132 | */ | ||
133 | static void default_enable(unsigned int irq) | ||
134 | { | ||
135 | struct irq_desc *desc = irq_desc + irq; | ||
136 | |||
137 | desc->chip->unmask(irq); | ||
138 | desc->status &= ~IRQ_MASKED; | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * default disable function | ||
143 | */ | ||
144 | static void default_disable(unsigned int irq) | ||
145 | { | ||
146 | struct irq_desc *desc = irq_desc + irq; | ||
147 | |||
148 | if (!(desc->status & IRQ_DELAYED_DISABLE)) | ||
149 | irq_desc[irq].chip->mask(irq); | ||
150 | } | ||
151 | |||
152 | /* | ||
153 | * default startup function | ||
154 | */ | ||
155 | static unsigned int default_startup(unsigned int irq) | ||
156 | { | ||
157 | irq_desc[irq].chip->enable(irq); | ||
158 | |||
159 | return 0; | ||
160 | } | ||
161 | |||
162 | /* | ||
163 | * Fixup enable/disable function pointers | ||
164 | */ | ||
165 | void irq_chip_set_defaults(struct irq_chip *chip) | ||
166 | { | ||
167 | if (!chip->enable) | ||
168 | chip->enable = default_enable; | ||
169 | if (!chip->disable) | ||
170 | chip->disable = default_disable; | ||
171 | if (!chip->startup) | ||
172 | chip->startup = default_startup; | ||
173 | if (!chip->shutdown) | ||
174 | chip->shutdown = chip->disable; | ||
175 | if (!chip->name) | ||
176 | chip->name = chip->typename; | ||
177 | } | ||
178 | |||
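Because irq_chip_set_defaults() backfills enable/disable/startup/shutdown from mask/unmask, a controller driver only supplies the callbacks its hardware really needs. A minimal sketch; my_ack, my_mask and my_unmask stand for hypothetical register accessors:

	static struct irq_chip my_pic_chip = {
		.name	= "my-pic",	/* shown in /proc/interrupts */
		.ack	= my_ack,
		.mask	= my_mask,
		.unmask	= my_unmask,
		/* enable/disable/startup/shutdown are filled in by irq_chip_set_defaults() */
	};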
179 | static inline void mask_ack_irq(struct irq_desc *desc, int irq) | ||
180 | { | ||
181 | if (desc->chip->mask_ack) | ||
182 | desc->chip->mask_ack(irq); | ||
183 | else { | ||
184 | desc->chip->mask(irq); | ||
185 | desc->chip->ack(irq); | ||
186 | } | ||
187 | } | ||
188 | |||
189 | /** | ||
190 | * handle_simple_irq - Simple and software-decoded IRQs. | ||
191 | * @irq: the interrupt number | ||
192 | * @desc: the interrupt description structure for this irq | ||
193 | * @regs: pointer to a register structure | ||
194 | * | ||
195 | * Simple interrupts are either sent from a demultiplexing interrupt | ||
196 | * handler or come from hardware, where no interrupt hardware control | ||
197 | * is necessary. | ||
198 | * | ||
199 | * Note: The caller is expected to handle the ack, clear, mask and | ||
200 | * unmask issues if necessary. | ||
201 | */ | ||
202 | void fastcall | ||
203 | handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
204 | { | ||
205 | struct irqaction *action; | ||
206 | irqreturn_t action_ret; | ||
207 | const unsigned int cpu = smp_processor_id(); | ||
208 | |||
209 | spin_lock(&desc->lock); | ||
210 | |||
211 | if (unlikely(desc->status & IRQ_INPROGRESS)) | ||
212 | goto out_unlock; | ||
213 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
214 | kstat_cpu(cpu).irqs[irq]++; | ||
215 | |||
216 | action = desc->action; | ||
217 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | ||
218 | goto out_unlock; | ||
219 | |||
220 | desc->status |= IRQ_INPROGRESS; | ||
221 | spin_unlock(&desc->lock); | ||
222 | |||
223 | action_ret = handle_IRQ_event(irq, regs, action); | ||
224 | if (!noirqdebug) | ||
225 | note_interrupt(irq, desc, action_ret, regs); | ||
226 | |||
227 | spin_lock(&desc->lock); | ||
228 | desc->status &= ~IRQ_INPROGRESS; | ||
229 | out_unlock: | ||
230 | spin_unlock(&desc->lock); | ||
231 | } | ||
232 | |||
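As the kernel-doc above says, handle_simple_irq() is the flow handler of choice for lines behind a demultiplexing parent interrupt. A demux sketch for a hypothetical GPIO bank (MY_GPIO_STATUS and MY_GPIO_IRQ_BASE are assumed names); each decoded child line is dispatched through its own handle_irq:

	static void my_gpio_demux(unsigned int irq, struct irq_desc *desc,
				  struct pt_regs *regs)
	{
		unsigned long pending = readl(MY_GPIO_STATUS);	/* hypothetical status register */

		while (pending) {
			unsigned int bit = __ffs(pending);
			unsigned int child_irq = MY_GPIO_IRQ_BASE + bit;
			struct irq_desc *child = irq_desc + child_irq;

			pending &= ~(1UL << bit);
			/* child lines are set up with handle_simple_irq */
			child->handle_irq(child_irq, child, regs);
		}
	}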
233 | /** | ||
234 | * handle_level_irq - Level type irq handler | ||
235 | * @irq: the interrupt number | ||
236 | * @desc: the interrupt description structure for this irq | ||
237 | * @regs: pointer to a register structure | ||
238 | * | ||
239 | * Level type interrupts are active as long as the hardware line has | ||
240 | * the active level. This may require to mask the interrupt and unmask | ||
241 | * it after the associated handler has acknowledged the device, so the | ||
242 | * interrupt line is back to inactive. | ||
243 | */ | ||
244 | void fastcall | ||
245 | handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
246 | { | ||
247 | unsigned int cpu = smp_processor_id(); | ||
248 | struct irqaction *action; | ||
249 | irqreturn_t action_ret; | ||
250 | |||
251 | spin_lock(&desc->lock); | ||
252 | mask_ack_irq(desc, irq); | ||
253 | |||
254 | if (unlikely(desc->status & IRQ_INPROGRESS)) | ||
255 | goto out; | ||
256 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
257 | kstat_cpu(cpu).irqs[irq]++; | ||
258 | |||
259 | /* | ||
260 | * If it's disabled or no action is available, | ||
261 | * keep it masked and get out of here | ||
262 | */ | ||
263 | action = desc->action; | ||
264 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | ||
265 | desc->status |= IRQ_PENDING; | ||
266 | goto out; | ||
267 | } | ||
268 | |||
269 | desc->status |= IRQ_INPROGRESS; | ||
270 | desc->status &= ~IRQ_PENDING; | ||
271 | spin_unlock(&desc->lock); | ||
272 | |||
273 | action_ret = handle_IRQ_event(irq, regs, action); | ||
274 | if (!noirqdebug) | ||
275 | note_interrupt(irq, desc, action_ret, regs); | ||
276 | |||
277 | spin_lock(&desc->lock); | ||
278 | desc->status &= ~IRQ_INPROGRESS; | ||
279 | out: | ||
280 | if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) | ||
281 | desc->chip->unmask(irq); | ||
282 | spin_unlock(&desc->lock); | ||
283 | } | ||
284 | |||
285 | /** | ||
286 | * handle_fasteoi_irq - irq handler for transparent controllers | ||
287 | * @irq: the interrupt number | ||
288 | * @desc: the interrupt description structure for this irq | ||
289 | * @regs: pointer to a register structure | ||
290 | * | ||
291 | * Only a single callback will be issued to the chip: an ->eoi() | ||
292 | * call when the interrupt has been serviced. This enables support | ||
293 | * for modern forms of interrupt handlers, which handle the flow | ||
294 | * details in hardware, transparently. | ||
295 | */ | ||
296 | void fastcall | ||
297 | handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc, | ||
298 | struct pt_regs *regs) | ||
299 | { | ||
300 | unsigned int cpu = smp_processor_id(); | ||
301 | struct irqaction *action; | ||
302 | irqreturn_t action_ret; | ||
303 | |||
304 | spin_lock(&desc->lock); | ||
305 | |||
306 | if (unlikely(desc->status & IRQ_INPROGRESS)) | ||
307 | goto out; | ||
308 | |||
309 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
310 | kstat_cpu(cpu).irqs[irq]++; | ||
311 | |||
312 | /* | ||
313 | * If it's disabled or no action is available, | ||
314 | * keep it masked and get out of here | ||
315 | */ | ||
316 | action = desc->action; | ||
317 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | ||
318 | desc->status |= IRQ_PENDING; | ||
319 | goto out; | ||
320 | } | ||
321 | |||
322 | desc->status |= IRQ_INPROGRESS; | ||
323 | desc->status &= ~IRQ_PENDING; | ||
324 | spin_unlock(&desc->lock); | ||
325 | |||
326 | action_ret = handle_IRQ_event(irq, regs, action); | ||
327 | if (!noirqdebug) | ||
328 | note_interrupt(irq, desc, action_ret, regs); | ||
329 | |||
330 | spin_lock(&desc->lock); | ||
331 | desc->status &= ~IRQ_INPROGRESS; | ||
332 | out: | ||
333 | desc->chip->eoi(irq); | ||
334 | |||
335 | spin_unlock(&desc->lock); | ||
336 | } | ||
337 | |||
338 | /** | ||
339 | * handle_edge_irq - edge type IRQ handler | ||
340 | * @irq: the interrupt number | ||
341 | * @desc: the interrupt description structure for this irq | ||
342 | * @regs: pointer to a register structure | ||
343 | * | ||
344 | * The interrupt occurs on the falling and/or rising edge of a hardware | ||
345 | * signal. The occurrence is latched into the irq controller hardware | ||
346 | * and must be acked in order to be re-enabled. After the ack another | ||
347 | * interrupt can happen on the same source even before the first one | ||
348 | * is handled by the associated event handler. If this happens it | ||
349 | * might be necessary to disable (mask) the interrupt depending on the | ||
350 | * controller hardware. This requires re-enabling the interrupt inside | ||
351 | * of the loop which handles the interrupts which have arrived while | ||
352 | * the handler was running. If all pending interrupts are handled, the | ||
353 | * loop is left. | ||
354 | */ | ||
355 | void fastcall | ||
356 | handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
357 | { | ||
358 | const unsigned int cpu = smp_processor_id(); | ||
359 | |||
360 | spin_lock(&desc->lock); | ||
361 | |||
362 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
363 | |||
364 | /* | ||
365 | * If we're currently running this IRQ, or its disabled, | ||
366 | * we shouldn't process the IRQ. Mark it pending, handle | ||
367 | * the necessary masking and go out | ||
368 | */ | ||
369 | if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || | ||
370 | !desc->action)) { | ||
371 | desc->status |= (IRQ_PENDING | IRQ_MASKED); | ||
372 | mask_ack_irq(desc, irq); | ||
373 | goto out_unlock; | ||
374 | } | ||
375 | |||
376 | kstat_cpu(cpu).irqs[irq]++; | ||
377 | |||
378 | /* Start handling the irq */ | ||
379 | desc->chip->ack(irq); | ||
380 | |||
381 | /* Mark the IRQ currently in progress.*/ | ||
382 | desc->status |= IRQ_INPROGRESS; | ||
383 | |||
384 | do { | ||
385 | struct irqaction *action = desc->action; | ||
386 | irqreturn_t action_ret; | ||
387 | |||
388 | if (unlikely(!action)) { | ||
389 | desc->chip->mask(irq); | ||
390 | goto out_unlock; | ||
391 | } | ||
392 | |||
393 | /* | ||
394 | * When another irq arrived while we were handling | ||
395 | * one, we could have masked the irq. | ||
396 | * Re-enable it, if it was not disabled in the meantime. | ||
397 | */ | ||
398 | if (unlikely((desc->status & | ||
399 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == | ||
400 | (IRQ_PENDING | IRQ_MASKED))) { | ||
401 | desc->chip->unmask(irq); | ||
402 | desc->status &= ~IRQ_MASKED; | ||
403 | } | ||
404 | |||
405 | desc->status &= ~IRQ_PENDING; | ||
406 | spin_unlock(&desc->lock); | ||
407 | action_ret = handle_IRQ_event(irq, regs, action); | ||
408 | if (!noirqdebug) | ||
409 | note_interrupt(irq, desc, action_ret, regs); | ||
410 | spin_lock(&desc->lock); | ||
411 | |||
412 | } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); | ||
413 | |||
414 | desc->status &= ~IRQ_INPROGRESS; | ||
415 | out_unlock: | ||
416 | spin_unlock(&desc->lock); | ||
417 | } | ||
418 | |||
419 | #ifdef CONFIG_SMP | ||
420 | /** | ||
421 | * handle_percpu_irq - Per CPU local irq handler | ||
422 | * @irq: the interrupt number | ||
423 | * @desc: the interrupt description structure for this irq | ||
424 | * @regs: pointer to a register structure | ||
425 | * | ||
426 | * Per CPU interrupts on SMP machines without locking requirements | ||
427 | */ | ||
428 | void fastcall | ||
429 | handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
430 | { | ||
431 | irqreturn_t action_ret; | ||
432 | |||
433 | kstat_this_cpu.irqs[irq]++; | ||
434 | |||
435 | if (desc->chip->ack) | ||
436 | desc->chip->ack(irq); | ||
437 | |||
438 | action_ret = handle_IRQ_event(irq, regs, desc->action); | ||
439 | if (!noirqdebug) | ||
440 | note_interrupt(irq, desc, action_ret, regs); | ||
441 | |||
442 | if (desc->chip->eoi) | ||
443 | desc->chip->eoi(irq); | ||
444 | } | ||
445 | |||
446 | #endif /* CONFIG_SMP */ | ||
447 | |||
448 | void | ||
449 | __set_irq_handler(unsigned int irq, | ||
450 | void fastcall (*handle)(unsigned int, irq_desc_t *, | ||
451 | struct pt_regs *), | ||
452 | int is_chained) | ||
453 | { | ||
454 | struct irq_desc *desc; | ||
455 | unsigned long flags; | ||
456 | |||
457 | if (irq >= NR_IRQS) { | ||
458 | printk(KERN_ERR | ||
459 | "Trying to install type control for IRQ%d\n", irq); | ||
460 | return; | ||
461 | } | ||
462 | |||
463 | desc = irq_desc + irq; | ||
464 | |||
465 | if (!handle) | ||
466 | handle = handle_bad_irq; | ||
467 | |||
468 | if (desc->chip == &no_irq_chip) { | ||
469 | printk(KERN_WARNING "Trying to install %sinterrupt handler " | ||
470 | "for IRQ%d\n", is_chained ? "chained " : " ", irq); | ||
471 | /* | ||
472 | * Some ARM implementations install a handler for really dumb | ||
473 | * interrupt hardware without setting an irq_chip. This worked | ||
474 | * with the ARM no_irq_chip but the check in setup_irq would | ||
475 | * prevent us from setting up the interrupt at all. Switch it to | ||
476 | * dummy_irq_chip for easy transition. | ||
477 | */ | ||
478 | desc->chip = &dummy_irq_chip; | ||
479 | } | ||
480 | |||
481 | spin_lock_irqsave(&desc->lock, flags); | ||
482 | |||
483 | /* Uninstall? */ | ||
484 | if (handle == handle_bad_irq) { | ||
485 | if (desc->chip != &no_irq_chip) { | ||
486 | desc->chip->mask(irq); | ||
487 | desc->chip->ack(irq); | ||
488 | } | ||
489 | desc->status |= IRQ_DISABLED; | ||
490 | desc->depth = 1; | ||
491 | } | ||
492 | desc->handle_irq = handle; | ||
493 | |||
494 | if (handle != handle_bad_irq && is_chained) { | ||
495 | desc->status &= ~IRQ_DISABLED; | ||
496 | desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; | ||
497 | desc->depth = 0; | ||
498 | desc->chip->unmask(irq); | ||
499 | } | ||
500 | spin_unlock_irqrestore(&desc->lock, flags); | ||
501 | } | ||
502 | |||
503 | void | ||
504 | set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, | ||
505 | void fastcall (*handle)(unsigned int, | ||
506 | struct irq_desc *, | ||
507 | struct pt_regs *)) | ||
508 | { | ||
509 | set_irq_chip(irq, chip); | ||
510 | __set_irq_handler(irq, handle, 0); | ||
511 | } | ||
512 | |||
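Platform code pairs every line with a chip and a flow handler; chained (demultiplexing) handlers additionally get IRQ_NOREQUEST | IRQ_NOPROBE via __set_irq_handler(..., 1). A board-setup sketch with assumed IRQ numbers, reusing the hypothetical my_pic_chip and my_gpio_demux from the sketches above:

	/* level-triggered peripheral line */
	set_irq_chip_and_handler(MY_IRQ_UART, &my_pic_chip, handle_level_irq);

	/* edge-triggered line: latched by the controller, replayed in a loop */
	set_irq_chip_and_handler(MY_IRQ_GPIO0, &my_pic_chip, handle_edge_irq);

	/* the GPIO bank's parent line gets the chained demux */
	set_irq_chip(MY_IRQ_GPIO_PARENT, &my_pic_chip);
	__set_irq_handler(MY_IRQ_GPIO_PARENT, my_gpio_demux, 1);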
513 | /* | ||
514 | * Get a descriptive string for the highlevel handler, for | ||
515 | * /proc/interrupts output: | ||
516 | */ | ||
517 | const char * | ||
518 | handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *, | ||
519 | struct pt_regs *)) | ||
520 | { | ||
521 | if (handle == handle_level_irq) | ||
522 | return "level "; | ||
523 | if (handle == handle_fasteoi_irq) | ||
524 | return "fasteoi"; | ||
525 | if (handle == handle_edge_irq) | ||
526 | return "edge "; | ||
527 | if (handle == handle_simple_irq) | ||
528 | return "simple "; | ||
529 | #ifdef CONFIG_SMP | ||
530 | if (handle == handle_percpu_irq) | ||
531 | return "percpu "; | ||
532 | #endif | ||
533 | if (handle == handle_bad_irq) | ||
534 | return "bad "; | ||
535 | |||
536 | return NULL; | ||
537 | } | ||
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 51df337b37db..fc4e906aedbd 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -1,9 +1,13 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/irq/handle.c | 2 | * linux/kernel/irq/handle.c |
3 | * | 3 | * |
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | 4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar |
5 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
5 | * | 6 | * |
6 | * This file contains the core interrupt handling code. | 7 | * This file contains the core interrupt handling code. |
8 | * | ||
9 | * Detailed information is available in Documentation/DocBook/genericirq | ||
10 | * | ||
7 | */ | 11 | */ |
8 | 12 | ||
9 | #include <linux/irq.h> | 13 | #include <linux/irq.h> |
@@ -14,11 +18,22 @@ | |||
14 | 18 | ||
15 | #include "internals.h" | 19 | #include "internals.h" |
16 | 20 | ||
21 | /** | ||
22 | * handle_bad_irq - handle spurious and unhandled irqs | ||
23 | */ | ||
24 | void fastcall | ||
25 | handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
26 | { | ||
27 | print_irq_desc(irq, desc); | ||
28 | kstat_this_cpu.irqs[irq]++; | ||
29 | ack_bad_irq(irq); | ||
30 | } | ||
31 | |||
17 | /* | 32 | /* |
18 | * Linux has a controller-independent interrupt architecture. | 33 | * Linux has a controller-independent interrupt architecture. |
19 | * Every controller has a 'controller-template', that is used | 34 | * Every controller has a 'controller-template', that is used |
20 | * by the main code to do the right thing. Each driver-visible | 35 | * by the main code to do the right thing. Each driver-visible |
21 | * interrupt source is transparently wired to the apropriate | 36 | * interrupt source is transparently wired to the appropriate |
22 | * controller. Thus drivers need not be aware of the | 37 | * controller. Thus drivers need not be aware of the |
23 | * interrupt-controller. | 38 | * interrupt-controller. |
24 | * | 39 | * |
@@ -28,41 +43,68 @@ | |||
28 | * | 43 | * |
29 | * Controller mappings for all interrupt sources: | 44 | * Controller mappings for all interrupt sources: |
30 | */ | 45 | */ |
31 | irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { | 46 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { |
32 | [0 ... NR_IRQS-1] = { | 47 | [0 ... NR_IRQS-1] = { |
33 | .status = IRQ_DISABLED, | 48 | .status = IRQ_DISABLED, |
34 | .handler = &no_irq_type, | 49 | .chip = &no_irq_chip, |
35 | .lock = SPIN_LOCK_UNLOCKED | 50 | .handle_irq = handle_bad_irq, |
51 | .depth = 1, | ||
52 | .lock = SPIN_LOCK_UNLOCKED, | ||
53 | #ifdef CONFIG_SMP | ||
54 | .affinity = CPU_MASK_ALL | ||
55 | #endif | ||
36 | } | 56 | } |
37 | }; | 57 | }; |
38 | 58 | ||
39 | /* | 59 | /* |
40 | * Generic 'no controller' code | 60 | * What should we do if we get a hw irq event on an illegal vector? |
61 | * Each architecture has to answer this itself. | ||
41 | */ | 62 | */ |
42 | static void end_none(unsigned int irq) { } | 63 | static void ack_bad(unsigned int irq) |
43 | static void enable_none(unsigned int irq) { } | ||
44 | static void disable_none(unsigned int irq) { } | ||
45 | static void shutdown_none(unsigned int irq) { } | ||
46 | static unsigned int startup_none(unsigned int irq) { return 0; } | ||
47 | |||
48 | static void ack_none(unsigned int irq) | ||
49 | { | 64 | { |
50 | /* | 65 | print_irq_desc(irq, irq_desc + irq); |
51 | * 'what should we do if we get a hw irq event on an illegal vector'. | ||
52 | * each architecture has to answer this themself. | ||
53 | */ | ||
54 | ack_bad_irq(irq); | 66 | ack_bad_irq(irq); |
55 | } | 67 | } |
56 | 68 | ||
57 | struct hw_interrupt_type no_irq_type = { | 69 | /* |
58 | .typename = "none", | 70 | * NOP functions |
59 | .startup = startup_none, | 71 | */ |
60 | .shutdown = shutdown_none, | 72 | static void noop(unsigned int irq) |
61 | .enable = enable_none, | 73 | { |
62 | .disable = disable_none, | 74 | } |
63 | .ack = ack_none, | 75 | |
64 | .end = end_none, | 76 | static unsigned int noop_ret(unsigned int irq) |
65 | .set_affinity = NULL | 77 | { |
78 | return 0; | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * Generic no controller implementation | ||
83 | */ | ||
84 | struct irq_chip no_irq_chip = { | ||
85 | .name = "none", | ||
86 | .startup = noop_ret, | ||
87 | .shutdown = noop, | ||
88 | .enable = noop, | ||
89 | .disable = noop, | ||
90 | .ack = ack_bad, | ||
91 | .end = noop, | ||
92 | }; | ||
93 | |||
94 | /* | ||
95 | * Generic dummy implementation which can be used for | ||
96 | * real dumb interrupt sources | ||
97 | */ | ||
98 | struct irq_chip dummy_irq_chip = { | ||
99 | .name = "dummy", | ||
100 | .startup = noop_ret, | ||
101 | .shutdown = noop, | ||
102 | .enable = noop, | ||
103 | .disable = noop, | ||
104 | .ack = noop, | ||
105 | .mask = noop, | ||
106 | .unmask = noop, | ||
107 | .end = noop, | ||
66 | }; | 108 | }; |
67 | 109 | ||
68 | /* | 110 | /* |
@@ -73,16 +115,24 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) | |||
73 | return IRQ_NONE; | 115 | return IRQ_NONE; |
74 | } | 116 | } |
75 | 117 | ||
76 | /* | 118 | /** |
77 | * Have got an event to handle: | 119 | * handle_IRQ_event - irq action chain handler |
120 | * @irq: the interrupt number | ||
121 | * @regs: pointer to a register structure | ||
122 | * @action: the interrupt action chain for this irq | ||
123 | * | ||
124 | * Handles the action chain of an irq event | ||
78 | */ | 125 | */ |
79 | fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, | 126 | irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, |
80 | struct irqaction *action) | 127 | struct irqaction *action) |
81 | { | 128 | { |
82 | int ret, retval = 0, status = 0; | 129 | irqreturn_t ret, retval = IRQ_NONE; |
130 | unsigned int status = 0; | ||
131 | |||
132 | handle_dynamic_tick(action); | ||
83 | 133 | ||
84 | if (!(action->flags & SA_INTERRUPT)) | 134 | if (!(action->flags & IRQF_DISABLED)) |
85 | local_irq_enable(); | 135 | local_irq_enable_in_hardirq(); |
86 | 136 | ||
87 | do { | 137 | do { |
88 | ret = action->handler(irq, action->dev_id, regs); | 138 | ret = action->handler(irq, action->dev_id, regs); |
@@ -92,22 +142,29 @@ fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, | |||
92 | action = action->next; | 142 | action = action->next; |
93 | } while (action); | 143 | } while (action); |
94 | 144 | ||
95 | if (status & SA_SAMPLE_RANDOM) | 145 | if (status & IRQF_SAMPLE_RANDOM) |
96 | add_interrupt_randomness(irq); | 146 | add_interrupt_randomness(irq); |
97 | local_irq_disable(); | 147 | local_irq_disable(); |
98 | 148 | ||
99 | return retval; | 149 | return retval; |
100 | } | 150 | } |
101 | 151 | ||
102 | /* | 152 | /** |
103 | * do_IRQ handles all normal device IRQ's (the special | 153 | * __do_IRQ - original all in one highlevel IRQ handler |
154 | * @irq: the interrupt number | ||
155 | * @regs: pointer to a register structure | ||
156 | * | ||
157 | * __do_IRQ handles all normal device IRQ's (the special | ||
104 | * SMP cross-CPU interrupts have their own specific | 158 | * SMP cross-CPU interrupts have their own specific |
105 | * handlers). | 159 | * handlers). |
160 | * | ||
161 | * This is the original x86 implementation which is used for every | ||
162 | * interrupt type. | ||
106 | */ | 163 | */ |
107 | fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | 164 | fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) |
108 | { | 165 | { |
109 | irq_desc_t *desc = irq_desc + irq; | 166 | struct irq_desc *desc = irq_desc + irq; |
110 | struct irqaction * action; | 167 | struct irqaction *action; |
111 | unsigned int status; | 168 | unsigned int status; |
112 | 169 | ||
113 | kstat_this_cpu.irqs[irq]++; | 170 | kstat_this_cpu.irqs[irq]++; |
@@ -117,16 +174,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | |||
117 | /* | 174 | /* |
118 | * No locking required for CPU-local interrupts: | 175 | * No locking required for CPU-local interrupts: |
119 | */ | 176 | */ |
120 | if (desc->handler->ack) | 177 | if (desc->chip->ack) |
121 | desc->handler->ack(irq); | 178 | desc->chip->ack(irq); |
122 | action_ret = handle_IRQ_event(irq, regs, desc->action); | 179 | action_ret = handle_IRQ_event(irq, regs, desc->action); |
123 | desc->handler->end(irq); | 180 | desc->chip->end(irq); |
124 | return 1; | 181 | return 1; |
125 | } | 182 | } |
126 | 183 | ||
127 | spin_lock(&desc->lock); | 184 | spin_lock(&desc->lock); |
128 | if (desc->handler->ack) | 185 | if (desc->chip->ack) |
129 | desc->handler->ack(irq); | 186 | desc->chip->ack(irq); |
130 | /* | 187 | /* |
131 | * REPLAY is when Linux resends an IRQ that was dropped earlier | 188 | * REPLAY is when Linux resends an IRQ that was dropped earlier |
132 | * WAITING is used by probe to mark irqs that are being tested | 189 | * WAITING is used by probe to mark irqs that are being tested |
@@ -186,9 +243,25 @@ out: | |||
186 | * The ->end() handler has to deal with interrupts which got | 243 | * The ->end() handler has to deal with interrupts which got |
187 | * disabled while the handler was running. | 244 | * disabled while the handler was running. |
188 | */ | 245 | */ |
189 | desc->handler->end(irq); | 246 | desc->chip->end(irq); |
190 | spin_unlock(&desc->lock); | 247 | spin_unlock(&desc->lock); |
191 | 248 | ||
192 | return 1; | 249 | return 1; |
193 | } | 250 | } |
194 | 251 | ||
252 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
253 | |||
254 | /* | ||
255 | * lockdep: we want to handle all irq_desc locks as a single lock-class: | ||
256 | */ | ||
257 | static struct lock_class_key irq_desc_lock_class; | ||
258 | |||
259 | void early_init_irq_lock_class(void) | ||
260 | { | ||
261 | int i; | ||
262 | |||
263 | for (i = 0; i < NR_IRQS; i++) | ||
264 | lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); | ||
265 | } | ||
266 | |||
267 | #endif | ||
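With every descriptor now carrying a handle_irq pointer (initialized to handle_bad_irq), a converted architecture no longer has to funnel everything through __do_IRQ(); its low-level entry code can dispatch straight to the installed flow handler. A sketch under assumed names, where get_hw_irq_number() is a hypothetical decode of the hardware vector:

	fastcall unsigned int do_IRQ(struct pt_regs *regs)
	{
		unsigned int irq = get_hw_irq_number(regs);	/* hypothetical */
		struct irq_desc *desc = irq_desc + irq;

		irq_enter();
		desc->handle_irq(irq, desc, regs);
		irq_exit();

		return 1;
	}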
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 46feba630266..08a849a22447 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -4,6 +4,12 @@ | |||
4 | 4 | ||
5 | extern int noirqdebug; | 5 | extern int noirqdebug; |
6 | 6 | ||
7 | /* Set default functions for irq_chip structures: */ | ||
8 | extern void irq_chip_set_defaults(struct irq_chip *chip); | ||
9 | |||
10 | /* Set default handler: */ | ||
11 | extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); | ||
12 | |||
7 | #ifdef CONFIG_PROC_FS | 13 | #ifdef CONFIG_PROC_FS |
8 | extern void register_irq_proc(unsigned int irq); | 14 | extern void register_irq_proc(unsigned int irq); |
9 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); | 15 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); |
@@ -16,3 +22,43 @@ static inline void unregister_handler_proc(unsigned int irq, | |||
16 | struct irqaction *action) { } | 22 | struct irqaction *action) { } |
17 | #endif | 23 | #endif |
18 | 24 | ||
25 | /* | ||
26 | * Debugging printout: | ||
27 | */ | ||
28 | |||
29 | #include <linux/kallsyms.h> | ||
30 | |||
31 | #define P(f) if (desc->status & f) printk("%14s set\n", #f) | ||
32 | |||
33 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
34 | { | ||
35 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | ||
36 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | ||
37 | printk("->handle_irq(): %p, ", desc->handle_irq); | ||
38 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | ||
39 | printk("->chip(): %p, ", desc->chip); | ||
40 | print_symbol("%s\n", (unsigned long)desc->chip); | ||
41 | printk("->action(): %p\n", desc->action); | ||
42 | if (desc->action) { | ||
43 | printk("->action->handler(): %p, ", desc->action->handler); | ||
44 | print_symbol("%s\n", (unsigned long)desc->action->handler); | ||
45 | } | ||
46 | |||
47 | P(IRQ_INPROGRESS); | ||
48 | P(IRQ_DISABLED); | ||
49 | P(IRQ_PENDING); | ||
50 | P(IRQ_REPLAY); | ||
51 | P(IRQ_AUTODETECT); | ||
52 | P(IRQ_WAITING); | ||
53 | P(IRQ_LEVEL); | ||
54 | P(IRQ_MASKED); | ||
55 | #ifdef CONFIG_IRQ_PER_CPU | ||
56 | P(IRQ_PER_CPU); | ||
57 | #endif | ||
58 | P(IRQ_NOPROBE); | ||
59 | P(IRQ_NOREQUEST); | ||
60 | P(IRQ_NOAUTOEN); | ||
61 | } | ||
62 | |||
63 | #undef P | ||
64 | |||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1279e3499534..4e461438e48b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -1,12 +1,12 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/irq/manage.c | 2 | * linux/kernel/irq/manage.c |
3 | * | 3 | * |
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | 4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar |
5 | * Copyright (C) 2005-2006 Thomas Gleixner | ||
5 | * | 6 | * |
6 | * This file contains driver APIs to the irq subsystem. | 7 | * This file contains driver APIs to the irq subsystem. |
7 | */ | 8 | */ |
8 | 9 | ||
9 | #include <linux/config.h> | ||
10 | #include <linux/irq.h> | 10 | #include <linux/irq.h> |
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/random.h> | 12 | #include <linux/random.h> |
@@ -16,12 +16,6 @@ | |||
16 | 16 | ||
17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
18 | 18 | ||
19 | cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; | ||
20 | |||
21 | #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) | ||
22 | cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; | ||
23 | #endif | ||
24 | |||
25 | /** | 19 | /** |
26 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | 20 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) |
27 | * @irq: interrupt number to wait for | 21 | * @irq: interrupt number to wait for |
@@ -42,7 +36,6 @@ void synchronize_irq(unsigned int irq) | |||
42 | while (desc->status & IRQ_INPROGRESS) | 36 | while (desc->status & IRQ_INPROGRESS) |
43 | cpu_relax(); | 37 | cpu_relax(); |
44 | } | 38 | } |
45 | |||
46 | EXPORT_SYMBOL(synchronize_irq); | 39 | EXPORT_SYMBOL(synchronize_irq); |
47 | 40 | ||
48 | #endif | 41 | #endif |
@@ -60,7 +53,7 @@ EXPORT_SYMBOL(synchronize_irq); | |||
60 | */ | 53 | */ |
61 | void disable_irq_nosync(unsigned int irq) | 54 | void disable_irq_nosync(unsigned int irq) |
62 | { | 55 | { |
63 | irq_desc_t *desc = irq_desc + irq; | 56 | struct irq_desc *desc = irq_desc + irq; |
64 | unsigned long flags; | 57 | unsigned long flags; |
65 | 58 | ||
66 | if (irq >= NR_IRQS) | 59 | if (irq >= NR_IRQS) |
@@ -69,11 +62,10 @@ void disable_irq_nosync(unsigned int irq) | |||
69 | spin_lock_irqsave(&desc->lock, flags); | 62 | spin_lock_irqsave(&desc->lock, flags); |
70 | if (!desc->depth++) { | 63 | if (!desc->depth++) { |
71 | desc->status |= IRQ_DISABLED; | 64 | desc->status |= IRQ_DISABLED; |
72 | desc->handler->disable(irq); | 65 | desc->chip->disable(irq); |
73 | } | 66 | } |
74 | spin_unlock_irqrestore(&desc->lock, flags); | 67 | spin_unlock_irqrestore(&desc->lock, flags); |
75 | } | 68 | } |
76 | |||
77 | EXPORT_SYMBOL(disable_irq_nosync); | 69 | EXPORT_SYMBOL(disable_irq_nosync); |
78 | 70 | ||
79 | /** | 71 | /** |
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(disable_irq_nosync); | |||
90 | */ | 82 | */ |
91 | void disable_irq(unsigned int irq) | 83 | void disable_irq(unsigned int irq) |
92 | { | 84 | { |
93 | irq_desc_t *desc = irq_desc + irq; | 85 | struct irq_desc *desc = irq_desc + irq; |
94 | 86 | ||
95 | if (irq >= NR_IRQS) | 87 | if (irq >= NR_IRQS) |
96 | return; | 88 | return; |
@@ -99,7 +91,6 @@ void disable_irq(unsigned int irq) | |||
99 | if (desc->action) | 91 | if (desc->action) |
100 | synchronize_irq(irq); | 92 | synchronize_irq(irq); |
101 | } | 93 | } |
102 | |||
103 | EXPORT_SYMBOL(disable_irq); | 94 | EXPORT_SYMBOL(disable_irq); |
104 | 95 | ||
105 | /** | 96 | /** |
@@ -114,7 +105,7 @@ EXPORT_SYMBOL(disable_irq); | |||
114 | */ | 105 | */ |
115 | void enable_irq(unsigned int irq) | 106 | void enable_irq(unsigned int irq) |
116 | { | 107 | { |
117 | irq_desc_t *desc = irq_desc + irq; | 108 | struct irq_desc *desc = irq_desc + irq; |
118 | unsigned long flags; | 109 | unsigned long flags; |
119 | 110 | ||
120 | if (irq >= NR_IRQS) | 111 | if (irq >= NR_IRQS) |
@@ -123,17 +114,15 @@ void enable_irq(unsigned int irq) | |||
123 | spin_lock_irqsave(&desc->lock, flags); | 114 | spin_lock_irqsave(&desc->lock, flags); |
124 | switch (desc->depth) { | 115 | switch (desc->depth) { |
125 | case 0: | 116 | case 0: |
117 | printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); | ||
126 | WARN_ON(1); | 118 | WARN_ON(1); |
127 | break; | 119 | break; |
128 | case 1: { | 120 | case 1: { |
129 | unsigned int status = desc->status & ~IRQ_DISABLED; | 121 | unsigned int status = desc->status & ~IRQ_DISABLED; |
130 | 122 | ||
131 | desc->status = status; | 123 | /* Prevent probing on this irq: */ |
132 | if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | 124 | desc->status = status | IRQ_NOPROBE; |
133 | desc->status = status | IRQ_REPLAY; | 125 | check_irq_resend(desc, irq); |
134 | hw_resend_irq(desc->handler,irq); | ||
135 | } | ||
136 | desc->handler->enable(irq); | ||
137 | /* fall-through */ | 126 | /* fall-through */ |
138 | } | 127 | } |
139 | default: | 128 | default: |
@@ -141,9 +130,29 @@ void enable_irq(unsigned int irq) | |||
141 | } | 130 | } |
142 | spin_unlock_irqrestore(&desc->lock, flags); | 131 | spin_unlock_irqrestore(&desc->lock, flags); |
143 | } | 132 | } |
144 | |||
145 | EXPORT_SYMBOL(enable_irq); | 133 | EXPORT_SYMBOL(enable_irq); |
146 | 134 | ||
135 | /** | ||
136 | * set_irq_wake - control irq power management wakeup | ||
137 | * @irq: interrupt to control | ||
138 | * @on: enable/disable power management wakeup | ||
139 | * | ||
140 | * Enable/disable power management wakeup mode | ||
141 | */ | ||
142 | int set_irq_wake(unsigned int irq, unsigned int on) | ||
143 | { | ||
144 | struct irq_desc *desc = irq_desc + irq; | ||
145 | unsigned long flags; | ||
146 | int ret = -ENXIO; | ||
147 | |||
148 | spin_lock_irqsave(&desc->lock, flags); | ||
149 | if (desc->chip->set_wake) | ||
150 | ret = desc->chip->set_wake(irq, on); | ||
151 | spin_unlock_irqrestore(&desc->lock, flags); | ||
152 | return ret; | ||
153 | } | ||
154 | EXPORT_SYMBOL(set_irq_wake); | ||
155 | |||
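set_irq_wake() simply forwards to the chip's set_wake() callback; a driver would typically call it from its suspend path. A hedged sketch (pdev and dev->irq are assumed names):

	/* suspend-path sketch: allow the hypothetical device to wake the system */
	if (device_may_wakeup(&pdev->dev))
		set_irq_wake(dev->irq, 1);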
147 | /* | 156 | /* |
148 | * Internal function that tells the architecture code whether a | 157 | * Internal function that tells the architecture code whether a |
149 | * particular irq has been exclusively allocated or is available | 158 | * particular irq has been exclusively allocated or is available |
@@ -153,22 +162,33 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) | |||
153 | { | 162 | { |
154 | struct irqaction *action; | 163 | struct irqaction *action; |
155 | 164 | ||
156 | if (irq >= NR_IRQS) | 165 | if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) |
157 | return 0; | 166 | return 0; |
158 | 167 | ||
159 | action = irq_desc[irq].action; | 168 | action = irq_desc[irq].action; |
160 | if (action) | 169 | if (action) |
161 | if (irqflags & action->flags & SA_SHIRQ) | 170 | if (irqflags & action->flags & IRQF_SHARED) |
162 | action = NULL; | 171 | action = NULL; |
163 | 172 | ||
164 | return !action; | 173 | return !action; |
165 | } | 174 | } |
166 | 175 | ||
176 | void compat_irq_chip_set_default_handler(struct irq_desc *desc) | ||
177 | { | ||
178 | /* | ||
179 | * If the architecture still has not overridden | ||
180 | * the flow handler then zap the default. This | ||
181 | * should catch incorrect flow-type setting. | ||
182 | */ | ||
183 | if (desc->handle_irq == &handle_bad_irq) | ||
184 | desc->handle_irq = NULL; | ||
185 | } | ||
186 | |||
167 | /* | 187 | /* |
168 | * Internal function to register an irqaction - typically used to | 188 | * Internal function to register an irqaction - typically used to |
169 | * allocate special interrupts that are part of the architecture. | 189 | * allocate special interrupts that are part of the architecture. |
170 | */ | 190 | */ |
171 | int setup_irq(unsigned int irq, struct irqaction * new) | 191 | int setup_irq(unsigned int irq, struct irqaction *new) |
172 | { | 192 | { |
173 | struct irq_desc *desc = irq_desc + irq; | 193 | struct irq_desc *desc = irq_desc + irq; |
174 | struct irqaction *old, **p; | 194 | struct irqaction *old, **p; |
@@ -178,14 +198,14 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
178 | if (irq >= NR_IRQS) | 198 | if (irq >= NR_IRQS) |
179 | return -EINVAL; | 199 | return -EINVAL; |
180 | 200 | ||
181 | if (desc->handler == &no_irq_type) | 201 | if (desc->chip == &no_irq_chip) |
182 | return -ENOSYS; | 202 | return -ENOSYS; |
183 | /* | 203 | /* |
184 | * Some drivers like serial.c use request_irq() heavily, | 204 | * Some drivers like serial.c use request_irq() heavily, |
185 | * so we have to be careful not to interfere with a | 205 | * so we have to be careful not to interfere with a |
186 | * running system. | 206 | * running system. |
187 | */ | 207 | */ |
188 | if (new->flags & SA_SAMPLE_RANDOM) { | 208 | if (new->flags & IRQF_SAMPLE_RANDOM) { |
189 | /* | 209 | /* |
190 | * This function might sleep, we want to call it first, | 210 | * This function might sleep, we want to call it first, |
191 | * outside of the atomic block. | 211 | * outside of the atomic block. |
@@ -200,16 +220,24 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
200 | /* | 220 | /* |
201 | * The following block of code has to be executed atomically | 221 | * The following block of code has to be executed atomically |
202 | */ | 222 | */ |
203 | spin_lock_irqsave(&desc->lock,flags); | 223 | spin_lock_irqsave(&desc->lock, flags); |
204 | p = &desc->action; | 224 | p = &desc->action; |
205 | if ((old = *p) != NULL) { | 225 | old = *p; |
206 | /* Can't share interrupts unless both agree to */ | 226 | if (old) { |
207 | if (!(old->flags & new->flags & SA_SHIRQ)) | 227 | /* |
228 | * Can't share interrupts unless both agree to and are | ||
229 | * the same type (level, edge, polarity). So both flag | ||
230 | * fields must have IRQF_SHARED set and the bits which | ||
231 | * set the trigger type must match. | ||
232 | */ | ||
233 | if (!((old->flags & new->flags) & IRQF_SHARED) || | ||
234 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) | ||
208 | goto mismatch; | 235 | goto mismatch; |
209 | 236 | ||
210 | #if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) | 237 | #if defined(CONFIG_IRQ_PER_CPU) |
211 | /* All handlers must agree on per-cpuness */ | 238 | /* All handlers must agree on per-cpuness */ |
212 | if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) | 239 | if ((old->flags & IRQF_PERCPU) != |
240 | (new->flags & IRQF_PERCPU)) | ||
213 | goto mismatch; | 241 | goto mismatch; |
214 | #endif | 242 | #endif |
215 | 243 | ||
@@ -222,20 +250,45 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
222 | } | 250 | } |
223 | 251 | ||
224 | *p = new; | 252 | *p = new; |
225 | #if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) | 253 | #if defined(CONFIG_IRQ_PER_CPU) |
226 | if (new->flags & SA_PERCPU_IRQ) | 254 | if (new->flags & IRQF_PERCPU) |
227 | desc->status |= IRQ_PER_CPU; | 255 | desc->status |= IRQ_PER_CPU; |
228 | #endif | 256 | #endif |
229 | if (!shared) { | 257 | if (!shared) { |
230 | desc->depth = 0; | 258 | irq_chip_set_defaults(desc->chip); |
231 | desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | | 259 | |
232 | IRQ_WAITING | IRQ_INPROGRESS); | 260 | /* Setup the type (level, edge polarity) if configured: */ |
233 | if (desc->handler->startup) | 261 | if (new->flags & IRQF_TRIGGER_MASK) { |
234 | desc->handler->startup(irq); | 262 | if (desc->chip && desc->chip->set_type) |
235 | else | 263 | desc->chip->set_type(irq, |
236 | desc->handler->enable(irq); | 264 | new->flags & IRQF_TRIGGER_MASK); |
265 | else | ||
266 | /* | ||
267 | * IRQF_TRIGGER_* but the PIC does not support | ||
268 | * multiple flow-types? | ||
269 | */ | ||
270 | printk(KERN_WARNING "No IRQF_TRIGGER set_type " | ||
271 | "function for IRQ %d (%s)\n", irq, | ||
272 | desc->chip ? desc->chip->name : | ||
273 | "unknown"); | ||
274 | } else | ||
275 | compat_irq_chip_set_default_handler(desc); | ||
276 | |||
277 | desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | | ||
278 | IRQ_INPROGRESS); | ||
279 | |||
280 | if (!(desc->status & IRQ_NOAUTOEN)) { | ||
281 | desc->depth = 0; | ||
282 | desc->status &= ~IRQ_DISABLED; | ||
283 | if (desc->chip->startup) | ||
284 | desc->chip->startup(irq); | ||
285 | else | ||
286 | desc->chip->enable(irq); | ||
287 | } else | ||
288 | /* Undo nested disables: */ | ||
289 | desc->depth = 1; | ||
237 | } | 290 | } |
238 | spin_unlock_irqrestore(&desc->lock,flags); | 291 | spin_unlock_irqrestore(&desc->lock, flags); |
239 | 292 | ||
240 | new->irq = irq; | 293 | new->irq = irq; |
241 | register_irq_proc(irq); | 294 | register_irq_proc(irq); |
@@ -246,8 +299,8 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
246 | 299 | ||
247 | mismatch: | 300 | mismatch: |
248 | spin_unlock_irqrestore(&desc->lock, flags); | 301 | spin_unlock_irqrestore(&desc->lock, flags); |
249 | if (!(new->flags & SA_PROBEIRQ)) { | 302 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
250 | printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); | 303 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); |
251 | dump_stack(); | 304 | dump_stack(); |
252 | } | 305 | } |
253 | return -EBUSY; | 306 | return -EBUSY; |
@@ -278,10 +331,10 @@ void free_irq(unsigned int irq, void *dev_id) | |||
278 | return; | 331 | return; |
279 | 332 | ||
280 | desc = irq_desc + irq; | 333 | desc = irq_desc + irq; |
281 | spin_lock_irqsave(&desc->lock,flags); | 334 | spin_lock_irqsave(&desc->lock, flags); |
282 | p = &desc->action; | 335 | p = &desc->action; |
283 | for (;;) { | 336 | for (;;) { |
284 | struct irqaction * action = *p; | 337 | struct irqaction *action = *p; |
285 | 338 | ||
286 | if (action) { | 339 | if (action) { |
287 | struct irqaction **pp = p; | 340 | struct irqaction **pp = p; |
@@ -295,18 +348,18 @@ void free_irq(unsigned int irq, void *dev_id) | |||
295 | 348 | ||
296 | /* Currently used only by UML, might disappear one day.*/ | 349 | /* Currently used only by UML, might disappear one day.*/ |
297 | #ifdef CONFIG_IRQ_RELEASE_METHOD | 350 | #ifdef CONFIG_IRQ_RELEASE_METHOD |
298 | if (desc->handler->release) | 351 | if (desc->chip->release) |
299 | desc->handler->release(irq, dev_id); | 352 | desc->chip->release(irq, dev_id); |
300 | #endif | 353 | #endif |
301 | 354 | ||
302 | if (!desc->action) { | 355 | if (!desc->action) { |
303 | desc->status |= IRQ_DISABLED; | 356 | desc->status |= IRQ_DISABLED; |
304 | if (desc->handler->shutdown) | 357 | if (desc->chip->shutdown) |
305 | desc->handler->shutdown(irq); | 358 | desc->chip->shutdown(irq); |
306 | else | 359 | else |
307 | desc->handler->disable(irq); | 360 | desc->chip->disable(irq); |
308 | } | 361 | } |
309 | spin_unlock_irqrestore(&desc->lock,flags); | 362 | spin_unlock_irqrestore(&desc->lock, flags); |
310 | unregister_handler_proc(irq, action); | 363 | unregister_handler_proc(irq, action); |
311 | 364 | ||
312 | /* Make sure it's not being used on another CPU */ | 365 | /* Make sure it's not being used on another CPU */ |
@@ -314,12 +367,11 @@ void free_irq(unsigned int irq, void *dev_id) | |||
314 | kfree(action); | 367 | kfree(action); |
315 | return; | 368 | return; |
316 | } | 369 | } |
317 | printk(KERN_ERR "Trying to free free IRQ%d\n",irq); | 370 | printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); |
318 | spin_unlock_irqrestore(&desc->lock,flags); | 371 | spin_unlock_irqrestore(&desc->lock, flags); |
319 | return; | 372 | return; |
320 | } | 373 | } |
321 | } | 374 | } |
322 | |||
323 | EXPORT_SYMBOL(free_irq); | 375 | EXPORT_SYMBOL(free_irq); |
324 | 376 | ||
325 | /** | 377 | /** |
@@ -346,28 +398,36 @@ EXPORT_SYMBOL(free_irq); | |||
346 | * | 398 | * |
347 | * Flags: | 399 | * Flags: |
348 | * | 400 | * |
349 | * SA_SHIRQ Interrupt is shared | 401 | * IRQF_SHARED Interrupt is shared |
350 | * SA_INTERRUPT Disable local interrupts while processing | 402 | * IRQF_DISABLED Disable local interrupts while processing |
351 | * SA_SAMPLE_RANDOM The interrupt can be used for entropy | 403 | * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy |
352 | * | 404 | * |
353 | */ | 405 | */ |
354 | int request_irq(unsigned int irq, | 406 | int request_irq(unsigned int irq, |
355 | irqreturn_t (*handler)(int, void *, struct pt_regs *), | 407 | irqreturn_t (*handler)(int, void *, struct pt_regs *), |
356 | unsigned long irqflags, const char * devname, void *dev_id) | 408 | unsigned long irqflags, const char *devname, void *dev_id) |
357 | { | 409 | { |
358 | struct irqaction * action; | 410 | struct irqaction *action; |
359 | int retval; | 411 | int retval; |
360 | 412 | ||
413 | #ifdef CONFIG_LOCKDEP | ||
414 | /* | ||
415 | * Lockdep wants atomic interrupt handlers: | ||
416 | */ | ||
417 | irqflags |= SA_INTERRUPT; | ||
418 | #endif | ||
361 | /* | 419 | /* |
362 | * Sanity-check: shared interrupts must pass in a real dev-ID, | 420 | * Sanity-check: shared interrupts must pass in a real dev-ID, |
363 | * otherwise we'll have trouble later trying to figure out | 421 | * otherwise we'll have trouble later trying to figure out |
364 | * which interrupt is which (messes up the interrupt freeing | 422 | * which interrupt is which (messes up the interrupt freeing |
365 | * logic etc). | 423 | * logic etc). |
366 | */ | 424 | */ |
367 | if ((irqflags & SA_SHIRQ) && !dev_id) | 425 | if ((irqflags & IRQF_SHARED) && !dev_id) |
368 | return -EINVAL; | 426 | return -EINVAL; |
369 | if (irq >= NR_IRQS) | 427 | if (irq >= NR_IRQS) |
370 | return -EINVAL; | 428 | return -EINVAL; |
429 | if (irq_desc[irq].status & IRQ_NOREQUEST) | ||
430 | return -EINVAL; | ||
371 | if (!handler) | 431 | if (!handler) |
372 | return -EINVAL; | 432 | return -EINVAL; |
373 | 433 | ||
@@ -390,6 +450,5 @@ int request_irq(unsigned int irq, | |||
390 | 450 | ||
391 | return retval; | 451 | return retval; |
392 | } | 452 | } |
393 | |||
394 | EXPORT_SYMBOL(request_irq); | 453 | EXPORT_SYMBOL(request_irq); |
395 | 454 | ||
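For drivers, the visible changes are the SA_*-to-IRQF_* rename and the stricter sharing rule (both sides need IRQF_SHARED and matching IRQF_TRIGGER_* bits). A request_irq() sketch; struct my_dev, my_dev_irq_pending() and my_dev_service() are hypothetical:

	static irqreturn_t my_isr(int irq, void *dev_id, struct pt_regs *regs)
	{
		struct my_dev *dev = dev_id;		/* per-device cookie */

		if (!my_dev_irq_pending(dev))		/* shared line: may not be ours */
			return IRQ_NONE;
		my_dev_service(dev);
		return IRQ_HANDLED;
	}

	static int my_dev_request(struct my_dev *dev)
	{
		/* shared + rising-edge; dev doubles as the unique dev_id for free_irq() */
		return request_irq(dev->irq, my_isr,
				   IRQF_SHARED | IRQF_TRIGGER_RISING,
				   "my-dev", dev);
	}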
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 134f9f2e0e39..a57ebe9fa6f6 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -3,19 +3,19 @@ | |||
3 | 3 | ||
4 | void set_pending_irq(unsigned int irq, cpumask_t mask) | 4 | void set_pending_irq(unsigned int irq, cpumask_t mask) |
5 | { | 5 | { |
6 | irq_desc_t *desc = irq_desc + irq; | 6 | struct irq_desc *desc = irq_desc + irq; |
7 | unsigned long flags; | 7 | unsigned long flags; |
8 | 8 | ||
9 | spin_lock_irqsave(&desc->lock, flags); | 9 | spin_lock_irqsave(&desc->lock, flags); |
10 | desc->move_irq = 1; | 10 | desc->move_irq = 1; |
11 | pending_irq_cpumask[irq] = mask; | 11 | irq_desc[irq].pending_mask = mask; |
12 | spin_unlock_irqrestore(&desc->lock, flags); | 12 | spin_unlock_irqrestore(&desc->lock, flags); |
13 | } | 13 | } |
14 | 14 | ||
15 | void move_native_irq(int irq) | 15 | void move_native_irq(int irq) |
16 | { | 16 | { |
17 | struct irq_desc *desc = irq_desc + irq; | ||
17 | cpumask_t tmp; | 18 | cpumask_t tmp; |
18 | irq_desc_t *desc = irq_descp(irq); | ||
19 | 19 | ||
20 | if (likely(!desc->move_irq)) | 20 | if (likely(!desc->move_irq)) |
21 | return; | 21 | return; |
@@ -30,15 +30,15 @@ void move_native_irq(int irq) | |||
30 | 30 | ||
31 | desc->move_irq = 0; | 31 | desc->move_irq = 0; |
32 | 32 | ||
33 | if (likely(cpus_empty(pending_irq_cpumask[irq]))) | 33 | if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) |
34 | return; | 34 | return; |
35 | 35 | ||
36 | if (!desc->handler->set_affinity) | 36 | if (!desc->chip->set_affinity) |
37 | return; | 37 | return; |
38 | 38 | ||
39 | assert_spin_locked(&desc->lock); | 39 | assert_spin_locked(&desc->lock); |
40 | 40 | ||
41 | cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); | 41 | cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * If there was a valid mask to work with, please | 44 | * If there was a valid mask to work with, please |
@@ -49,14 +49,14 @@ void move_native_irq(int irq) | |||
49 | * cause some ioapics to mal-function. | 49 | * cause some ioapics to mal-function. |
50 | * Being paranoid i guess! | 50 | * Being paranoid i guess! |
51 | */ | 51 | */ |
52 | if (unlikely(!cpus_empty(tmp))) { | 52 | if (likely(!cpus_empty(tmp))) { |
53 | if (likely(!(desc->status & IRQ_DISABLED))) | 53 | if (likely(!(desc->status & IRQ_DISABLED))) |
54 | desc->handler->disable(irq); | 54 | desc->chip->disable(irq); |
55 | 55 | ||
56 | desc->handler->set_affinity(irq,tmp); | 56 | desc->chip->set_affinity(irq,tmp); |
57 | 57 | ||
58 | if (likely(!(desc->status & IRQ_DISABLED))) | 58 | if (likely(!(desc->status & IRQ_DISABLED))) |
59 | desc->handler->enable(irq); | 59 | desc->chip->enable(irq); |
60 | } | 60 | } |
61 | cpus_clear(pending_irq_cpumask[irq]); | 61 | cpus_clear(irq_desc[irq].pending_mask); |
62 | } | 62 | } |
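move_native_irq() applies a queued affinity change only from a context that already holds desc->lock and can safely reprogram the line, which in practice is the architecture's ack path for edge interrupts. A sketch with hypothetical names:

	static void ack_my_apic_irq(unsigned int irq)
	{
		move_native_irq(irq);	/* honour a mask queued by set_pending_irq() */
		my_apic_ack(irq);	/* hypothetical hardware acknowledge */
	}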
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d03b5eef8ce0..607c7809ad01 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -12,18 +12,15 @@ | |||
12 | 12 | ||
13 | #include "internals.h" | 13 | #include "internals.h" |
14 | 14 | ||
15 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | 15 | static struct proc_dir_entry *root_irq_dir; |
16 | 16 | ||
17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
18 | 18 | ||
19 | /* | ||
20 | * The /proc/irq/<irq>/smp_affinity values: | ||
21 | */ | ||
22 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; | ||
23 | |||
24 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 19 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
25 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 20 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
26 | { | 21 | { |
22 | set_balance_irq_affinity(irq, mask_val); | ||
23 | |||
27 | /* | 24 | /* |
28 | * Save these away for later use. Re-program when the | 25 | * Save these away for later use. Re-program when the |
29 | * interrupt is pending | 26 | * interrupt is pending |
@@ -33,15 +30,16 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | |||
33 | #else | 30 | #else |
34 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 31 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
35 | { | 32 | { |
36 | irq_affinity[irq] = mask_val; | 33 | set_balance_irq_affinity(irq, mask_val); |
37 | irq_desc[irq].handler->set_affinity(irq, mask_val); | 34 | irq_desc[irq].affinity = mask_val; |
35 | irq_desc[irq].chip->set_affinity(irq, mask_val); | ||
38 | } | 36 | } |
39 | #endif | 37 | #endif |
40 | 38 | ||
41 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 39 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
42 | int count, int *eof, void *data) | 40 | int count, int *eof, void *data) |
43 | { | 41 | { |
44 | int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); | 42 | int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); |
45 | 43 | ||
46 | if (count - len < 2) | 44 | if (count - len < 2) |
47 | return -EINVAL; | 45 | return -EINVAL; |
@@ -56,7 +54,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
56 | unsigned int irq = (int)(long)data, full_count = count, err; | 54 | unsigned int irq = (int)(long)data, full_count = count, err; |
57 | cpumask_t new_value, tmp; | 55 | cpumask_t new_value, tmp; |
58 | 56 | ||
59 | if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) | 57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) |
60 | return -EIO; | 58 | return -EIO; |
61 | 59 | ||
62 | err = cpumask_parse(buffer, count, new_value); | 60 | err = cpumask_parse(buffer, count, new_value); |
@@ -99,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
99 | { | 97 | { |
100 | char name [MAX_NAMELEN]; | 98 | char name [MAX_NAMELEN]; |
101 | 99 | ||
102 | if (!irq_dir[irq] || action->dir || !action->name || | 100 | if (!irq_desc[irq].dir || action->dir || !action->name || |
103 | !name_unique(irq, action)) | 101 | !name_unique(irq, action)) |
104 | return; | 102 | return; |
105 | 103 | ||
@@ -107,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
107 | snprintf(name, MAX_NAMELEN, "%s", action->name); | 105 | snprintf(name, MAX_NAMELEN, "%s", action->name); |
108 | 106 | ||
109 | /* create /proc/irq/1234/handler/ */ | 107 | /* create /proc/irq/1234/handler/ */ |
110 | action->dir = proc_mkdir(name, irq_dir[irq]); | 108 | action->dir = proc_mkdir(name, irq_desc[irq].dir); |
111 | } | 109 | } |
112 | 110 | ||
113 | #undef MAX_NAMELEN | 111 | #undef MAX_NAMELEN |
@@ -119,22 +117,22 @@ void register_irq_proc(unsigned int irq) | |||
119 | char name [MAX_NAMELEN]; | 117 | char name [MAX_NAMELEN]; |
120 | 118 | ||
121 | if (!root_irq_dir || | 119 | if (!root_irq_dir || |
122 | (irq_desc[irq].handler == &no_irq_type) || | 120 | (irq_desc[irq].chip == &no_irq_chip) || |
123 | irq_dir[irq]) | 121 | irq_desc[irq].dir) |
124 | return; | 122 | return; |
125 | 123 | ||
126 | memset(name, 0, MAX_NAMELEN); | 124 | memset(name, 0, MAX_NAMELEN); |
127 | sprintf(name, "%d", irq); | 125 | sprintf(name, "%d", irq); |
128 | 126 | ||
129 | /* create /proc/irq/1234 */ | 127 | /* create /proc/irq/1234 */ |
130 | irq_dir[irq] = proc_mkdir(name, root_irq_dir); | 128 | irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); |
131 | 129 | ||
132 | #ifdef CONFIG_SMP | 130 | #ifdef CONFIG_SMP |
133 | { | 131 | { |
134 | struct proc_dir_entry *entry; | 132 | struct proc_dir_entry *entry; |
135 | 133 | ||
136 | /* create /proc/irq/<irq>/smp_affinity */ | 134 | /* create /proc/irq/<irq>/smp_affinity */ |
137 | entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); | 135 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); |
138 | 136 | ||
139 | if (entry) { | 137 | if (entry) { |
140 | entry->nlink = 1; | 138 | entry->nlink = 1; |
@@ -142,7 +140,6 @@ void register_irq_proc(unsigned int irq) | |||
142 | entry->read_proc = irq_affinity_read_proc; | 140 | entry->read_proc = irq_affinity_read_proc; |
143 | entry->write_proc = irq_affinity_write_proc; | 141 | entry->write_proc = irq_affinity_write_proc; |
144 | } | 142 | } |
145 | smp_affinity_entry[irq] = entry; | ||
146 | } | 143 | } |
147 | #endif | 144 | #endif |
148 | } | 145 | } |
@@ -152,7 +149,7 @@ void register_irq_proc(unsigned int irq) | |||
152 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) | 149 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) |
153 | { | 150 | { |
154 | if (action->dir) | 151 | if (action->dir) |
155 | remove_proc_entry(action->dir->name, irq_dir[irq]); | 152 | remove_proc_entry(action->dir->name, irq_desc[irq].dir); |
156 | } | 153 | } |
157 | 154 | ||
158 | void init_irq_proc(void) | 155 | void init_irq_proc(void) |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c new file mode 100644 index 000000000000..872f91ba2ce8 --- /dev/null +++ b/kernel/irq/resend.c | |||
@@ -0,0 +1,78 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/resend.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
5 | * Copyright (C) 2005-2006, Thomas Gleixner | ||
6 | * | ||
7 | * This file contains the IRQ-resend code | ||
8 | * | ||
9 | * If the interrupt is waiting to be processed, we try to re-run it. | ||
10 | * We can't directly run it from here since the caller might be in an | ||
11 | * interrupt-protected region. Not all irq controller chips can | ||
12 | * retrigger interrupts at the hardware level, so in those cases | ||
13 | * we allow the resending of IRQs via a tasklet. | ||
14 | */ | ||
15 | |||
16 | #include <linux/irq.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/random.h> | ||
19 | #include <linux/interrupt.h> | ||
20 | |||
21 | #include "internals.h" | ||
22 | |||
23 | #ifdef CONFIG_HARDIRQS_SW_RESEND | ||
24 | |||
25 | /* Bitmap to handle software resend of interrupts: */ | ||
26 | static DECLARE_BITMAP(irqs_resend, NR_IRQS); | ||
27 | |||
28 | /* | ||
29 | * Run software resends of IRQ's | ||
30 | */ | ||
31 | static void resend_irqs(unsigned long arg) | ||
32 | { | ||
33 | struct irq_desc *desc; | ||
34 | int irq; | ||
35 | |||
36 | while (!bitmap_empty(irqs_resend, NR_IRQS)) { | ||
37 | irq = find_first_bit(irqs_resend, NR_IRQS); | ||
38 | clear_bit(irq, irqs_resend); | ||
39 | desc = irq_desc + irq; | ||
40 | local_irq_disable(); | ||
41 | desc->handle_irq(irq, desc, NULL); | ||
42 | local_irq_enable(); | ||
43 | } | ||
44 | } | ||
45 | |||
46 | /* Tasklet to handle resend: */ | ||
47 | static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); | ||
48 | |||
49 | #endif | ||
50 | |||
51 | /* | ||
52 | * IRQ resend | ||
53 | * | ||
54 | * Is called with interrupts disabled and desc->lock held. | ||
55 | */ | ||
56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) | ||
57 | { | ||
58 | unsigned int status = desc->status; | ||
59 | |||
60 | /* | ||
61 | * Make sure the interrupt is enabled, before resending it: | ||
62 | */ | ||
63 | desc->chip->enable(irq); | ||
64 | |||
65 | if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | ||
66 | desc->status &= ~IRQ_PENDING; | ||
67 | desc->status = status | IRQ_REPLAY; | ||
68 | |||
69 | if (!desc->chip || !desc->chip->retrigger || | ||
70 | !desc->chip->retrigger(irq)) { | ||
71 | #ifdef CONFIG_HARDIRQS_SW_RESEND | ||
72 | /* Set it pending and activate the softirq: */ | ||
73 | set_bit(irq, irqs_resend); | ||
74 | tasklet_schedule(&resend_tasklet); | ||
75 | #endif | ||
76 | } | ||
77 | } | ||
78 | } | ||
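
The header comment of this new file describes the software fallback: when the interrupt controller cannot retrigger a line in hardware, check_irq_resend() records the line in the irqs_resend bitmap and lets a tasklet replay it later from a context where that is safe. A minimal standalone C sketch of that mark-then-drain pattern follows; every name in it is illustrative, not kernel API.

/* Mark-then-drain: producers set a bit per source that needs replaying,
 * a single deferred worker drains the bitmap, mirroring check_irq_resend()
 * and resend_irqs() above. */
#include <stdio.h>

#define NR_SRC 32
static unsigned long pending;                 /* stands in for irqs_resend */

static void mark_pending(int src)             /* check_irq_resend() analogue */
{
    pending |= 1UL << src;
}

static void drain_pending(void (*handle)(int)) /* resend_irqs() analogue */
{
    while (pending) {
        int src = __builtin_ctzl(pending);    /* find_first_bit() analogue */
        pending &= ~(1UL << src);             /* clear_bit() analogue */
        handle(src);                          /* desc->handle_irq() analogue */
    }
}

static void handler(int src) { printf("replayed source %d\n", src); }

int main(void)
{
    mark_pending(3);
    mark_pending(17);
    drain_pending(handler);
    return 0;
}

The real code adds one twist the sketch omits: draining happens with interrupts disabled around each handler call, because the replay must look like a normal hardirq to the handler.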
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7df9abd5ec86..417e98092cf2 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -11,44 +11,44 @@ | |||
11 | #include <linux/kallsyms.h> | 11 | #include <linux/kallsyms.h> |
12 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
13 | 13 | ||
14 | static int irqfixup; | 14 | static int irqfixup __read_mostly; |
15 | 15 | ||
16 | /* | 16 | /* |
17 | * Recovery handler for misrouted interrupts. | 17 | * Recovery handler for misrouted interrupts. |
18 | */ | 18 | */ |
19 | |||
20 | static int misrouted_irq(int irq, struct pt_regs *regs) | 19 | static int misrouted_irq(int irq, struct pt_regs *regs) |
21 | { | 20 | { |
22 | int i; | 21 | int i; |
23 | irq_desc_t *desc; | ||
24 | int ok = 0; | 22 | int ok = 0; |
25 | int work = 0; /* Did we do work for a real IRQ */ | 23 | int work = 0; /* Did we do work for a real IRQ */ |
26 | 24 | ||
27 | for(i = 1; i < NR_IRQS; i++) { | 25 | for (i = 1; i < NR_IRQS; i++) { |
26 | struct irq_desc *desc = irq_desc + i; | ||
28 | struct irqaction *action; | 27 | struct irqaction *action; |
29 | 28 | ||
30 | if (i == irq) /* Already tried */ | 29 | if (i == irq) /* Already tried */ |
31 | continue; | 30 | continue; |
32 | desc = &irq_desc[i]; | 31 | |
33 | spin_lock(&desc->lock); | 32 | spin_lock(&desc->lock); |
34 | action = desc->action; | ||
35 | /* Already running on another processor */ | 33 | /* Already running on another processor */ |
36 | if (desc->status & IRQ_INPROGRESS) { | 34 | if (desc->status & IRQ_INPROGRESS) { |
37 | /* | 35 | /* |
38 | * Already running: If it is shared get the other | 36 | * Already running: If it is shared get the other |
39 | * CPU to go looking for our mystery interrupt too | 37 | * CPU to go looking for our mystery interrupt too |
40 | */ | 38 | */ |
41 | if (desc->action && (desc->action->flags & SA_SHIRQ)) | 39 | if (desc->action && (desc->action->flags & IRQF_SHARED)) |
42 | desc->status |= IRQ_PENDING; | 40 | desc->status |= IRQ_PENDING; |
43 | spin_unlock(&desc->lock); | 41 | spin_unlock(&desc->lock); |
44 | continue; | 42 | continue; |
45 | } | 43 | } |
46 | /* Honour the normal IRQ locking */ | 44 | /* Honour the normal IRQ locking */ |
47 | desc->status |= IRQ_INPROGRESS; | 45 | desc->status |= IRQ_INPROGRESS; |
46 | action = desc->action; | ||
48 | spin_unlock(&desc->lock); | 47 | spin_unlock(&desc->lock); |
48 | |||
49 | while (action) { | 49 | while (action) { |
50 | /* Only shared IRQ handlers are safe to call */ | 50 | /* Only shared IRQ handlers are safe to call */ |
51 | if (action->flags & SA_SHIRQ) { | 51 | if (action->flags & IRQF_SHARED) { |
52 | if (action->handler(i, action->dev_id, regs) == | 52 | if (action->handler(i, action->dev_id, regs) == |
53 | IRQ_HANDLED) | 53 | IRQ_HANDLED) |
54 | ok = 1; | 54 | ok = 1; |
@@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) | |||
62 | 62 | ||
63 | /* | 63 | /* |
64 | * While we were looking for a fixup someone queued a real | 64 | * While we were looking for a fixup someone queued a real |
65 | * IRQ clashing with our walk | 65 | * IRQ clashing with our walk: |
66 | */ | 66 | */ |
67 | |||
68 | while ((desc->status & IRQ_PENDING) && action) { | 67 | while ((desc->status & IRQ_PENDING) && action) { |
69 | /* | 68 | /* |
70 | * Perform real IRQ processing for the IRQ we deferred | 69 | * Perform real IRQ processing for the IRQ we deferred |
@@ -80,8 +79,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) | |||
80 | * If we did actual work for the real IRQ line we must let the | 79 | * If we did actual work for the real IRQ line we must let the |
81 | * IRQ controller clean up too | 80 | * IRQ controller clean up too |
82 | */ | 81 | */ |
83 | if(work) | 82 | if (work && desc->chip && desc->chip->end) |
84 | desc->handler->end(i); | 83 | desc->chip->end(i); |
85 | spin_unlock(&desc->lock); | 84 | spin_unlock(&desc->lock); |
86 | } | 85 | } |
87 | /* So the caller can adjust the irq error counts */ | 86 | /* So the caller can adjust the irq error counts */ |
@@ -100,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) | |||
100 | */ | 99 | */ |
101 | 100 | ||
102 | static void | 101 | static void |
103 | __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | 102 | __report_bad_irq(unsigned int irq, struct irq_desc *desc, |
103 | irqreturn_t action_ret) | ||
104 | { | 104 | { |
105 | struct irqaction *action; | 105 | struct irqaction *action; |
106 | 106 | ||
@@ -113,6 +113,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
113 | } | 113 | } |
114 | dump_stack(); | 114 | dump_stack(); |
115 | printk(KERN_ERR "handlers:\n"); | 115 | printk(KERN_ERR "handlers:\n"); |
116 | |||
116 | action = desc->action; | 117 | action = desc->action; |
117 | while (action) { | 118 | while (action) { |
118 | printk(KERN_ERR "[<%p>]", action->handler); | 119 | printk(KERN_ERR "[<%p>]", action->handler); |
@@ -123,7 +124,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
123 | } | 124 | } |
124 | } | 125 | } |
125 | 126 | ||
126 | static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | 127 | static void |
128 | report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) | ||
127 | { | 129 | { |
128 | static int count = 100; | 130 | static int count = 100; |
129 | 131 | ||
@@ -133,12 +135,12 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio | |||
133 | } | 135 | } |
134 | } | 136 | } |
135 | 137 | ||
136 | void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, | 138 | void note_interrupt(unsigned int irq, struct irq_desc *desc, |
137 | struct pt_regs *regs) | 139 | irqreturn_t action_ret, struct pt_regs *regs) |
138 | { | 140 | { |
139 | if (action_ret != IRQ_HANDLED) { | 141 | if (unlikely(action_ret != IRQ_HANDLED)) { |
140 | desc->irqs_unhandled++; | 142 | desc->irqs_unhandled++; |
141 | if (action_ret != IRQ_NONE) | 143 | if (unlikely(action_ret != IRQ_NONE)) |
142 | report_bad_irq(irq, desc, action_ret); | 144 | report_bad_irq(irq, desc, action_ret); |
143 | } | 145 | } |
144 | 146 | ||
@@ -152,11 +154,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, | |||
152 | } | 154 | } |
153 | 155 | ||
154 | desc->irq_count++; | 156 | desc->irq_count++; |
155 | if (desc->irq_count < 100000) | 157 | if (likely(desc->irq_count < 100000)) |
156 | return; | 158 | return; |
157 | 159 | ||
158 | desc->irq_count = 0; | 160 | desc->irq_count = 0; |
159 | if (desc->irqs_unhandled > 99900) { | 161 | if (unlikely(desc->irqs_unhandled > 99900)) { |
160 | /* | 162 | /* |
161 | * The interrupt is stuck | 163 | * The interrupt is stuck |
162 | */ | 164 | */ |
@@ -166,17 +168,19 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, | |||
166 | */ | 168 | */ |
167 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); | 169 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); |
168 | desc->status |= IRQ_DISABLED; | 170 | desc->status |= IRQ_DISABLED; |
169 | desc->handler->disable(irq); | 171 | desc->depth = 1; |
172 | desc->chip->disable(irq); | ||
170 | } | 173 | } |
171 | desc->irqs_unhandled = 0; | 174 | desc->irqs_unhandled = 0; |
172 | } | 175 | } |
173 | 176 | ||
174 | int noirqdebug; | 177 | int noirqdebug __read_mostly; |
175 | 178 | ||
176 | int __init noirqdebug_setup(char *str) | 179 | int __init noirqdebug_setup(char *str) |
177 | { | 180 | { |
178 | noirqdebug = 1; | 181 | noirqdebug = 1; |
179 | printk(KERN_INFO "IRQ lockup detection disabled\n"); | 182 | printk(KERN_INFO "IRQ lockup detection disabled\n"); |
183 | |||
180 | return 1; | 184 | return 1; |
181 | } | 185 | } |
182 | 186 | ||
@@ -187,6 +191,7 @@ static int __init irqfixup_setup(char *str) | |||
187 | irqfixup = 1; | 191 | irqfixup = 1; |
188 | printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); | 192 | printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); |
189 | printk(KERN_WARNING "This may impact system performance.\n"); | 193 | printk(KERN_WARNING "This may impact system performance.\n"); |
194 | |||
190 | return 1; | 195 | return 1; |
191 | } | 196 | } |
192 | 197 | ||
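
note_interrupt() above is a sampling heuristic: each line counts every interrupt it sees, and once 100000 have arrived the line is disabled if more than 99900 of them went unhandled. A standalone sketch of just that counting logic; the thresholds are copied from the code above, the rest is illustrative.

/* Nuisance-IRQ detection: disable a line when almost every event in a
 * 100000-event window went unclaimed. */
#include <stdbool.h>
#include <stdio.h>

struct line_stats { unsigned count, unhandled; bool disabled; };

static void note_event(struct line_stats *s, bool handled)
{
    if (!handled)
        s->unhandled++;
    if (++s->count < 100000)
        return;                      /* window not full yet */
    s->count = 0;
    if (s->unhandled > 99900) {
        s->disabled = true;          /* IRQ_DISABLED + chip->disable(irq) */
        printf("disabling stuck line\n");
    }
    s->unhandled = 0;
}

int main(void)
{
    struct line_stats s = { 0 };
    for (int i = 0; i < 200000; i++)
        note_event(&s, false);       /* nothing ever claims the interrupt */
    return s.disabled ? 0 : 1;
}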
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 39277dd6bf90..ab16a5a4cfe9 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter) | |||
275 | static int get_ksymbol_mod(struct kallsym_iter *iter) | 275 | static int get_ksymbol_mod(struct kallsym_iter *iter) |
276 | { | 276 | { |
277 | iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, | 277 | iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, |
278 | &iter->value, | 278 | &iter->value, &iter->type, |
279 | &iter->type, iter->name); | 279 | iter->name, sizeof(iter->name)); |
280 | if (iter->owner == NULL) | 280 | if (iter->owner == NULL) |
281 | return 0; | 281 | return 0; |
282 | 282 | ||
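
The extra argument to module_get_kallsym() passes the size of iter->name down so the callee can bound its copy rather than assuming the caller's buffer is large enough. A small user-space sketch of the same "pass the destination size" convention, using snprintf for the bounded copy (the kernel side would typically use strlcpy):

#include <stdio.h>

static void get_symbol_name(char *dst, size_t dstlen, const char *src)
{
    /* never writes more than dstlen bytes and always NUL-terminates */
    snprintf(dst, dstlen, "%s", src);
}

int main(void)
{
    char name[8];
    get_symbol_name(name, sizeof(name), "a_very_long_symbol_name");
    printf("%s\n", name);            /* prints the truncated "a_very_" */
    return 0;
}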
diff --git a/kernel/kexec.c b/kernel/kexec.c index bf39d28e4c0e..50087ecf337e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image, | |||
902 | * kexec does not sync, or unmount filesystems so if you need | 902 | * kexec does not sync, or unmount filesystems so if you need |
903 | * that to happen you need to do that yourself. | 903 | * that to happen you need to do that yourself. |
904 | */ | 904 | */ |
905 | struct kimage *kexec_image = NULL; | 905 | struct kimage *kexec_image; |
906 | static struct kimage *kexec_crash_image = NULL; | 906 | struct kimage *kexec_crash_image; |
907 | /* | 907 | /* |
908 | * A home grown binary mutex. | 908 | * A home grown binary mutex. |
909 | * Nothing can wait so this mutex is safe to use | 909 | * Nothing can wait so this mutex is safe to use |
910 | * in interrupt context :) | 910 | * in interrupt context :) |
911 | */ | 911 | */ |
912 | static int kexec_lock = 0; | 912 | static int kexec_lock; |
913 | 913 | ||
914 | asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, | 914 | asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, |
915 | struct kexec_segment __user *segments, | 915 | struct kexec_segment __user *segments, |
@@ -1042,7 +1042,6 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, | |||
1042 | 1042 | ||
1043 | void crash_kexec(struct pt_regs *regs) | 1043 | void crash_kexec(struct pt_regs *regs) |
1044 | { | 1044 | { |
1045 | struct kimage *image; | ||
1046 | int locked; | 1045 | int locked; |
1047 | 1046 | ||
1048 | 1047 | ||
@@ -1056,12 +1055,11 @@ void crash_kexec(struct pt_regs *regs) | |||
1056 | */ | 1055 | */ |
1057 | locked = xchg(&kexec_lock, 1); | 1056 | locked = xchg(&kexec_lock, 1); |
1058 | if (!locked) { | 1057 | if (!locked) { |
1059 | image = xchg(&kexec_crash_image, NULL); | 1058 | if (kexec_crash_image) { |
1060 | if (image) { | ||
1061 | struct pt_regs fixed_regs; | 1059 | struct pt_regs fixed_regs; |
1062 | crash_setup_regs(&fixed_regs, regs); | 1060 | crash_setup_regs(&fixed_regs, regs); |
1063 | machine_crash_shutdown(&fixed_regs); | 1061 | machine_crash_shutdown(&fixed_regs); |
1064 | machine_kexec(image); | 1062 | machine_kexec(kexec_crash_image); |
1065 | } | 1063 | } |
1066 | xchg(&kexec_lock, 0); | 1064 | xchg(&kexec_lock, 0); |
1067 | } | 1065 | } |
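
crash_kexec() keeps relying on the xchg()-based "home grown binary mutex" described in the comment above: the first caller to swap kexec_lock from 0 to 1 performs the shutdown, every other caller simply returns, and nothing ever sleeps, so the lock is usable from any context, including a panic path. A user-space sketch of that try-lock shape, with a GCC atomic builtin standing in for xchg():

#include <stdio.h>

static int crash_lock;                 /* 0 = free, 1 = taken */

static void do_crash_shutdown(void)
{
    if (__atomic_exchange_n(&crash_lock, 1, __ATOMIC_ACQ_REL) == 0) {
        /* we won the race: only one caller performs the shutdown */
        printf("performing crash shutdown\n");
        __atomic_exchange_n(&crash_lock, 0, __ATOMIC_RELEASE);
    }
    /* losers simply return; they never wait */
}

int main(void)
{
    do_crash_shutdown();
    return 0;
}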
diff --git a/kernel/kmod.c b/kernel/kmod.c index 20a997c73c3d..1d32defa38ab 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -20,7 +20,6 @@ | |||
20 | */ | 20 | */ |
21 | #define __KERNEL_SYSCALLS__ | 21 | #define __KERNEL_SYSCALLS__ |
22 | 22 | ||
23 | #include <linux/config.h> | ||
24 | #include <linux/module.h> | 23 | #include <linux/module.h> |
25 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
26 | #include <linux/syscalls.h> | 25 | #include <linux/syscalls.h> |
@@ -234,7 +233,7 @@ static void __call_usermodehelper(void *data) | |||
234 | int call_usermodehelper_keys(char *path, char **argv, char **envp, | 233 | int call_usermodehelper_keys(char *path, char **argv, char **envp, |
235 | struct key *session_keyring, int wait) | 234 | struct key *session_keyring, int wait) |
236 | { | 235 | { |
237 | DECLARE_COMPLETION(done); | 236 | DECLARE_COMPLETION_ONSTACK(done); |
238 | struct subprocess_info sub_info = { | 237 | struct subprocess_info sub_info = { |
239 | .complete = &done, | 238 | .complete = &done, |
240 | .path = path, | 239 | .path = path, |
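
Switching to DECLARE_COMPLETION_ONSTACK is what its name suggests: the completion lives in the caller's stack frame and is declared with the on-stack initializer so the debugging infrastructure in this series can treat it correctly. A hedged kernel-style sketch of the usual on-stack completion pattern, assuming the standard <linux/completion.h> and <linux/kthread.h> APIs of this kernel:

#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/err.h>

static int worker(void *data)
{
    struct completion *done = data;

    /* ... perform the work ... */
    complete(done);                      /* wake the waiter below */
    return 0;
}

static int run_and_wait(void)
{
    DECLARE_COMPLETION_ONSTACK(done);    /* lives in this stack frame */
    struct task_struct *tsk;

    tsk = kthread_run(worker, &done, "sketch-worker");
    if (IS_ERR(tsk))
        return PTR_ERR(tsk);

    wait_for_completion(&done);          /* frame outlives the worker's use */
    return 0;
}

Completions are designed for exactly this shape: wait_for_completion() cannot return before complete() has finished touching the structure, so the on-stack object is safe to let go out of scope afterwards.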
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fbf466a29aa..64aab081153b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -47,11 +47,17 @@ | |||
47 | 47 | ||
48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
50 | static atomic_t kprobe_count; | ||
50 | 51 | ||
51 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 52 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 53 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 54 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
54 | 55 | ||
56 | static struct notifier_block kprobe_page_fault_nb = { | ||
57 | .notifier_call = kprobe_exceptions_notify, | ||
58 | .priority = 0x7fffffff /* we need to be notified first */ | ||
59 | }; | ||
60 | |||
55 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT | 61 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT |
56 | /* | 62 | /* |
57 | * kprobe->ainsn.insn points to the copy of the instruction to be | 63 | * kprobe->ainsn.insn points to the copy of the instruction to be |
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
368 | */ | 374 | */ |
369 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | 375 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) |
370 | { | 376 | { |
371 | struct kprobe *kp; | ||
372 | |||
373 | if (p->break_handler) { | 377 | if (p->break_handler) { |
374 | list_for_each_entry_rcu(kp, &old_p->list, list) { | 378 | if (old_p->break_handler) |
375 | if (kp->break_handler) | 379 | return -EEXIST; |
376 | return -EEXIST; | ||
377 | } | ||
378 | list_add_tail_rcu(&p->list, &old_p->list); | 380 | list_add_tail_rcu(&p->list, &old_p->list); |
381 | old_p->break_handler = aggr_break_handler; | ||
379 | } else | 382 | } else |
380 | list_add_rcu(&p->list, &old_p->list); | 383 | list_add_rcu(&p->list, &old_p->list); |
384 | if (p->post_handler && !old_p->post_handler) | ||
385 | old_p->post_handler = aggr_post_handler; | ||
381 | return 0; | 386 | return 0; |
382 | } | 387 | } |
383 | 388 | ||
@@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
390 | copy_kprobe(p, ap); | 395 | copy_kprobe(p, ap); |
391 | ap->addr = p->addr; | 396 | ap->addr = p->addr; |
392 | ap->pre_handler = aggr_pre_handler; | 397 | ap->pre_handler = aggr_pre_handler; |
393 | ap->post_handler = aggr_post_handler; | ||
394 | ap->fault_handler = aggr_fault_handler; | 398 | ap->fault_handler = aggr_fault_handler; |
395 | ap->break_handler = aggr_break_handler; | 399 | if (p->post_handler) |
400 | ap->post_handler = aggr_post_handler; | ||
401 | if (p->break_handler) | ||
402 | ap->break_handler = aggr_break_handler; | ||
396 | 403 | ||
397 | INIT_LIST_HEAD(&ap->list); | 404 | INIT_LIST_HEAD(&ap->list); |
398 | list_add_rcu(&p->list, &ap->list); | 405 | list_add_rcu(&p->list, &ap->list); |
@@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
464 | old_p = get_kprobe(p->addr); | 471 | old_p = get_kprobe(p->addr); |
465 | if (old_p) { | 472 | if (old_p) { |
466 | ret = register_aggr_kprobe(old_p, p); | 473 | ret = register_aggr_kprobe(old_p, p); |
474 | if (!ret) | ||
475 | atomic_inc(&kprobe_count); | ||
467 | goto out; | 476 | goto out; |
468 | } | 477 | } |
469 | 478 | ||
@@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
474 | hlist_add_head_rcu(&p->hlist, | 483 | hlist_add_head_rcu(&p->hlist, |
475 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 484 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
476 | 485 | ||
486 | if (atomic_add_return(1, &kprobe_count) == \ | ||
487 | (ARCH_INACTIVE_KPROBE_COUNT + 1)) | ||
488 | register_page_fault_notifier(&kprobe_page_fault_nb); | ||
489 | |||
477 | arch_arm_kprobe(p); | 490 | arch_arm_kprobe(p); |
478 | 491 | ||
479 | out: | 492 | out: |
@@ -536,14 +549,40 @@ valid_p: | |||
536 | kfree(old_p); | 549 | kfree(old_p); |
537 | } | 550 | } |
538 | arch_remove_kprobe(p); | 551 | arch_remove_kprobe(p); |
552 | } else { | ||
553 | mutex_lock(&kprobe_mutex); | ||
554 | if (p->break_handler) | ||
555 | old_p->break_handler = NULL; | ||
556 | if (p->post_handler){ | ||
557 | list_for_each_entry_rcu(list_p, &old_p->list, list){ | ||
558 | if (list_p->post_handler){ | ||
559 | cleanup_p = 2; | ||
560 | break; | ||
561 | } | ||
562 | } | ||
563 | if (cleanup_p == 0) | ||
564 | old_p->post_handler = NULL; | ||
565 | } | ||
566 | mutex_unlock(&kprobe_mutex); | ||
539 | } | 567 | } |
568 | |||
569 | /* Call unregister_page_fault_notifier() | ||
570 | * if no probes are active | ||
571 | */ | ||
572 | mutex_lock(&kprobe_mutex); | ||
573 | if (atomic_add_return(-1, &kprobe_count) == \ | ||
574 | ARCH_INACTIVE_KPROBE_COUNT) | ||
575 | unregister_page_fault_notifier(&kprobe_page_fault_nb); | ||
576 | mutex_unlock(&kprobe_mutex); | ||
577 | return; | ||
540 | } | 578 | } |
541 | 579 | ||
542 | static struct notifier_block kprobe_exceptions_nb = { | 580 | static struct notifier_block kprobe_exceptions_nb = { |
543 | .notifier_call = kprobe_exceptions_notify, | 581 | .notifier_call = kprobe_exceptions_notify, |
544 | .priority = 0x7fffffff /* we need to notified first */ | 582 | .priority = 0x7fffffff /* we need to be notified first */ |
545 | }; | 583 | }; |
546 | 584 | ||
585 | |||
547 | int __kprobes register_jprobe(struct jprobe *jp) | 586 | int __kprobes register_jprobe(struct jprobe *jp) |
548 | { | 587 | { |
549 | /* Todo: Verify probepoint is a function entry point */ | 588 | /* Todo: Verify probepoint is a function entry point */ |
@@ -652,6 +691,7 @@ static int __init init_kprobes(void) | |||
652 | INIT_HLIST_HEAD(&kprobe_table[i]); | 691 | INIT_HLIST_HEAD(&kprobe_table[i]); |
653 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | 692 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); |
654 | } | 693 | } |
694 | atomic_set(&kprobe_count, 0); | ||
655 | 695 | ||
656 | err = arch_init_kprobes(); | 696 | err = arch_init_kprobes(); |
657 | if (!err) | 697 | if (!err) |
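
The kprobe_count changes above register the shared page-fault notifier only while enough probes are armed (one past ARCH_INACTIVE_KPROBE_COUNT) and drop it again when the count falls back, so an idle kprobes subsystem stays off the page-fault path. A standalone sketch of that count-gated registration pattern, simplified to a plain first-user/last-user gate; all names are illustrative.

#include <stdio.h>

static int active;                 /* stands in for kprobe_count */

static void hook_register(void)   { printf("notifier registered\n"); }
static void hook_unregister(void) { printf("notifier unregistered\n"); }

static void probe_add(void)
{
    if (++active == 1)            /* first user arms the shared hook */
        hook_register();
}

static void probe_del(void)
{
    if (--active == 0)            /* last user tears it down */
        hook_unregister();
}

int main(void)
{
    probe_add();
    probe_add();
    probe_del();
    probe_del();                  /* registers once, unregisters once */
    return 0;
}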
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index f119e098e67b..e0ffe4ab0917 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -8,12 +8,12 @@ | |||
8 | * | 8 | * |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/config.h> | ||
12 | #include <linux/kobject.h> | 11 | #include <linux/kobject.h> |
13 | #include <linux/string.h> | 12 | #include <linux/string.h> |
14 | #include <linux/sysfs.h> | 13 | #include <linux/sysfs.h> |
15 | #include <linux/module.h> | 14 | #include <linux/module.h> |
16 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/kexec.h> | ||
17 | 17 | ||
18 | #define KERNEL_ATTR_RO(_name) \ | 18 | #define KERNEL_ATTR_RO(_name) \ |
19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) |
@@ -48,6 +48,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s | |||
48 | KERNEL_ATTR_RW(uevent_helper); | 48 | KERNEL_ATTR_RW(uevent_helper); |
49 | #endif | 49 | #endif |
50 | 50 | ||
51 | #ifdef CONFIG_KEXEC | ||
52 | static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) | ||
53 | { | ||
54 | return sprintf(page, "%d\n", !!kexec_image); | ||
55 | } | ||
56 | KERNEL_ATTR_RO(kexec_loaded); | ||
57 | |||
58 | static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) | ||
59 | { | ||
60 | return sprintf(page, "%d\n", !!kexec_crash_image); | ||
61 | } | ||
62 | KERNEL_ATTR_RO(kexec_crash_loaded); | ||
63 | #endif /* CONFIG_KEXEC */ | ||
64 | |||
51 | decl_subsys(kernel, NULL, NULL); | 65 | decl_subsys(kernel, NULL, NULL); |
52 | EXPORT_SYMBOL_GPL(kernel_subsys); | 66 | EXPORT_SYMBOL_GPL(kernel_subsys); |
53 | 67 | ||
@@ -56,6 +70,10 @@ static struct attribute * kernel_attrs[] = { | |||
56 | &uevent_seqnum_attr.attr, | 70 | &uevent_seqnum_attr.attr, |
57 | &uevent_helper_attr.attr, | 71 | &uevent_helper_attr.attr, |
58 | #endif | 72 | #endif |
73 | #ifdef CONFIG_KEXEC | ||
74 | &kexec_loaded_attr.attr, | ||
75 | &kexec_crash_loaded_attr.attr, | ||
76 | #endif | ||
59 | NULL | 77 | NULL |
60 | }; | 78 | }; |
61 | 79 | ||
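
With these attributes the kernel subsystem (normally mounted under /sys/kernel) reports whether a kexec image or a crash image is loaded, printed as "0" or "1". A minimal user-space reader, assuming the default sysfs mount point and the attribute name shown above:

#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/sys/kernel/kexec_loaded", "r");
    int loaded = 0;

    if (!f)
        return 1;                 /* kernel built without CONFIG_KEXEC */
    if (fscanf(f, "%d", &loaded) != 1)
        loaded = 0;
    fclose(f);
    printf("kexec image %s\n", loaded ? "loaded" : "not loaded");
    return 0;
}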
diff --git a/kernel/kthread.c b/kernel/kthread.c index c5f3c6613b6d..4f9c60ef95e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -45,6 +45,13 @@ struct kthread_stop_info | |||
45 | static DEFINE_MUTEX(kthread_stop_lock); | 45 | static DEFINE_MUTEX(kthread_stop_lock); |
46 | static struct kthread_stop_info kthread_stop_info; | 46 | static struct kthread_stop_info kthread_stop_info; |
47 | 47 | ||
48 | /** | ||
49 | * kthread_should_stop - should this kthread return now? | ||
50 | * | ||
51 | * When someone calls kthread_stop on your kthread, it will be woken | ||
52 | * and this will return true. You should then return, and your return | ||
53 | * value will be passed through to kthread_stop(). | ||
54 | */ | ||
48 | int kthread_should_stop(void) | 55 | int kthread_should_stop(void) |
49 | { | 56 | { |
50 | return (kthread_stop_info.k == current); | 57 | return (kthread_stop_info.k == current); |
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create) | |||
122 | complete(&create->done); | 129 | complete(&create->done); |
123 | } | 130 | } |
124 | 131 | ||
132 | /** | ||
133 | * kthread_create - create a kthread. | ||
134 | * @threadfn: the function to run until signal_pending(current). | ||
135 | * @data: data ptr for @threadfn. | ||
136 | * @namefmt: printf-style name for the thread. | ||
137 | * | ||
138 | * Description: This helper function creates and names a kernel | ||
139 | * thread. The thread will be stopped: use wake_up_process() to start | ||
140 | * it. See also kthread_run(), kthread_create_on_cpu(). | ||
141 | * | ||
142 | * When woken, the thread will run @threadfn() with @data as its | ||
143 | * argument. @threadfn can either call do_exit() directly if it is a | ||
144 | * standalone thread for which no one will call kthread_stop(), or | ||
145 | * return when 'kthread_should_stop()' is true (which means | ||
146 | * kthread_stop() has been called). The return value should be zero | ||
147 | * or a negative error number; it will be passed to kthread_stop(). | ||
148 | * | ||
149 | * Returns a task_struct or ERR_PTR(-ENOMEM). | ||
150 | */ | ||
125 | struct task_struct *kthread_create(int (*threadfn)(void *data), | 151 | struct task_struct *kthread_create(int (*threadfn)(void *data), |
126 | void *data, | 152 | void *data, |
127 | const char namefmt[], | 153 | const char namefmt[], |
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
156 | } | 182 | } |
157 | EXPORT_SYMBOL(kthread_create); | 183 | EXPORT_SYMBOL(kthread_create); |
158 | 184 | ||
185 | /** | ||
186 | * kthread_bind - bind a just-created kthread to a cpu. | ||
187 | * @k: thread created by kthread_create(). | ||
188 | * @cpu: cpu (might not be online, must be possible) for @k to run on. | ||
189 | * | ||
190 | * Description: This function is equivalent to set_cpus_allowed(), | ||
191 | * except that @cpu doesn't need to be online, and the thread must be | ||
192 | * stopped (i.e., just returned from kthread_create()). | ||
193 | */ | ||
159 | void kthread_bind(struct task_struct *k, unsigned int cpu) | 194 | void kthread_bind(struct task_struct *k, unsigned int cpu) |
160 | { | 195 | { |
161 | BUG_ON(k->state != TASK_INTERRUPTIBLE); | 196 | BUG_ON(k->state != TASK_INTERRUPTIBLE); |
@@ -166,14 +201,21 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) | |||
166 | } | 201 | } |
167 | EXPORT_SYMBOL(kthread_bind); | 202 | EXPORT_SYMBOL(kthread_bind); |
168 | 203 | ||
204 | /** | ||
205 | * kthread_stop - stop a thread created by kthread_create(). | ||
206 | * @k: thread created by kthread_create(). | ||
207 | * | ||
208 | * Sets kthread_should_stop() for @k to return true, wakes it, and | ||
209 | * waits for it to exit. Your threadfn() must not call do_exit() | ||
210 | * itself if you use this function! This can also be called after | ||
211 | * kthread_create() instead of calling wake_up_process(): the thread | ||
212 | * will exit without calling threadfn(). | ||
213 | * | ||
214 | * Returns the result of threadfn(), or %-EINTR if wake_up_process() | ||
215 | * was never called. | ||
216 | */ | ||
169 | int kthread_stop(struct task_struct *k) | 217 | int kthread_stop(struct task_struct *k) |
170 | { | 218 | { |
171 | return kthread_stop_sem(k, NULL); | ||
172 | } | ||
173 | EXPORT_SYMBOL(kthread_stop); | ||
174 | |||
175 | int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | ||
176 | { | ||
177 | int ret; | 219 | int ret; |
178 | 220 | ||
179 | mutex_lock(&kthread_stop_lock); | 221 | mutex_lock(&kthread_stop_lock); |
@@ -187,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | |||
187 | 229 | ||
188 | /* Now set kthread_should_stop() to true, and wake it up. */ | 230 | /* Now set kthread_should_stop() to true, and wake it up. */ |
189 | kthread_stop_info.k = k; | 231 | kthread_stop_info.k = k; |
190 | if (s) | 232 | wake_up_process(k); |
191 | up(s); | ||
192 | else | ||
193 | wake_up_process(k); | ||
194 | put_task_struct(k); | 233 | put_task_struct(k); |
195 | 234 | ||
196 | /* Once it dies, reset stop ptr, gather result and we're done. */ | 235 | /* Once it dies, reset stop ptr, gather result and we're done. */ |
@@ -201,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | |||
201 | 240 | ||
202 | return ret; | 241 | return ret; |
203 | } | 242 | } |
204 | EXPORT_SYMBOL(kthread_stop_sem); | 243 | EXPORT_SYMBOL(kthread_stop); |
205 | 244 | ||
206 | static __init int helper_init(void) | 245 | static __init int helper_init(void) |
207 | { | 246 | { |
@@ -210,5 +249,5 @@ static __init int helper_init(void) | |||
210 | 249 | ||
211 | return 0; | 250 | return 0; |
212 | } | 251 | } |
213 | core_initcall(helper_init); | ||
214 | 252 | ||
253 | core_initcall(helper_init); | ||
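
The kernel-doc added above spells out the contract: the thread is created stopped, the caller wakes it (or uses kthread_run()), and the thread polls kthread_should_stop() so that kthread_stop() can wake it, wait for it, and collect its return value. A hedged kernel-style sketch of that canonical loop, with module boilerplate omitted:

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *poller;

static int poll_thread(void *data)
{
    while (!kthread_should_stop()) {     /* becomes true after kthread_stop() */
        /* ... do one unit of work ... */
        msleep(100);
    }
    return 0;                            /* handed back to kthread_stop() */
}

static int start_poller(void)
{
    poller = kthread_run(poll_thread, NULL, "poller");
    return IS_ERR(poller) ? PTR_ERR(poller) : 0;
}

static void stop_poller(void)
{
    kthread_stop(poller);                /* wakes the thread, waits for exit */
}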
diff --git a/kernel/lockdep.c b/kernel/lockdep.c new file mode 100644 index 000000000000..9bad17884513 --- /dev/null +++ b/kernel/lockdep.c | |||
@@ -0,0 +1,2704 @@ | |||
1 | /* | ||
2 | * kernel/lockdep.c | ||
3 | * | ||
4 | * Runtime locking correctness validator | ||
5 | * | ||
6 | * Started by Ingo Molnar: | ||
7 | * | ||
8 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
9 | * | ||
10 | * this code maps all the lock dependencies as they occur in a live kernel | ||
11 | * and will warn about the following classes of locking bugs: | ||
12 | * | ||
13 | * - lock inversion scenarios | ||
14 | * - circular lock dependencies | ||
15 | * - hardirq/softirq safe/unsafe locking bugs | ||
16 | * | ||
17 | * Bugs are reported even if the current locking scenario does not cause | ||
18 | * any deadlock at this point. | ||
19 | * | ||
20 | * I.e. if anytime in the past two locks were taken in a different order, | ||
21 | * even if it happened for another task, even if those were different | ||
22 | * locks (but of the same class as this lock), this code will detect it. | ||
23 | * | ||
24 | * Thanks to Arjan van de Ven for coming up with the initial idea of | ||
25 | * mapping lock dependencies runtime. | ||
26 | */ | ||
27 | #include <linux/mutex.h> | ||
28 | #include <linux/sched.h> | ||
29 | #include <linux/delay.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/proc_fs.h> | ||
32 | #include <linux/seq_file.h> | ||
33 | #include <linux/spinlock.h> | ||
34 | #include <linux/kallsyms.h> | ||
35 | #include <linux/interrupt.h> | ||
36 | #include <linux/stacktrace.h> | ||
37 | #include <linux/debug_locks.h> | ||
38 | #include <linux/irqflags.h> | ||
39 | |||
40 | #include <asm/sections.h> | ||
41 | |||
42 | #include "lockdep_internals.h" | ||
43 | |||
44 | /* | ||
45 | * hash_lock: protects the lockdep hashes and class/list/hash allocators. | ||
46 | * | ||
47 | * This is one of the rare exceptions where it's justified | ||
48 | * to use a raw spinlock - we really don't want the spinlock | ||
49 | * code to recurse back into the lockdep code. | ||
50 | */ | ||
51 | static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; | ||
52 | |||
53 | static int lockdep_initialized; | ||
54 | |||
55 | unsigned long nr_list_entries; | ||
56 | static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; | ||
57 | |||
58 | /* | ||
59 | * Allocate a lockdep entry. (assumes hash_lock held, returns | ||
60 | * with NULL on failure) | ||
61 | */ | ||
62 | static struct lock_list *alloc_list_entry(void) | ||
63 | { | ||
64 | if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { | ||
65 | __raw_spin_unlock(&hash_lock); | ||
66 | debug_locks_off(); | ||
67 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); | ||
68 | printk("turning off the locking correctness validator.\n"); | ||
69 | return NULL; | ||
70 | } | ||
71 | return list_entries + nr_list_entries++; | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * All data structures here are protected by the global debug_lock. | ||
76 | * | ||
77 | * Mutex key structs only get allocated, once during bootup, and never | ||
78 | * get freed - this significantly simplifies the debugging code. | ||
79 | */ | ||
80 | unsigned long nr_lock_classes; | ||
81 | static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; | ||
82 | |||
83 | /* | ||
84 | * We keep a global list of all lock classes. The list only grows, | ||
85 | * never shrinks. The list is only accessed with the lockdep | ||
86 | * spinlock lock held. | ||
87 | */ | ||
88 | LIST_HEAD(all_lock_classes); | ||
89 | |||
90 | /* | ||
91 | * The lockdep classes are in a hash-table as well, for fast lookup: | ||
92 | */ | ||
93 | #define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) | ||
94 | #define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) | ||
95 | #define CLASSHASH_MASK (CLASSHASH_SIZE - 1) | ||
96 | #define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK) | ||
97 | #define classhashentry(key) (classhash_table + __classhashfn((key))) | ||
98 | |||
99 | static struct list_head classhash_table[CLASSHASH_SIZE]; | ||
100 | |||
101 | unsigned long nr_lock_chains; | ||
102 | static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; | ||
103 | |||
104 | /* | ||
105 | * We put the lock dependency chains into a hash-table as well, to cache | ||
106 | * their existence: | ||
107 | */ | ||
108 | #define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) | ||
109 | #define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) | ||
110 | #define CHAINHASH_MASK (CHAINHASH_SIZE - 1) | ||
111 | #define __chainhashfn(chain) \ | ||
112 | (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK) | ||
113 | #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) | ||
114 | |||
115 | static struct list_head chainhash_table[CHAINHASH_SIZE]; | ||
116 | |||
117 | /* | ||
118 | * The hash key of the lock dependency chains is a hash itself too: | ||
119 | * it's a hash of all locks taken up to that lock, including that lock. | ||
120 | * It's a 64-bit hash, because it's important for the keys to be | ||
121 | * unique. | ||
122 | */ | ||
123 | #define iterate_chain_key(key1, key2) \ | ||
124 | (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \ | ||
125 | ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \ | ||
126 | (key2)) | ||
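
The comment block above says the chain key must identify the whole sequence of held locks, so the same locks taken in a different order must hash differently. A standalone C sketch of the iterate_chain_key() mixing step; MAX_LOCKDEP_KEYS_BITS is taken as 11 here, which is an assumption about the configuration rather than something stated in this file.

#include <stdio.h>
#include <stdint.h>

#define KEYS_BITS 11                     /* assumed MAX_LOCKDEP_KEYS_BITS */

static uint64_t iterate_chain_key(uint64_t key1, uint64_t key2)
{
    return ((key1 << (KEYS_BITS / 2)) ^
            (key1 >> (64 - KEYS_BITS / 2)) ^
            key2);                       /* rotate-and-xor, order sensitive */
}

int main(void)
{
    uint64_t a = 0x1111, b = 0x2222;
    uint64_t ab = iterate_chain_key(iterate_chain_key(0, a), b);
    uint64_t ba = iterate_chain_key(iterate_chain_key(0, b), a);

    /* different acquisition orders hash to different chains */
    printf("a,b -> %#llx\nb,a -> %#llx\n",
           (unsigned long long)ab, (unsigned long long)ba);
    return 0;
}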
127 | |||
128 | void lockdep_off(void) | ||
129 | { | ||
130 | current->lockdep_recursion++; | ||
131 | } | ||
132 | |||
133 | EXPORT_SYMBOL(lockdep_off); | ||
134 | |||
135 | void lockdep_on(void) | ||
136 | { | ||
137 | current->lockdep_recursion--; | ||
138 | } | ||
139 | |||
140 | EXPORT_SYMBOL(lockdep_on); | ||
141 | |||
142 | int lockdep_internal(void) | ||
143 | { | ||
144 | return current->lockdep_recursion != 0; | ||
145 | } | ||
146 | |||
147 | EXPORT_SYMBOL(lockdep_internal); | ||
148 | |||
149 | /* | ||
150 | * Debugging switches: | ||
151 | */ | ||
152 | |||
153 | #define VERBOSE 0 | ||
154 | #ifdef VERBOSE | ||
155 | # define VERY_VERBOSE 0 | ||
156 | #endif | ||
157 | |||
158 | #if VERBOSE | ||
159 | # define HARDIRQ_VERBOSE 1 | ||
160 | # define SOFTIRQ_VERBOSE 1 | ||
161 | #else | ||
162 | # define HARDIRQ_VERBOSE 0 | ||
163 | # define SOFTIRQ_VERBOSE 0 | ||
164 | #endif | ||
165 | |||
166 | #if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE | ||
167 | /* | ||
168 | * Quick filtering for interesting events: | ||
169 | */ | ||
170 | static int class_filter(struct lock_class *class) | ||
171 | { | ||
172 | #if 0 | ||
173 | /* Example */ | ||
174 | if (class->name_version == 1 && | ||
175 | !strcmp(class->name, "lockname")) | ||
176 | return 1; | ||
177 | if (class->name_version == 1 && | ||
178 | !strcmp(class->name, "&struct->lockfield")) | ||
179 | return 1; | ||
180 | #endif | ||
181 | /* Allow everything else. 0 would be filter everything else */ | ||
182 | return 1; | ||
183 | } | ||
184 | #endif | ||
185 | |||
186 | static int verbose(struct lock_class *class) | ||
187 | { | ||
188 | #if VERBOSE | ||
189 | return class_filter(class); | ||
190 | #endif | ||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
195 | |||
196 | static int hardirq_verbose(struct lock_class *class) | ||
197 | { | ||
198 | #if HARDIRQ_VERBOSE | ||
199 | return class_filter(class); | ||
200 | #endif | ||
201 | return 0; | ||
202 | } | ||
203 | |||
204 | static int softirq_verbose(struct lock_class *class) | ||
205 | { | ||
206 | #if SOFTIRQ_VERBOSE | ||
207 | return class_filter(class); | ||
208 | #endif | ||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | #endif | ||
213 | |||
214 | /* | ||
215 | * Stack-trace: tightly packed array of stack backtrace | ||
216 | * addresses. Protected by the hash_lock. | ||
217 | */ | ||
218 | unsigned long nr_stack_trace_entries; | ||
219 | static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; | ||
220 | |||
221 | static int save_trace(struct stack_trace *trace) | ||
222 | { | ||
223 | trace->nr_entries = 0; | ||
224 | trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; | ||
225 | trace->entries = stack_trace + nr_stack_trace_entries; | ||
226 | |||
227 | save_stack_trace(trace, NULL, 0, 3); | ||
228 | |||
229 | trace->max_entries = trace->nr_entries; | ||
230 | |||
231 | nr_stack_trace_entries += trace->nr_entries; | ||
232 | if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) | ||
233 | return 0; | ||
234 | |||
235 | if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { | ||
236 | __raw_spin_unlock(&hash_lock); | ||
237 | if (debug_locks_off()) { | ||
238 | printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); | ||
239 | printk("turning off the locking correctness validator.\n"); | ||
240 | dump_stack(); | ||
241 | } | ||
242 | return 0; | ||
243 | } | ||
244 | |||
245 | return 1; | ||
246 | } | ||
247 | |||
248 | unsigned int nr_hardirq_chains; | ||
249 | unsigned int nr_softirq_chains; | ||
250 | unsigned int nr_process_chains; | ||
251 | unsigned int max_lockdep_depth; | ||
252 | unsigned int max_recursion_depth; | ||
253 | |||
254 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
255 | /* | ||
256 | * We cannot printk in early bootup code. Not even early_printk() | ||
257 | * might work. So we mark any initialization errors and printk | ||
258 | * about it later on, in lockdep_info(). | ||
259 | */ | ||
260 | static int lockdep_init_error; | ||
261 | |||
262 | /* | ||
263 | * Various lockdep statistics: | ||
264 | */ | ||
265 | atomic_t chain_lookup_hits; | ||
266 | atomic_t chain_lookup_misses; | ||
267 | atomic_t hardirqs_on_events; | ||
268 | atomic_t hardirqs_off_events; | ||
269 | atomic_t redundant_hardirqs_on; | ||
270 | atomic_t redundant_hardirqs_off; | ||
271 | atomic_t softirqs_on_events; | ||
272 | atomic_t softirqs_off_events; | ||
273 | atomic_t redundant_softirqs_on; | ||
274 | atomic_t redundant_softirqs_off; | ||
275 | atomic_t nr_unused_locks; | ||
276 | atomic_t nr_cyclic_checks; | ||
277 | atomic_t nr_cyclic_check_recursions; | ||
278 | atomic_t nr_find_usage_forwards_checks; | ||
279 | atomic_t nr_find_usage_forwards_recursions; | ||
280 | atomic_t nr_find_usage_backwards_checks; | ||
281 | atomic_t nr_find_usage_backwards_recursions; | ||
282 | # define debug_atomic_inc(ptr) atomic_inc(ptr) | ||
283 | # define debug_atomic_dec(ptr) atomic_dec(ptr) | ||
284 | # define debug_atomic_read(ptr) atomic_read(ptr) | ||
285 | #else | ||
286 | # define debug_atomic_inc(ptr) do { } while (0) | ||
287 | # define debug_atomic_dec(ptr) do { } while (0) | ||
288 | # define debug_atomic_read(ptr) 0 | ||
289 | #endif | ||
290 | |||
291 | /* | ||
292 | * Locking printouts: | ||
293 | */ | ||
294 | |||
295 | static const char *usage_str[] = | ||
296 | { | ||
297 | [LOCK_USED] = "initial-use ", | ||
298 | [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W", | ||
299 | [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W", | ||
300 | [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W", | ||
301 | [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W", | ||
302 | [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R", | ||
303 | [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R", | ||
304 | [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R", | ||
305 | [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R", | ||
306 | }; | ||
307 | |||
308 | const char * __get_key_name(struct lockdep_subclass_key *key, char *str) | ||
309 | { | ||
310 | unsigned long offs, size; | ||
311 | char *modname; | ||
312 | |||
313 | return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str); | ||
314 | } | ||
315 | |||
316 | void | ||
317 | get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4) | ||
318 | { | ||
319 | *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.'; | ||
320 | |||
321 | if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) | ||
322 | *c1 = '+'; | ||
323 | else | ||
324 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) | ||
325 | *c1 = '-'; | ||
326 | |||
327 | if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) | ||
328 | *c2 = '+'; | ||
329 | else | ||
330 | if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) | ||
331 | *c2 = '-'; | ||
332 | |||
333 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) | ||
334 | *c3 = '-'; | ||
335 | if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) { | ||
336 | *c3 = '+'; | ||
337 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) | ||
338 | *c3 = '?'; | ||
339 | } | ||
340 | |||
341 | if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) | ||
342 | *c4 = '-'; | ||
343 | if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) { | ||
344 | *c4 = '+'; | ||
345 | if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) | ||
346 | *c4 = '?'; | ||
347 | } | ||
348 | } | ||
349 | |||
350 | static void print_lock_name(struct lock_class *class) | ||
351 | { | ||
352 | char str[128], c1, c2, c3, c4; | ||
353 | const char *name; | ||
354 | |||
355 | get_usage_chars(class, &c1, &c2, &c3, &c4); | ||
356 | |||
357 | name = class->name; | ||
358 | if (!name) { | ||
359 | name = __get_key_name(class->key, str); | ||
360 | printk(" (%s", name); | ||
361 | } else { | ||
362 | printk(" (%s", name); | ||
363 | if (class->name_version > 1) | ||
364 | printk("#%d", class->name_version); | ||
365 | if (class->subclass) | ||
366 | printk("/%d", class->subclass); | ||
367 | } | ||
368 | printk("){%c%c%c%c}", c1, c2, c3, c4); | ||
369 | } | ||
370 | |||
371 | static void print_lockdep_cache(struct lockdep_map *lock) | ||
372 | { | ||
373 | const char *name; | ||
374 | char str[128]; | ||
375 | |||
376 | name = lock->name; | ||
377 | if (!name) | ||
378 | name = __get_key_name(lock->key->subkeys, str); | ||
379 | |||
380 | printk("%s", name); | ||
381 | } | ||
382 | |||
383 | static void print_lock(struct held_lock *hlock) | ||
384 | { | ||
385 | print_lock_name(hlock->class); | ||
386 | printk(", at: "); | ||
387 | print_ip_sym(hlock->acquire_ip); | ||
388 | } | ||
389 | |||
390 | static void lockdep_print_held_locks(struct task_struct *curr) | ||
391 | { | ||
392 | int i, depth = curr->lockdep_depth; | ||
393 | |||
394 | if (!depth) { | ||
395 | printk("no locks held by %s/%d.\n", curr->comm, curr->pid); | ||
396 | return; | ||
397 | } | ||
398 | printk("%d lock%s held by %s/%d:\n", | ||
399 | depth, depth > 1 ? "s" : "", curr->comm, curr->pid); | ||
400 | |||
401 | for (i = 0; i < depth; i++) { | ||
402 | printk(" #%d: ", i); | ||
403 | print_lock(curr->held_locks + i); | ||
404 | } | ||
405 | } | ||
406 | |||
407 | static void print_lock_class_header(struct lock_class *class, int depth) | ||
408 | { | ||
409 | int bit; | ||
410 | |||
411 | printk("%*s->", depth, ""); | ||
412 | print_lock_name(class); | ||
413 | printk(" ops: %lu", class->ops); | ||
414 | printk(" {\n"); | ||
415 | |||
416 | for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { | ||
417 | if (class->usage_mask & (1 << bit)) { | ||
418 | int len = depth; | ||
419 | |||
420 | len += printk("%*s %s", depth, "", usage_str[bit]); | ||
421 | len += printk(" at:\n"); | ||
422 | print_stack_trace(class->usage_traces + bit, len); | ||
423 | } | ||
424 | } | ||
425 | printk("%*s }\n", depth, ""); | ||
426 | |||
427 | printk("%*s ... key at: ",depth,""); | ||
428 | print_ip_sym((unsigned long)class->key); | ||
429 | } | ||
430 | |||
431 | /* | ||
432 | * printk all lock dependencies starting at <entry>: | ||
433 | */ | ||
434 | static void print_lock_dependencies(struct lock_class *class, int depth) | ||
435 | { | ||
436 | struct lock_list *entry; | ||
437 | |||
438 | if (DEBUG_LOCKS_WARN_ON(depth >= 20)) | ||
439 | return; | ||
440 | |||
441 | print_lock_class_header(class, depth); | ||
442 | |||
443 | list_for_each_entry(entry, &class->locks_after, entry) { | ||
444 | DEBUG_LOCKS_WARN_ON(!entry->class); | ||
445 | print_lock_dependencies(entry->class, depth + 1); | ||
446 | |||
447 | printk("%*s ... acquired at:\n",depth,""); | ||
448 | print_stack_trace(&entry->trace, 2); | ||
449 | printk("\n"); | ||
450 | } | ||
451 | } | ||
452 | |||
453 | /* | ||
454 | * Add a new dependency to the head of the list: | ||
455 | */ | ||
456 | static int add_lock_to_list(struct lock_class *class, struct lock_class *this, | ||
457 | struct list_head *head, unsigned long ip) | ||
458 | { | ||
459 | struct lock_list *entry; | ||
460 | /* | ||
461 | * Lock not present yet - get a new dependency struct and | ||
462 | * add it to the list: | ||
463 | */ | ||
464 | entry = alloc_list_entry(); | ||
465 | if (!entry) | ||
466 | return 0; | ||
467 | |||
468 | entry->class = this; | ||
469 | save_trace(&entry->trace); | ||
470 | |||
471 | /* | ||
472 | * Since we never remove from the dependency list, the list can | ||
473 | * be walked lockless by other CPUs, it's only allocation | ||
474 | * that must be protected by the spinlock. But this also means | ||
475 | * we must make new entries visible only once writes to the | ||
476 | * entry become visible - hence the RCU op: | ||
477 | */ | ||
478 | list_add_tail_rcu(&entry->entry, head); | ||
479 | |||
480 | return 1; | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * Recursive, forwards-direction lock-dependency checking, used for | ||
485 | * both noncyclic checking and for hardirq-unsafe/softirq-unsafe | ||
486 | * checking. | ||
487 | * | ||
488 | * (to keep the stackframe of the recursive functions small we | ||
489 | * use these global variables, and we also mark various helper | ||
490 | * functions as noinline.) | ||
491 | */ | ||
492 | static struct held_lock *check_source, *check_target; | ||
493 | |||
494 | /* | ||
495 | * Print a dependency chain entry (this is only done when a deadlock | ||
496 | * has been detected): | ||
497 | */ | ||
498 | static noinline int | ||
499 | print_circular_bug_entry(struct lock_list *target, unsigned int depth) | ||
500 | { | ||
501 | if (debug_locks_silent) | ||
502 | return 0; | ||
503 | printk("\n-> #%u", depth); | ||
504 | print_lock_name(target->class); | ||
505 | printk(":\n"); | ||
506 | print_stack_trace(&target->trace, 6); | ||
507 | |||
508 | return 0; | ||
509 | } | ||
510 | |||
511 | /* | ||
512 | * When a circular dependency is detected, print the | ||
513 | * header first: | ||
514 | */ | ||
515 | static noinline int | ||
516 | print_circular_bug_header(struct lock_list *entry, unsigned int depth) | ||
517 | { | ||
518 | struct task_struct *curr = current; | ||
519 | |||
520 | __raw_spin_unlock(&hash_lock); | ||
521 | debug_locks_off(); | ||
522 | if (debug_locks_silent) | ||
523 | return 0; | ||
524 | |||
525 | printk("\n=======================================================\n"); | ||
526 | printk( "[ INFO: possible circular locking dependency detected ]\n"); | ||
527 | printk( "-------------------------------------------------------\n"); | ||
528 | printk("%s/%d is trying to acquire lock:\n", | ||
529 | curr->comm, curr->pid); | ||
530 | print_lock(check_source); | ||
531 | printk("\nbut task is already holding lock:\n"); | ||
532 | print_lock(check_target); | ||
533 | printk("\nwhich lock already depends on the new lock.\n\n"); | ||
534 | printk("\nthe existing dependency chain (in reverse order) is:\n"); | ||
535 | |||
536 | print_circular_bug_entry(entry, depth); | ||
537 | |||
538 | return 0; | ||
539 | } | ||
540 | |||
541 | static noinline int print_circular_bug_tail(void) | ||
542 | { | ||
543 | struct task_struct *curr = current; | ||
544 | struct lock_list this; | ||
545 | |||
546 | if (debug_locks_silent) | ||
547 | return 0; | ||
548 | |||
549 | this.class = check_source->class; | ||
550 | save_trace(&this.trace); | ||
551 | print_circular_bug_entry(&this, 0); | ||
552 | |||
553 | printk("\nother info that might help us debug this:\n\n"); | ||
554 | lockdep_print_held_locks(curr); | ||
555 | |||
556 | printk("\nstack backtrace:\n"); | ||
557 | dump_stack(); | ||
558 | |||
559 | return 0; | ||
560 | } | ||
561 | |||
562 | static int noinline print_infinite_recursion_bug(void) | ||
563 | { | ||
564 | __raw_spin_unlock(&hash_lock); | ||
565 | DEBUG_LOCKS_WARN_ON(1); | ||
566 | |||
567 | return 0; | ||
568 | } | ||
569 | |||
570 | /* | ||
571 | * Prove that the dependency graph starting at <entry> can not | ||
572 | * lead to <target>. Print an error and return 0 if it does. | ||
573 | */ | ||
574 | static noinline int | ||
575 | check_noncircular(struct lock_class *source, unsigned int depth) | ||
576 | { | ||
577 | struct lock_list *entry; | ||
578 | |||
579 | debug_atomic_inc(&nr_cyclic_check_recursions); | ||
580 | if (depth > max_recursion_depth) | ||
581 | max_recursion_depth = depth; | ||
582 | if (depth >= 20) | ||
583 | return print_infinite_recursion_bug(); | ||
584 | /* | ||
585 | * Check this lock's dependency list: | ||
586 | */ | ||
587 | list_for_each_entry(entry, &source->locks_after, entry) { | ||
588 | if (entry->class == check_target->class) | ||
589 | return print_circular_bug_header(entry, depth+1); | ||
590 | debug_atomic_inc(&nr_cyclic_checks); | ||
591 | if (!check_noncircular(entry->class, depth+1)) | ||
592 | return print_circular_bug_entry(entry, depth+1); | ||
593 | } | ||
594 | return 1; | ||
595 | } | ||
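
check_noncircular() above is a depth-bounded depth-first search over the "taken after" edges: starting from the lock being acquired it tries to reach a lock the task already holds, and reaching it means the proposed dependency would close a cycle. A standalone sketch of the same reachability test on a tiny hand-built graph; the graph and names are illustrative.

#include <stdio.h>

#define NCLASS 4

static const int after[NCLASS][NCLASS] = {   /* after[i][j]: i -> j recorded */
    [0] = { [1] = 1 },                       /* A was taken before B */
    [1] = { [2] = 1 },                       /* B was taken before C */
};

static int reaches(int src, int target, int depth)
{
    if (depth >= 20)                         /* same recursion cap as above */
        return 0;
    if (src == target)
        return 1;
    for (int i = 0; i < NCLASS; i++)
        if (after[src][i] && reaches(i, target, depth + 1))
            return 1;
    return 0;
}

int main(void)
{
    /* proposing C -> A: that edge would close the cycle A -> B -> C -> A */
    printf("new C->A edge %s a cycle\n",
           reaches(/*src A*/0, /*target C*/2, 0) ? "creates" : "does not create");
    return 0;
}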
596 | |||
597 | static int very_verbose(struct lock_class *class) | ||
598 | { | ||
599 | #if VERY_VERBOSE | ||
600 | return class_filter(class); | ||
601 | #endif | ||
602 | return 0; | ||
603 | } | ||
604 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
605 | |||
606 | /* | ||
607 | * Forwards and backwards subgraph searching, for the purposes of | ||
608 | * proving that two subgraphs can be connected by a new dependency | ||
609 | * without creating any illegal irq-safe -> irq-unsafe lock dependency. | ||
610 | */ | ||
611 | static enum lock_usage_bit find_usage_bit; | ||
612 | static struct lock_class *forwards_match, *backwards_match; | ||
613 | |||
614 | /* | ||
615 | * Find a node in the forwards-direction dependency sub-graph starting | ||
616 | * at <source> that matches <find_usage_bit>. | ||
617 | * | ||
618 | * Return 2 if such a node exists in the subgraph, and put that node | ||
619 | * into <forwards_match>. | ||
620 | * | ||
621 | * Return 1 otherwise and keep <forwards_match> unchanged. | ||
622 | * Return 0 on error. | ||
623 | */ | ||
624 | static noinline int | ||
625 | find_usage_forwards(struct lock_class *source, unsigned int depth) | ||
626 | { | ||
627 | struct lock_list *entry; | ||
628 | int ret; | ||
629 | |||
630 | if (depth > max_recursion_depth) | ||
631 | max_recursion_depth = depth; | ||
632 | if (depth >= 20) | ||
633 | return print_infinite_recursion_bug(); | ||
634 | |||
635 | debug_atomic_inc(&nr_find_usage_forwards_checks); | ||
636 | if (source->usage_mask & (1 << find_usage_bit)) { | ||
637 | forwards_match = source; | ||
638 | return 2; | ||
639 | } | ||
640 | |||
641 | /* | ||
642 | * Check this lock's dependency list: | ||
643 | */ | ||
644 | list_for_each_entry(entry, &source->locks_after, entry) { | ||
645 | debug_atomic_inc(&nr_find_usage_forwards_recursions); | ||
646 | ret = find_usage_forwards(entry->class, depth+1); | ||
647 | if (ret == 2 || ret == 0) | ||
648 | return ret; | ||
649 | } | ||
650 | return 1; | ||
651 | } | ||
652 | |||
653 | /* | ||
654 | * Find a node in the backwards-direction dependency sub-graph starting | ||
655 | * at <source> that matches <find_usage_bit>. | ||
656 | * | ||
657 | * Return 2 if such a node exists in the subgraph, and put that node | ||
658 | * into <backwards_match>. | ||
659 | * | ||
660 | * Return 1 otherwise and keep <backwards_match> unchanged. | ||
661 | * Return 0 on error. | ||
662 | */ | ||
663 | static noinline int | ||
664 | find_usage_backwards(struct lock_class *source, unsigned int depth) | ||
665 | { | ||
666 | struct lock_list *entry; | ||
667 | int ret; | ||
668 | |||
669 | if (depth > max_recursion_depth) | ||
670 | max_recursion_depth = depth; | ||
671 | if (depth >= 20) | ||
672 | return print_infinite_recursion_bug(); | ||
673 | |||
674 | debug_atomic_inc(&nr_find_usage_backwards_checks); | ||
675 | if (source->usage_mask & (1 << find_usage_bit)) { | ||
676 | backwards_match = source; | ||
677 | return 2; | ||
678 | } | ||
679 | |||
680 | /* | ||
681 | * Check this lock's dependency list: | ||
682 | */ | ||
683 | list_for_each_entry(entry, &source->locks_before, entry) { | ||
684 | debug_atomic_inc(&nr_find_usage_backwards_recursions); | ||
685 | ret = find_usage_backwards(entry->class, depth+1); | ||
686 | if (ret == 2 || ret == 0) | ||
687 | return ret; | ||
688 | } | ||
689 | return 1; | ||
690 | } | ||
691 | |||
692 | static int | ||
693 | print_bad_irq_dependency(struct task_struct *curr, | ||
694 | struct held_lock *prev, | ||
695 | struct held_lock *next, | ||
696 | enum lock_usage_bit bit1, | ||
697 | enum lock_usage_bit bit2, | ||
698 | const char *irqclass) | ||
699 | { | ||
700 | __raw_spin_unlock(&hash_lock); | ||
701 | debug_locks_off(); | ||
702 | if (debug_locks_silent) | ||
703 | return 0; | ||
704 | |||
705 | printk("\n======================================================\n"); | ||
706 | printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | ||
707 | irqclass, irqclass); | ||
708 | printk( "------------------------------------------------------\n"); | ||
709 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", | ||
710 | curr->comm, curr->pid, | ||
711 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, | ||
712 | curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, | ||
713 | curr->hardirqs_enabled, | ||
714 | curr->softirqs_enabled); | ||
715 | print_lock(next); | ||
716 | |||
717 | printk("\nand this task is already holding:\n"); | ||
718 | print_lock(prev); | ||
719 | printk("which would create a new lock dependency:\n"); | ||
720 | print_lock_name(prev->class); | ||
721 | printk(" ->"); | ||
722 | print_lock_name(next->class); | ||
723 | printk("\n"); | ||
724 | |||
725 | printk("\nbut this new dependency connects a %s-irq-safe lock:\n", | ||
726 | irqclass); | ||
727 | print_lock_name(backwards_match); | ||
728 | printk("\n... which became %s-irq-safe at:\n", irqclass); | ||
729 | |||
730 | print_stack_trace(backwards_match->usage_traces + bit1, 1); | ||
731 | |||
732 | printk("\nto a %s-irq-unsafe lock:\n", irqclass); | ||
733 | print_lock_name(forwards_match); | ||
734 | printk("\n... which became %s-irq-unsafe at:\n", irqclass); | ||
735 | printk("..."); | ||
736 | |||
737 | print_stack_trace(forwards_match->usage_traces + bit2, 1); | ||
738 | |||
739 | printk("\nother info that might help us debug this:\n\n"); | ||
740 | lockdep_print_held_locks(curr); | ||
741 | |||
742 | printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); | ||
743 | print_lock_dependencies(backwards_match, 0); | ||
744 | |||
745 | printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); | ||
746 | print_lock_dependencies(forwards_match, 0); | ||
747 | |||
748 | printk("\nstack backtrace:\n"); | ||
749 | dump_stack(); | ||
750 | |||
751 | return 0; | ||
752 | } | ||
753 | |||
754 | static int | ||
755 | check_usage(struct task_struct *curr, struct held_lock *prev, | ||
756 | struct held_lock *next, enum lock_usage_bit bit_backwards, | ||
757 | enum lock_usage_bit bit_forwards, const char *irqclass) | ||
758 | { | ||
759 | int ret; | ||
760 | |||
761 | find_usage_bit = bit_backwards; | ||
762 | /* fills in <backwards_match> */ | ||
763 | ret = find_usage_backwards(prev->class, 0); | ||
764 | if (!ret || ret == 1) | ||
765 | return ret; | ||
766 | |||
767 | find_usage_bit = bit_forwards; | ||
768 | ret = find_usage_forwards(next->class, 0); | ||
769 | if (!ret || ret == 1) | ||
770 | return ret; | ||
771 | /* ret == 2 */ | ||
772 | return print_bad_irq_dependency(curr, prev, next, | ||
773 | bit_backwards, bit_forwards, irqclass); | ||
774 | } | ||
775 | |||
776 | #endif | ||
777 | |||
778 | static int | ||
779 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | ||
780 | struct held_lock *next) | ||
781 | { | ||
782 | debug_locks_off(); | ||
783 | __raw_spin_unlock(&hash_lock); | ||
784 | if (debug_locks_silent) | ||
785 | return 0; | ||
786 | |||
787 | printk("\n=============================================\n"); | ||
788 | printk( "[ INFO: possible recursive locking detected ]\n"); | ||
789 | printk( "---------------------------------------------\n"); | ||
790 | printk("%s/%d is trying to acquire lock:\n", | ||
791 | curr->comm, curr->pid); | ||
792 | print_lock(next); | ||
793 | printk("\nbut task is already holding lock:\n"); | ||
794 | print_lock(prev); | ||
795 | |||
796 | printk("\nother info that might help us debug this:\n"); | ||
797 | lockdep_print_held_locks(curr); | ||
798 | |||
799 | printk("\nstack backtrace:\n"); | ||
800 | dump_stack(); | ||
801 | |||
802 | return 0; | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * Check whether we are holding such a class already. | ||
807 | * | ||
808 | * (Note that this has to be done separately, because the graph cannot | ||
809 | * detect such classes of deadlocks.) | ||
810 | * | ||
811 | * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read | ||
812 | */ | ||
813 | static int | ||
814 | check_deadlock(struct task_struct *curr, struct held_lock *next, | ||
815 | struct lockdep_map *next_instance, int read) | ||
816 | { | ||
817 | struct held_lock *prev; | ||
818 | int i; | ||
819 | |||
820 | for (i = 0; i < curr->lockdep_depth; i++) { | ||
821 | prev = curr->held_locks + i; | ||
822 | if (prev->class != next->class) | ||
823 | continue; | ||
824 | /* | ||
825 | * Allow read-after-read recursion of the same | ||
826 | * lock class (i.e. read_lock(lock)+read_lock(lock)): | ||
827 | */ | ||
828 | if ((read == 2) && prev->read) | ||
829 | return 2; | ||
830 | return print_deadlock_bug(curr, prev, next); | ||
831 | } | ||
832 | return 1; | ||
833 | } | ||
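The loop above is the whole self-deadlock check: the same class anywhere in the held-lock stack is a bug, unless the new acquisition is a recursive read nesting inside an earlier read. A minimal user-space sketch of that rule, using hypothetical simplified types rather than the kernel's held_lock:

#include <stdio.h>

/* Hypothetical, simplified stand-ins for lockdep's held_lock entries. */
struct held { int class_id; int read; };

/* 0: would self-deadlock, 1: ok, 2: recursive read allowed */
static int would_self_deadlock(const struct held *held, int depth,
                               int next_class, int next_read)
{
        for (int i = 0; i < depth; i++) {
                if (held[i].class_id != next_class)
                        continue;
                if (next_read == 2 && held[i].read)
                        return 2;       /* read-after-read recursion */
                return 0;               /* same class re-acquired: report it */
        }
        return 1;
}

int main(void)
{
        struct held stack[2] = { { 1, 1 }, { 2, 0 } };
        printf("%d\n", would_self_deadlock(stack, 2, 1, 2)); /* 2 */
        printf("%d\n", would_self_deadlock(stack, 2, 2, 0)); /* 0 */
        printf("%d\n", would_self_deadlock(stack, 2, 3, 0)); /* 1 */
        return 0;
}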
834 | |||
835 | /* | ||
836 | * There was a chain-cache miss, and we are about to add a new dependency | ||
837 | * to a previous lock. We recursively validate the following rules: | ||
838 | * | ||
839 | * - would the adding of the <prev> -> <next> dependency create a | ||
840 | * circular dependency in the graph? [== circular deadlock] | ||
841 | * | ||
842 | * - does the new prev->next dependency connect any hardirq-safe lock | ||
843 | * (in the full backwards-subgraph starting at <prev>) with any | ||
844 | * hardirq-unsafe lock (in the full forwards-subgraph starting at | ||
845 | * <next>)? [== illegal lock inversion with hardirq contexts] | ||
846 | * | ||
847 | * - does the new prev->next dependency connect any softirq-safe lock | ||
848 | * (in the full backwards-subgraph starting at <prev>) with any | ||
849 | * softirq-unsafe lock (in the full forwards-subgraph starting at | ||
850 | * <next>)? [== illegal lock inversion with softirq contexts] | ||
851 | * | ||
852 | * any of these scenarios could lead to a deadlock. | ||
853 | * | ||
854 | * Then if all the validations pass, we add the forwards and backwards | ||
855 | * dependency. | ||
856 | */ | ||
857 | static int | ||
858 | check_prev_add(struct task_struct *curr, struct held_lock *prev, | ||
859 | struct held_lock *next) | ||
860 | { | ||
861 | struct lock_list *entry; | ||
862 | int ret; | ||
863 | |||
864 | /* | ||
865 | * Prove that the new <prev> -> <next> dependency would not | ||
866 | * create a circular dependency in the graph. (We do this by | ||
867 | * forward-recursing into the graph starting at <next>, and | ||
868 | * checking whether we can reach <prev>.) | ||
869 | * | ||
870 | * We are using global variables to control the recursion, to | ||
871 | * keep the stackframe size of the recursive functions low: | ||
872 | */ | ||
873 | check_source = next; | ||
874 | check_target = prev; | ||
875 | if (!(check_noncircular(next->class, 0))) | ||
876 | return print_circular_bug_tail(); | ||
877 | |||
878 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
879 | /* | ||
880 | * Prove that the new dependency does not connect a hardirq-safe | ||
881 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
882 | * the backwards-subgraph starting at <prev>, and the | ||
883 | * forwards-subgraph starting at <next>: | ||
884 | */ | ||
885 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, | ||
886 | LOCK_ENABLED_HARDIRQS, "hard")) | ||
887 | return 0; | ||
888 | |||
889 | /* | ||
890 | * Prove that the new dependency does not connect a hardirq-safe-read | ||
891 | * lock with a hardirq-unsafe lock - to achieve this we search | ||
892 | * the backwards-subgraph starting at <prev>, and the | ||
893 | * forwards-subgraph starting at <next>: | ||
894 | */ | ||
895 | if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, | ||
896 | LOCK_ENABLED_HARDIRQS, "hard-read")) | ||
897 | return 0; | ||
898 | |||
899 | /* | ||
900 | * Prove that the new dependency does not connect a softirq-safe | ||
901 | * lock with a softirq-unsafe lock - to achieve this we search | ||
902 | * the backwards-subgraph starting at <prev>, and the | ||
903 | * forwards-subgraph starting at <next>: | ||
904 | */ | ||
905 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, | ||
906 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
907 | return 0; | ||
908 | /* | ||
909 | * Prove that the new dependency does not connect a softirq-safe-read | ||
910 | * lock with a softirq-unsafe lock - to achieve this we search | ||
911 | * the backwards-subgraph starting at <prev>, and the | ||
912 | * forwards-subgraph starting at <next>: | ||
913 | */ | ||
914 | if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, | ||
915 | LOCK_ENABLED_SOFTIRQS, "soft-read")) | ||
916 | return 0; | ||
917 | #endif | ||
918 | /* | ||
919 | * For recursive read-locks we do all the dependency checks, | ||
920 | * but we don't store read-triggered dependencies (only | ||
921 | * write-triggered dependencies). This ensures that only the | ||
922 | * write-side dependencies matter, and that if for example a | ||
923 | * write-lock never takes any other locks, then the reads are | ||
924 | * equivalent to a NOP. | ||
925 | */ | ||
926 | if (next->read == 2 || prev->read == 2) | ||
927 | return 1; | ||
928 | /* | ||
929 | * Is the <prev> -> <next> dependency already present? | ||
930 | * | ||
931 | * (this may occur even though this is a new chain: consider | ||
932 | * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3 | ||
933 | * chains - the second one will be new, but L1 already has | ||
934 | * L2 added to its dependency list, due to the first chain.) | ||
935 | */ | ||
936 | list_for_each_entry(entry, &prev->class->locks_after, entry) { | ||
937 | if (entry->class == next->class) | ||
938 | return 2; | ||
939 | } | ||
940 | |||
941 | /* | ||
942 | * Ok, all validations passed, add the new lock | ||
943 | * to the previous lock's dependency list: | ||
944 | */ | ||
945 | ret = add_lock_to_list(prev->class, next->class, | ||
946 | &prev->class->locks_after, next->acquire_ip); | ||
947 | if (!ret) | ||
948 | return 0; | ||
949 | /* | ||
950 | * Return value of 2 signals 'dependency already added', | ||
951 | * in that case we don't have to add the backlink either. | ||
952 | */ | ||
953 | if (ret == 2) | ||
954 | return 2; | ||
955 | ret = add_lock_to_list(next->class, prev->class, | ||
956 | &next->class->locks_before, next->acquire_ip); | ||
957 | |||
958 | /* | ||
959 | * Debugging printouts: | ||
960 | */ | ||
961 | if (verbose(prev->class) || verbose(next->class)) { | ||
962 | __raw_spin_unlock(&hash_lock); | ||
963 | printk("\n new dependency: "); | ||
964 | print_lock_name(prev->class); | ||
965 | printk(" => "); | ||
966 | print_lock_name(next->class); | ||
967 | printk("\n"); | ||
968 | dump_stack(); | ||
969 | __raw_spin_lock(&hash_lock); | ||
970 | } | ||
971 | return 1; | ||
972 | } | ||
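The first step above (check_noncircular driven through check_source/check_target) is plain cycle detection on the class graph before the new edge is committed. A standalone sketch of that idea, assuming a toy adjacency matrix instead of the locks_after lists and ignoring the read-lock and duplicate-edge handling:

#include <stdio.h>

#define NCLASS 4
static int after[NCLASS][NCLASS];       /* after[a][b]: dependency a -> b recorded */

/* Would adding src -> dst close a cycle? (search from dst back to src) */
static int reaches(int from, int target)
{
        if (from == target)
                return 1;
        for (int i = 0; i < NCLASS; i++)
                if (after[from][i] && reaches(i, target))
                        return 1;
        return 0;
}

static int add_dep(int prev, int next)
{
        if (reaches(next, prev)) {
                printf("circular dependency: %d -> %d\n", prev, next);
                return 0;               /* refuse the edge, keep the graph acyclic */
        }
        after[prev][next] = 1;
        return 1;
}

int main(void)
{
        add_dep(0, 1);
        add_dep(1, 2);
        return !add_dep(2, 0) ? 0 : 1;  /* reports the 0 -> 1 -> 2 -> 0 cycle */
}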
973 | |||
974 | /* | ||
975 | * Add the dependency to all directly-previous locks that are 'relevant'. | ||
976 | * The ones that are relevant are (in increasing distance from curr): | ||
977 | * all consecutive trylock entries and the final non-trylock entry - or | ||
978 | * the end of this context's lock-chain - whichever comes first. | ||
979 | */ | ||
980 | static int | ||
981 | check_prevs_add(struct task_struct *curr, struct held_lock *next) | ||
982 | { | ||
983 | int depth = curr->lockdep_depth; | ||
984 | struct held_lock *hlock; | ||
985 | |||
986 | /* | ||
987 | * Debugging checks. | ||
988 | * | ||
989 | * Depth must not be zero for a non-head lock: | ||
990 | */ | ||
991 | if (!depth) | ||
992 | goto out_bug; | ||
993 | /* | ||
994 | * At least two relevant locks must exist for this | ||
995 | * to be a head: | ||
996 | */ | ||
997 | if (curr->held_locks[depth].irq_context != | ||
998 | curr->held_locks[depth-1].irq_context) | ||
999 | goto out_bug; | ||
1000 | |||
1001 | for (;;) { | ||
1002 | hlock = curr->held_locks + depth-1; | ||
1003 | /* | ||
1004 | * Only non-recursive-read entries get new dependencies | ||
1005 | * added: | ||
1006 | */ | ||
1007 | if (hlock->read != 2) { | ||
1008 | check_prev_add(curr, hlock, next); | ||
1009 | /* | ||
1010 | * Stop after the first non-trylock entry, | ||
1011 | * as non-trylock entries have added their | ||
1012 | * own direct dependencies already, so this | ||
1013 | * lock is connected to them indirectly: | ||
1014 | */ | ||
1015 | if (!hlock->trylock) | ||
1016 | break; | ||
1017 | } | ||
1018 | depth--; | ||
1019 | /* | ||
1020 | * End of lock-stack? | ||
1021 | */ | ||
1022 | if (!depth) | ||
1023 | break; | ||
1024 | /* | ||
1025 | * Stop the search if we cross into another context: | ||
1026 | */ | ||
1027 | if (curr->held_locks[depth].irq_context != | ||
1028 | curr->held_locks[depth-1].irq_context) | ||
1029 | break; | ||
1030 | } | ||
1031 | return 1; | ||
1032 | out_bug: | ||
1033 | __raw_spin_unlock(&hash_lock); | ||
1034 | DEBUG_LOCKS_WARN_ON(1); | ||
1035 | |||
1036 | return 0; | ||
1037 | } | ||
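A compact sketch of which earlier entries receive the new dependency - the consecutive trylock entries at the top of the stack plus the first non-trylock one - ignoring the recursive-read skip and the irq-context boundary handled above (hypothetical simplified types):

#include <stdio.h>

struct entry { int class_id; int trylock; };

/* Link the new lock to all consecutive trylock entries below it plus the
 * first non-trylock entry, mirroring the loop above (simplified). */
static void link_relevant(const struct entry *stack, int depth, int next)
{
        for (int i = depth - 1; i >= 0; i--) {
                printf("dep: %d -> %d\n", stack[i].class_id, next);
                if (!stack[i].trylock)
                        break;
        }
}

int main(void)
{
        struct entry stack[] = { {1, 0}, {2, 0}, {3, 1}, {4, 1} };
        link_relevant(stack, 4, 5);     /* links 4->5, 3->5, 2->5 */
        return 0;
}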
1038 | |||
1039 | |||
1040 | /* | ||
1041 | * Is this the address of a static object: | ||
1042 | */ | ||
1043 | static int static_obj(void *obj) | ||
1044 | { | ||
1045 | unsigned long start = (unsigned long) &_stext, | ||
1046 | end = (unsigned long) &_end, | ||
1047 | addr = (unsigned long) obj; | ||
1048 | #ifdef CONFIG_SMP | ||
1049 | int i; | ||
1050 | #endif | ||
1051 | |||
1052 | /* | ||
1053 | * static variable? | ||
1054 | */ | ||
1055 | if ((addr >= start) && (addr < end)) | ||
1056 | return 1; | ||
1057 | |||
1058 | #ifdef CONFIG_SMP | ||
1059 | /* | ||
1060 | * percpu var? | ||
1061 | */ | ||
1062 | for_each_possible_cpu(i) { | ||
1063 | start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); | ||
1064 | end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); | ||
1065 | |||
1066 | if ((addr >= start) && (addr < end)) | ||
1067 | return 1; | ||
1068 | } | ||
1069 | #endif | ||
1070 | |||
1071 | /* | ||
1072 | * module var? | ||
1073 | */ | ||
1074 | return is_module_address(addr); | ||
1075 | } | ||
1076 | |||
1077 | /* | ||
1078 | * To make lock name printouts unique, we calculate a unique | ||
1079 | * class->name_version generation counter: | ||
1080 | */ | ||
1081 | static int count_matching_names(struct lock_class *new_class) | ||
1082 | { | ||
1083 | struct lock_class *class; | ||
1084 | int count = 0; | ||
1085 | |||
1086 | if (!new_class->name) | ||
1087 | return 0; | ||
1088 | |||
1089 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | ||
1090 | if (new_class->key - new_class->subclass == class->key) | ||
1091 | return class->name_version; | ||
1092 | if (class->name && !strcmp(class->name, new_class->name)) | ||
1093 | count = max(count, class->name_version); | ||
1094 | } | ||
1095 | |||
1096 | return count + 1; | ||
1097 | } | ||
1098 | |||
1099 | extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void); | ||
1100 | |||
1101 | /* | ||
1102 | * Look up a lock's class in the hash-table. Returns the class if it | ||
1103 | * is already registered, NULL otherwise; registering a new class is | ||
1104 | * done by register_lock_class() below. | ||
1105 | */ | ||
1106 | static inline struct lock_class * | ||
1107 | look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | ||
1108 | { | ||
1109 | struct lockdep_subclass_key *key; | ||
1110 | struct list_head *hash_head; | ||
1111 | struct lock_class *class; | ||
1112 | |||
1113 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
1114 | /* | ||
1115 | * If the architecture calls into lockdep before initializing | ||
1116 | * the hashes then we'll warn about it later. (we cannot printk | ||
1117 | * right now) | ||
1118 | */ | ||
1119 | if (unlikely(!lockdep_initialized)) { | ||
1120 | lockdep_init(); | ||
1121 | lockdep_init_error = 1; | ||
1122 | } | ||
1123 | #endif | ||
1124 | |||
1125 | /* | ||
1126 | * Static locks do not have their class-keys yet - for them the key | ||
1127 | * is the lock object itself: | ||
1128 | */ | ||
1129 | if (unlikely(!lock->key)) | ||
1130 | lock->key = (void *)lock; | ||
1131 | |||
1132 | /* | ||
1133 | * NOTE: the class-key must be unique. For dynamic locks, a static | ||
1134 | * lock_class_key variable is passed in through the mutex_init() | ||
1135 | * (or spin_lock_init()) call - which acts as the key. For static | ||
1136 | * locks we use the lock object itself as the key. | ||
1137 | */ | ||
1138 | if (sizeof(struct lock_class_key) > sizeof(struct lock_class)) | ||
1139 | __error_too_big_MAX_LOCKDEP_SUBCLASSES(); | ||
1140 | |||
1141 | key = lock->key->subkeys + subclass; | ||
1142 | |||
1143 | hash_head = classhashentry(key); | ||
1144 | |||
1145 | /* | ||
1146 | * We can walk the hash lockfree, because the hash only | ||
1147 | * grows, and we are careful when adding entries to the end: | ||
1148 | */ | ||
1149 | list_for_each_entry(class, hash_head, hash_entry) | ||
1150 | if (class->key == key) | ||
1151 | return class; | ||
1152 | |||
1153 | return NULL; | ||
1154 | } | ||
1155 | |||
1156 | /* | ||
1157 | * Register a lock's class in the hash-table, if the class is not present | ||
1158 | * yet. Otherwise we look it up. We cache the result in the lock object | ||
1159 | * itself, so actual lookup of the hash should be once per lock object. | ||
1160 | */ | ||
1161 | static inline struct lock_class * | ||
1162 | register_lock_class(struct lockdep_map *lock, unsigned int subclass) | ||
1163 | { | ||
1164 | struct lockdep_subclass_key *key; | ||
1165 | struct list_head *hash_head; | ||
1166 | struct lock_class *class; | ||
1167 | |||
1168 | class = look_up_lock_class(lock, subclass); | ||
1169 | if (likely(class)) | ||
1170 | return class; | ||
1171 | |||
1172 | /* | ||
1173 | * Debug-check: all keys must be persistent! | ||
1174 | */ | ||
1175 | if (!static_obj(lock->key)) { | ||
1176 | debug_locks_off(); | ||
1177 | printk("INFO: trying to register non-static key.\n"); | ||
1178 | printk("the code is fine but needs lockdep annotation.\n"); | ||
1179 | printk("turning off the locking correctness validator.\n"); | ||
1180 | dump_stack(); | ||
1181 | |||
1182 | return NULL; | ||
1183 | } | ||
1184 | |||
1185 | key = lock->key->subkeys + subclass; | ||
1186 | hash_head = classhashentry(key); | ||
1187 | |||
1188 | __raw_spin_lock(&hash_lock); | ||
1189 | /* | ||
1190 | * We have to do the hash-walk again, to avoid races | ||
1191 | * with another CPU: | ||
1192 | */ | ||
1193 | list_for_each_entry(class, hash_head, hash_entry) | ||
1194 | if (class->key == key) | ||
1195 | goto out_unlock_set; | ||
1196 | /* | ||
1197 | * Allocate a new key from the static array, and add it to | ||
1198 | * the hash: | ||
1199 | */ | ||
1200 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | ||
1201 | __raw_spin_unlock(&hash_lock); | ||
1202 | debug_locks_off(); | ||
1203 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); | ||
1204 | printk("turning off the locking correctness validator.\n"); | ||
1205 | return NULL; | ||
1206 | } | ||
1207 | class = lock_classes + nr_lock_classes++; | ||
1208 | debug_atomic_inc(&nr_unused_locks); | ||
1209 | class->key = key; | ||
1210 | class->name = lock->name; | ||
1211 | class->subclass = subclass; | ||
1212 | INIT_LIST_HEAD(&class->lock_entry); | ||
1213 | INIT_LIST_HEAD(&class->locks_before); | ||
1214 | INIT_LIST_HEAD(&class->locks_after); | ||
1215 | class->name_version = count_matching_names(class); | ||
1216 | /* | ||
1217 | * We use RCU's safe list-add method to make | ||
1218 | * parallel walking of the hash-list safe: | ||
1219 | */ | ||
1220 | list_add_tail_rcu(&class->hash_entry, hash_head); | ||
1221 | |||
1222 | if (verbose(class)) { | ||
1223 | __raw_spin_unlock(&hash_lock); | ||
1224 | printk("\nnew class %p: %s", class->key, class->name); | ||
1225 | if (class->name_version > 1) | ||
1226 | printk("#%d", class->name_version); | ||
1227 | printk("\n"); | ||
1228 | dump_stack(); | ||
1229 | __raw_spin_lock(&hash_lock); | ||
1230 | } | ||
1231 | out_unlock_set: | ||
1232 | __raw_spin_unlock(&hash_lock); | ||
1233 | |||
1234 | if (!subclass) | ||
1235 | lock->class_cache = class; | ||
1236 | |||
1237 | DEBUG_LOCKS_WARN_ON(class->subclass != subclass); | ||
1238 | |||
1239 | return class; | ||
1240 | } | ||
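The structure of register_lock_class() is a classic lookup / lock / re-check / insert sequence; the lock-free first walk is only safe because entries are added with list_add_tail_rcu() as above. A user-space analogue of the same pattern, with a pthread mutex standing in for hash_lock and none of the kernel's RCU or memory-ordering guarantees (the unlocked first walk is a simplification here):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define MAX_CLASSES 8

struct cls { const char *key; };

static struct cls classes[MAX_CLASSES];
static int nr_classes;
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct cls *look_up(const char *key)
{
        for (int i = 0; i < nr_classes; i++)
                if (strcmp(classes[i].key, key) == 0)
                        return &classes[i];
        return NULL;
}

static struct cls *register_cls(const char *key)
{
        struct cls *c = look_up(key);   /* fast path, no locking */

        if (c)
                return c;

        pthread_mutex_lock(&table_lock);
        c = look_up(key);               /* re-check under the lock */
        if (!c && nr_classes < MAX_CLASSES) {
                c = &classes[nr_classes++];
                c->key = key;
        }
        pthread_mutex_unlock(&table_lock);
        return c;
}

int main(void)
{
        printf("%p\n", (void *)register_cls("dev->lock"));
        printf("%p\n", (void *)register_cls("dev->lock")); /* same pointer */
        return 0;
}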
1241 | |||
1242 | /* | ||
1243 | * Look up a dependency chain. If the key is not present yet then | ||
1244 | * add it and return 0 - in this case the new dependency chain is | ||
1245 | * validated. If the key is already hashed, return 1. | ||
1246 | */ | ||
1247 | static inline int lookup_chain_cache(u64 chain_key) | ||
1248 | { | ||
1249 | struct list_head *hash_head = chainhashentry(chain_key); | ||
1250 | struct lock_chain *chain; | ||
1251 | |||
1252 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); | ||
1253 | /* | ||
1254 | * We can walk it lock-free, because entries only get added | ||
1255 | * to the hash: | ||
1256 | */ | ||
1257 | list_for_each_entry(chain, hash_head, entry) { | ||
1258 | if (chain->chain_key == chain_key) { | ||
1259 | cache_hit: | ||
1260 | debug_atomic_inc(&chain_lookup_hits); | ||
1261 | /* | ||
1262 | * In the debugging case, force redundant checking | ||
1263 | * by returning 1: | ||
1264 | */ | ||
1265 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
1266 | __raw_spin_lock(&hash_lock); | ||
1267 | return 1; | ||
1268 | #endif | ||
1269 | return 0; | ||
1270 | } | ||
1271 | } | ||
1272 | /* | ||
1273 | * Allocate a new chain entry from the static array, and add | ||
1274 | * it to the hash: | ||
1275 | */ | ||
1276 | __raw_spin_lock(&hash_lock); | ||
1277 | /* | ||
1278 | * We have to walk the chain again locked - to avoid duplicates: | ||
1279 | */ | ||
1280 | list_for_each_entry(chain, hash_head, entry) { | ||
1281 | if (chain->chain_key == chain_key) { | ||
1282 | __raw_spin_unlock(&hash_lock); | ||
1283 | goto cache_hit; | ||
1284 | } | ||
1285 | } | ||
1286 | if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { | ||
1287 | __raw_spin_unlock(&hash_lock); | ||
1288 | debug_locks_off(); | ||
1289 | printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); | ||
1290 | printk("turning off the locking correctness validator.\n"); | ||
1291 | return 0; | ||
1292 | } | ||
1293 | chain = lock_chains + nr_lock_chains++; | ||
1294 | chain->chain_key = chain_key; | ||
1295 | list_add_tail_rcu(&chain->entry, hash_head); | ||
1296 | debug_atomic_inc(&chain_lookup_misses); | ||
1297 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1298 | if (current->hardirq_context) | ||
1299 | nr_hardirq_chains++; | ||
1300 | else { | ||
1301 | if (current->softirq_context) | ||
1302 | nr_softirq_chains++; | ||
1303 | else | ||
1304 | nr_process_chains++; | ||
1305 | } | ||
1306 | #else | ||
1307 | nr_process_chains++; | ||
1308 | #endif | ||
1309 | |||
1310 | return 1; | ||
1311 | } | ||
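The chain cache is what keeps the validator cheap on hot paths: a 64-bit key summarising the current stack of class ids is looked up first, and only unseen keys trigger the full graph checks. A standalone sketch of the idea - the mixing function and the fixed-slot cache below are arbitrary stand-ins, not the kernel's iterate_chain_key() or lock_chains table:

#include <stdio.h>
#include <stdint.h>

/* Arbitrary 64-bit mixing step (stand-in for iterate_chain_key()). */
static uint64_t mix(uint64_t key, uint64_t id)
{
        return (key << 13 | key >> 51) ^ (id * 0x9e3779b97f4a7c15ULL);
}

#define CACHE_SIZE 64
static uint64_t cache[CACHE_SIZE];
static int cache_used[CACHE_SIZE];

/* Returns 1 if the chain is new (caller should run the full checks). */
static int lookup_chain(uint64_t key)
{
        unsigned int slot = key % CACHE_SIZE;

        if (cache_used[slot] && cache[slot] == key)
                return 0;               /* hit: already validated       */
        cache[slot] = key;              /* miss: remember it (collisions */
        cache_used[slot] = 1;           /* simply overwrite in this toy) */
        return 1;
}

int main(void)
{
        int classes[] = { 3, 7, 12 };
        uint64_t key = 0;

        for (int i = 0; i < 3; i++)
                key = mix(key, classes[i]);

        printf("first:  %d\n", lookup_chain(key));   /* 1: validate */
        printf("second: %d\n", lookup_chain(key));   /* 0: cached   */
        return 0;
}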
1312 | |||
1313 | /* | ||
1314 | * We are building curr_chain_key incrementally, so double-check | ||
1315 | * it from scratch, to make sure that it's done correctly: | ||
1316 | */ | ||
1317 | static void check_chain_key(struct task_struct *curr) | ||
1318 | { | ||
1319 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
1320 | struct held_lock *hlock, *prev_hlock = NULL; | ||
1321 | unsigned int i, id; | ||
1322 | u64 chain_key = 0; | ||
1323 | |||
1324 | for (i = 0; i < curr->lockdep_depth; i++) { | ||
1325 | hlock = curr->held_locks + i; | ||
1326 | if (chain_key != hlock->prev_chain_key) { | ||
1327 | debug_locks_off(); | ||
1328 | printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", | ||
1329 | curr->lockdep_depth, i, | ||
1330 | (unsigned long long)chain_key, | ||
1331 | (unsigned long long)hlock->prev_chain_key); | ||
1332 | WARN_ON(1); | ||
1333 | return; | ||
1334 | } | ||
1335 | id = hlock->class - lock_classes; | ||
1336 | DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS); | ||
1337 | if (prev_hlock && (prev_hlock->irq_context != | ||
1338 | hlock->irq_context)) | ||
1339 | chain_key = 0; | ||
1340 | chain_key = iterate_chain_key(chain_key, id); | ||
1341 | prev_hlock = hlock; | ||
1342 | } | ||
1343 | if (chain_key != curr->curr_chain_key) { | ||
1344 | debug_locks_off(); | ||
1345 | printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", | ||
1346 | curr->lockdep_depth, i, | ||
1347 | (unsigned long long)chain_key, | ||
1348 | (unsigned long long)curr->curr_chain_key); | ||
1349 | WARN_ON(1); | ||
1350 | } | ||
1351 | #endif | ||
1352 | } | ||
1353 | |||
1354 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1355 | |||
1356 | /* | ||
1357 | * print irq inversion bug: | ||
1358 | */ | ||
1359 | static int | ||
1360 | print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, | ||
1361 | struct held_lock *this, int forwards, | ||
1362 | const char *irqclass) | ||
1363 | { | ||
1364 | __raw_spin_unlock(&hash_lock); | ||
1365 | debug_locks_off(); | ||
1366 | if (debug_locks_silent) | ||
1367 | return 0; | ||
1368 | |||
1369 | printk("\n=========================================================\n"); | ||
1370 | printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); | ||
1371 | printk( "---------------------------------------------------------\n"); | ||
1372 | printk("%s/%d just changed the state of lock:\n", | ||
1373 | curr->comm, curr->pid); | ||
1374 | print_lock(this); | ||
1375 | if (forwards) | ||
1376 | printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); | ||
1377 | else | ||
1378 | printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); | ||
1379 | print_lock_name(other); | ||
1380 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); | ||
1381 | |||
1382 | printk("\nother info that might help us debug this:\n"); | ||
1383 | lockdep_print_held_locks(curr); | ||
1384 | |||
1385 | printk("\nthe first lock's dependencies:\n"); | ||
1386 | print_lock_dependencies(this->class, 0); | ||
1387 | |||
1388 | printk("\nthe second lock's dependencies:\n"); | ||
1389 | print_lock_dependencies(other, 0); | ||
1390 | |||
1391 | printk("\nstack backtrace:\n"); | ||
1392 | dump_stack(); | ||
1393 | |||
1394 | return 0; | ||
1395 | } | ||
1396 | |||
1397 | /* | ||
1398 | * Prove that in the forwards-direction subgraph starting at <this> | ||
1399 | * there is no lock matching <mask>: | ||
1400 | */ | ||
1401 | static int | ||
1402 | check_usage_forwards(struct task_struct *curr, struct held_lock *this, | ||
1403 | enum lock_usage_bit bit, const char *irqclass) | ||
1404 | { | ||
1405 | int ret; | ||
1406 | |||
1407 | find_usage_bit = bit; | ||
1408 | /* fills in <forwards_match> */ | ||
1409 | ret = find_usage_forwards(this->class, 0); | ||
1410 | if (!ret || ret == 1) | ||
1411 | return ret; | ||
1412 | |||
1413 | return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); | ||
1414 | } | ||
1415 | |||
1416 | /* | ||
1417 | * Prove that in the backwards-direction subgraph starting at <this> | ||
1418 | * there is no lock matching <mask>: | ||
1419 | */ | ||
1420 | static int | ||
1421 | check_usage_backwards(struct task_struct *curr, struct held_lock *this, | ||
1422 | enum lock_usage_bit bit, const char *irqclass) | ||
1423 | { | ||
1424 | int ret; | ||
1425 | |||
1426 | find_usage_bit = bit; | ||
1427 | /* fills in <backwards_match> */ | ||
1428 | ret = find_usage_backwards(this->class, 0); | ||
1429 | if (!ret || ret == 1) | ||
1430 | return ret; | ||
1431 | |||
1432 | return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); | ||
1433 | } | ||
1434 | |||
1435 | static inline void print_irqtrace_events(struct task_struct *curr) | ||
1436 | { | ||
1437 | printk("irq event stamp: %u\n", curr->irq_events); | ||
1438 | printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); | ||
1439 | print_ip_sym(curr->hardirq_enable_ip); | ||
1440 | printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); | ||
1441 | print_ip_sym(curr->hardirq_disable_ip); | ||
1442 | printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); | ||
1443 | print_ip_sym(curr->softirq_enable_ip); | ||
1444 | printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); | ||
1445 | print_ip_sym(curr->softirq_disable_ip); | ||
1446 | } | ||
1447 | |||
1448 | #else | ||
1449 | static inline void print_irqtrace_events(struct task_struct *curr) | ||
1450 | { | ||
1451 | } | ||
1452 | #endif | ||
1453 | |||
1454 | static int | ||
1455 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | ||
1456 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | ||
1457 | { | ||
1458 | __raw_spin_unlock(&hash_lock); | ||
1459 | debug_locks_off(); | ||
1460 | if (debug_locks_silent) | ||
1461 | return 0; | ||
1462 | |||
1463 | printk("\n=================================\n"); | ||
1464 | printk( "[ INFO: inconsistent lock state ]\n"); | ||
1465 | printk( "---------------------------------\n"); | ||
1466 | |||
1467 | printk("inconsistent {%s} -> {%s} usage.\n", | ||
1468 | usage_str[prev_bit], usage_str[new_bit]); | ||
1469 | |||
1470 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", | ||
1471 | curr->comm, curr->pid, | ||
1472 | trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, | ||
1473 | trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, | ||
1474 | trace_hardirqs_enabled(curr), | ||
1475 | trace_softirqs_enabled(curr)); | ||
1476 | print_lock(this); | ||
1477 | |||
1478 | printk("{%s} state was registered at:\n", usage_str[prev_bit]); | ||
1479 | print_stack_trace(this->class->usage_traces + prev_bit, 1); | ||
1480 | |||
1481 | print_irqtrace_events(curr); | ||
1482 | printk("\nother info that might help us debug this:\n"); | ||
1483 | lockdep_print_held_locks(curr); | ||
1484 | |||
1485 | printk("\nstack backtrace:\n"); | ||
1486 | dump_stack(); | ||
1487 | |||
1488 | return 0; | ||
1489 | } | ||
1490 | |||
1491 | /* | ||
1492 | * Print out an error if an invalid bit is set: | ||
1493 | */ | ||
1494 | static inline int | ||
1495 | valid_state(struct task_struct *curr, struct held_lock *this, | ||
1496 | enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) | ||
1497 | { | ||
1498 | if (unlikely(this->class->usage_mask & (1 << bad_bit))) | ||
1499 | return print_usage_bug(curr, this, bad_bit, new_bit); | ||
1500 | return 1; | ||
1501 | } | ||
1502 | |||
1503 | #define STRICT_READ_CHECKS 1 | ||
1504 | |||
1505 | /* | ||
1506 | * Mark a lock with a usage bit, and validate the state transition: | ||
1507 | */ | ||
1508 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | ||
1509 | enum lock_usage_bit new_bit, unsigned long ip) | ||
1510 | { | ||
1511 | unsigned int new_mask = 1 << new_bit, ret = 1; | ||
1512 | |||
1513 | /* | ||
1514 | * If already set then do not dirty the cacheline, | ||
1515 | * nor do any checks: | ||
1516 | */ | ||
1517 | if (likely(this->class->usage_mask & new_mask)) | ||
1518 | return 1; | ||
1519 | |||
1520 | __raw_spin_lock(&hash_lock); | ||
1521 | /* | ||
1522 | * Make sure we didn't race: | ||
1523 | */ | ||
1524 | if (unlikely(this->class->usage_mask & new_mask)) { | ||
1525 | __raw_spin_unlock(&hash_lock); | ||
1526 | return 1; | ||
1527 | } | ||
1528 | |||
1529 | this->class->usage_mask |= new_mask; | ||
1530 | |||
1531 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1532 | if (new_bit == LOCK_ENABLED_HARDIRQS || | ||
1533 | new_bit == LOCK_ENABLED_HARDIRQS_READ) | ||
1534 | ip = curr->hardirq_enable_ip; | ||
1535 | else if (new_bit == LOCK_ENABLED_SOFTIRQS || | ||
1536 | new_bit == LOCK_ENABLED_SOFTIRQS_READ) | ||
1537 | ip = curr->softirq_enable_ip; | ||
1538 | #endif | ||
1539 | if (!save_trace(this->class->usage_traces + new_bit)) | ||
1540 | return 0; | ||
1541 | |||
1542 | switch (new_bit) { | ||
1543 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1544 | case LOCK_USED_IN_HARDIRQ: | ||
1545 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) | ||
1546 | return 0; | ||
1547 | if (!valid_state(curr, this, new_bit, | ||
1548 | LOCK_ENABLED_HARDIRQS_READ)) | ||
1549 | return 0; | ||
1550 | /* | ||
1551 | * just marked it hardirq-safe, check that this lock | ||
1552 | * took no hardirq-unsafe lock in the past: | ||
1553 | */ | ||
1554 | if (!check_usage_forwards(curr, this, | ||
1555 | LOCK_ENABLED_HARDIRQS, "hard")) | ||
1556 | return 0; | ||
1557 | #if STRICT_READ_CHECKS | ||
1558 | /* | ||
1559 | * just marked it hardirq-safe, check that this lock | ||
1560 | * took no hardirq-unsafe-read lock in the past: | ||
1561 | */ | ||
1562 | if (!check_usage_forwards(curr, this, | ||
1563 | LOCK_ENABLED_HARDIRQS_READ, "hard-read")) | ||
1564 | return 0; | ||
1565 | #endif | ||
1566 | if (hardirq_verbose(this->class)) | ||
1567 | ret = 2; | ||
1568 | break; | ||
1569 | case LOCK_USED_IN_SOFTIRQ: | ||
1570 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) | ||
1571 | return 0; | ||
1572 | if (!valid_state(curr, this, new_bit, | ||
1573 | LOCK_ENABLED_SOFTIRQS_READ)) | ||
1574 | return 0; | ||
1575 | /* | ||
1576 | * just marked it softirq-safe, check that this lock | ||
1577 | * took no softirq-unsafe lock in the past: | ||
1578 | */ | ||
1579 | if (!check_usage_forwards(curr, this, | ||
1580 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
1581 | return 0; | ||
1582 | #if STRICT_READ_CHECKS | ||
1583 | /* | ||
1584 | * just marked it softirq-safe, check that this lock | ||
1585 | * took no softirq-unsafe-read lock in the past: | ||
1586 | */ | ||
1587 | if (!check_usage_forwards(curr, this, | ||
1588 | LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) | ||
1589 | return 0; | ||
1590 | #endif | ||
1591 | if (softirq_verbose(this->class)) | ||
1592 | ret = 2; | ||
1593 | break; | ||
1594 | case LOCK_USED_IN_HARDIRQ_READ: | ||
1595 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) | ||
1596 | return 0; | ||
1597 | /* | ||
1598 | * just marked it hardirq-read-safe, check that this lock | ||
1599 | * took no hardirq-unsafe lock in the past: | ||
1600 | */ | ||
1601 | if (!check_usage_forwards(curr, this, | ||
1602 | LOCK_ENABLED_HARDIRQS, "hard")) | ||
1603 | return 0; | ||
1604 | if (hardirq_verbose(this->class)) | ||
1605 | ret = 2; | ||
1606 | break; | ||
1607 | case LOCK_USED_IN_SOFTIRQ_READ: | ||
1608 | if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS)) | ||
1609 | return 0; | ||
1610 | /* | ||
1611 | * just marked it softirq-read-safe, check that this lock | ||
1612 | * took no softirq-unsafe lock in the past: | ||
1613 | */ | ||
1614 | if (!check_usage_forwards(curr, this, | ||
1615 | LOCK_ENABLED_SOFTIRQS, "soft")) | ||
1616 | return 0; | ||
1617 | if (softirq_verbose(this->class)) | ||
1618 | ret = 2; | ||
1619 | break; | ||
1620 | case LOCK_ENABLED_HARDIRQS: | ||
1621 | if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) | ||
1622 | return 0; | ||
1623 | if (!valid_state(curr, this, new_bit, | ||
1624 | LOCK_USED_IN_HARDIRQ_READ)) | ||
1625 | return 0; | ||
1626 | /* | ||
1627 | * just marked it hardirq-unsafe, check that no hardirq-safe | ||
1628 | * lock in the system ever took it in the past: | ||
1629 | */ | ||
1630 | if (!check_usage_backwards(curr, this, | ||
1631 | LOCK_USED_IN_HARDIRQ, "hard")) | ||
1632 | return 0; | ||
1633 | #if STRICT_READ_CHECKS | ||
1634 | /* | ||
1635 | * just marked it hardirq-unsafe, check that no | ||
1636 | * hardirq-safe-read lock in the system ever took | ||
1637 | * it in the past: | ||
1638 | */ | ||
1639 | if (!check_usage_backwards(curr, this, | ||
1640 | LOCK_USED_IN_HARDIRQ_READ, "hard-read")) | ||
1641 | return 0; | ||
1642 | #endif | ||
1643 | if (hardirq_verbose(this->class)) | ||
1644 | ret = 2; | ||
1645 | break; | ||
1646 | case LOCK_ENABLED_SOFTIRQS: | ||
1647 | if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) | ||
1648 | return 0; | ||
1649 | if (!valid_state(curr, this, new_bit, | ||
1650 | LOCK_USED_IN_SOFTIRQ_READ)) | ||
1651 | return 0; | ||
1652 | /* | ||
1653 | * just marked it softirq-unsafe, check that no softirq-safe | ||
1654 | * lock in the system ever took it in the past: | ||
1655 | */ | ||
1656 | if (!check_usage_backwards(curr, this, | ||
1657 | LOCK_USED_IN_SOFTIRQ, "soft")) | ||
1658 | return 0; | ||
1659 | #if STRICT_READ_CHECKS | ||
1660 | /* | ||
1661 | * just marked it softirq-unsafe, check that no | ||
1662 | * softirq-safe-read lock in the system ever took | ||
1663 | * it in the past: | ||
1664 | */ | ||
1665 | if (!check_usage_backwards(curr, this, | ||
1666 | LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) | ||
1667 | return 0; | ||
1668 | #endif | ||
1669 | if (softirq_verbose(this->class)) | ||
1670 | ret = 2; | ||
1671 | break; | ||
1672 | case LOCK_ENABLED_HARDIRQS_READ: | ||
1673 | if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ)) | ||
1674 | return 0; | ||
1675 | #if STRICT_READ_CHECKS | ||
1676 | /* | ||
1677 | * just marked it hardirq-read-unsafe, check that no | ||
1678 | * hardirq-safe lock in the system ever took it in the past: | ||
1679 | */ | ||
1680 | if (!check_usage_backwards(curr, this, | ||
1681 | LOCK_USED_IN_HARDIRQ, "hard")) | ||
1682 | return 0; | ||
1683 | #endif | ||
1684 | if (hardirq_verbose(this->class)) | ||
1685 | ret = 2; | ||
1686 | break; | ||
1687 | case LOCK_ENABLED_SOFTIRQS_READ: | ||
1688 | if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ)) | ||
1689 | return 0; | ||
1690 | #if STRICT_READ_CHECKS | ||
1691 | /* | ||
1692 | * just marked it softirq-read-unsafe, check that no | ||
1693 | * softirq-safe lock in the system ever took it in the past: | ||
1694 | */ | ||
1695 | if (!check_usage_backwards(curr, this, | ||
1696 | LOCK_USED_IN_SOFTIRQ, "soft")) | ||
1697 | return 0; | ||
1698 | #endif | ||
1699 | if (softirq_verbose(this->class)) | ||
1700 | ret = 2; | ||
1701 | break; | ||
1702 | #endif | ||
1703 | case LOCK_USED: | ||
1704 | /* | ||
1705 | * Add it to the global list of classes: | ||
1706 | */ | ||
1707 | list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); | ||
1708 | debug_atomic_dec(&nr_unused_locks); | ||
1709 | break; | ||
1710 | default: | ||
1711 | debug_locks_off(); | ||
1712 | WARN_ON(1); | ||
1713 | return 0; | ||
1714 | } | ||
1715 | |||
1716 | __raw_spin_unlock(&hash_lock); | ||
1717 | |||
1718 | /* | ||
1719 | * We must printk outside of the hash_lock: | ||
1720 | */ | ||
1721 | if (ret == 2) { | ||
1722 | printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); | ||
1723 | print_lock(this); | ||
1724 | print_irqtrace_events(curr); | ||
1725 | dump_stack(); | ||
1726 | } | ||
1727 | |||
1728 | return ret; | ||
1729 | } | ||
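At its core the switch above enforces pairs of mutually exclusive usage bits per class (e.g. used-in-hardirq vs. hardirqs-enabled), recording each bit once and reporting the first inconsistent transition. A reduced sketch with just one such pair, no read variants and no stack traces (names simplified):

#include <stdio.h>

enum { USED_IN_HARDIRQ = 1 << 0, ENABLED_HARDIRQS = 1 << 1 };

static int mark(unsigned int *mask, unsigned int new_bit,
                unsigned int bad_bit)
{
        if (*mask & new_bit)
                return 1;               /* already known, nothing to do */
        if (*mask & bad_bit)
                return 0;               /* inconsistent {old} -> {new}  */
        *mask |= new_bit;
        return 1;
}

int main(void)
{
        unsigned int mask = 0;

        /* lock first seen with hardirqs enabled ... */
        printf("%d\n", mark(&mask, ENABLED_HARDIRQS, USED_IN_HARDIRQ)); /* 1 */
        /* ... later taken from a hardirq handler: report it. */
        printf("%d\n", mark(&mask, USED_IN_HARDIRQ, ENABLED_HARDIRQS)); /* 0 */
        return 0;
}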
1730 | |||
1731 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
1732 | /* | ||
1733 | * Mark all held locks with a usage bit: | ||
1734 | */ | ||
1735 | static int | ||
1736 | mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) | ||
1737 | { | ||
1738 | enum lock_usage_bit usage_bit; | ||
1739 | struct held_lock *hlock; | ||
1740 | int i; | ||
1741 | |||
1742 | for (i = 0; i < curr->lockdep_depth; i++) { | ||
1743 | hlock = curr->held_locks + i; | ||
1744 | |||
1745 | if (hardirq) { | ||
1746 | if (hlock->read) | ||
1747 | usage_bit = LOCK_ENABLED_HARDIRQS_READ; | ||
1748 | else | ||
1749 | usage_bit = LOCK_ENABLED_HARDIRQS; | ||
1750 | } else { | ||
1751 | if (hlock->read) | ||
1752 | usage_bit = LOCK_ENABLED_SOFTIRQS_READ; | ||
1753 | else | ||
1754 | usage_bit = LOCK_ENABLED_SOFTIRQS; | ||
1755 | } | ||
1756 | if (!mark_lock(curr, hlock, usage_bit, ip)) | ||
1757 | return 0; | ||
1758 | } | ||
1759 | |||
1760 | return 1; | ||
1761 | } | ||
1762 | |||
1763 | /* | ||
1764 | * Debugging helper: via this flag we know that we are in | ||
1765 | * 'early bootup code', and will warn about any invalid irqs-on event: | ||
1766 | */ | ||
1767 | static int early_boot_irqs_enabled; | ||
1768 | |||
1769 | void early_boot_irqs_off(void) | ||
1770 | { | ||
1771 | early_boot_irqs_enabled = 0; | ||
1772 | } | ||
1773 | |||
1774 | void early_boot_irqs_on(void) | ||
1775 | { | ||
1776 | early_boot_irqs_enabled = 1; | ||
1777 | } | ||
1778 | |||
1779 | /* | ||
1780 | * Hardirqs will be enabled: | ||
1781 | */ | ||
1782 | void trace_hardirqs_on(void) | ||
1783 | { | ||
1784 | struct task_struct *curr = current; | ||
1785 | unsigned long ip; | ||
1786 | |||
1787 | if (unlikely(!debug_locks || current->lockdep_recursion)) | ||
1788 | return; | ||
1789 | |||
1790 | if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) | ||
1791 | return; | ||
1792 | |||
1793 | if (unlikely(curr->hardirqs_enabled)) { | ||
1794 | debug_atomic_inc(&redundant_hardirqs_on); | ||
1795 | return; | ||
1796 | } | ||
1797 | /* we'll do an OFF -> ON transition: */ | ||
1798 | curr->hardirqs_enabled = 1; | ||
1799 | ip = (unsigned long) __builtin_return_address(0); | ||
1800 | |||
1801 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
1802 | return; | ||
1803 | if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) | ||
1804 | return; | ||
1805 | /* | ||
1806 | * We are going to turn hardirqs on, so set the | ||
1807 | * usage bit for all held locks: | ||
1808 | */ | ||
1809 | if (!mark_held_locks(curr, 1, ip)) | ||
1810 | return; | ||
1811 | /* | ||
1812 | * If we have softirqs enabled, then set the usage | ||
1813 | * bit for all held locks. (disabled hardirqs prevented | ||
1814 | * this bit from being set before) | ||
1815 | */ | ||
1816 | if (curr->softirqs_enabled) | ||
1817 | if (!mark_held_locks(curr, 0, ip)) | ||
1818 | return; | ||
1819 | |||
1820 | curr->hardirq_enable_ip = ip; | ||
1821 | curr->hardirq_enable_event = ++curr->irq_events; | ||
1822 | debug_atomic_inc(&hardirqs_on_events); | ||
1823 | } | ||
1824 | |||
1825 | EXPORT_SYMBOL(trace_hardirqs_on); | ||
1826 | |||
1827 | /* | ||
1828 | * Hardirqs were disabled: | ||
1829 | */ | ||
1830 | void trace_hardirqs_off(void) | ||
1831 | { | ||
1832 | struct task_struct *curr = current; | ||
1833 | |||
1834 | if (unlikely(!debug_locks || current->lockdep_recursion)) | ||
1835 | return; | ||
1836 | |||
1837 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
1838 | return; | ||
1839 | |||
1840 | if (curr->hardirqs_enabled) { | ||
1841 | /* | ||
1842 | * We have done an ON -> OFF transition: | ||
1843 | */ | ||
1844 | curr->hardirqs_enabled = 0; | ||
1845 | curr->hardirq_disable_ip = _RET_IP_; | ||
1846 | curr->hardirq_disable_event = ++curr->irq_events; | ||
1847 | debug_atomic_inc(&hardirqs_off_events); | ||
1848 | } else | ||
1849 | debug_atomic_inc(&redundant_hardirqs_off); | ||
1850 | } | ||
1851 | |||
1852 | EXPORT_SYMBOL(trace_hardirqs_off); | ||
1853 | |||
1854 | /* | ||
1855 | * Softirqs will be enabled: | ||
1856 | */ | ||
1857 | void trace_softirqs_on(unsigned long ip) | ||
1858 | { | ||
1859 | struct task_struct *curr = current; | ||
1860 | |||
1861 | if (unlikely(!debug_locks)) | ||
1862 | return; | ||
1863 | |||
1864 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
1865 | return; | ||
1866 | |||
1867 | if (curr->softirqs_enabled) { | ||
1868 | debug_atomic_inc(&redundant_softirqs_on); | ||
1869 | return; | ||
1870 | } | ||
1871 | |||
1872 | /* | ||
1873 | * We'll do an OFF -> ON transition: | ||
1874 | */ | ||
1875 | curr->softirqs_enabled = 1; | ||
1876 | curr->softirq_enable_ip = ip; | ||
1877 | curr->softirq_enable_event = ++curr->irq_events; | ||
1878 | debug_atomic_inc(&softirqs_on_events); | ||
1879 | /* | ||
1880 | * We are going to turn softirqs on, so set the | ||
1881 | * usage bit for all held locks, if hardirqs are | ||
1882 | * enabled too: | ||
1883 | */ | ||
1884 | if (curr->hardirqs_enabled) | ||
1885 | mark_held_locks(curr, 0, ip); | ||
1886 | } | ||
1887 | |||
1888 | /* | ||
1889 | * Softirqs were disabled: | ||
1890 | */ | ||
1891 | void trace_softirqs_off(unsigned long ip) | ||
1892 | { | ||
1893 | struct task_struct *curr = current; | ||
1894 | |||
1895 | if (unlikely(!debug_locks)) | ||
1896 | return; | ||
1897 | |||
1898 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
1899 | return; | ||
1900 | |||
1901 | if (curr->softirqs_enabled) { | ||
1902 | /* | ||
1903 | * We have done an ON -> OFF transition: | ||
1904 | */ | ||
1905 | curr->softirqs_enabled = 0; | ||
1906 | curr->softirq_disable_ip = ip; | ||
1907 | curr->softirq_disable_event = ++curr->irq_events; | ||
1908 | debug_atomic_inc(&softirqs_off_events); | ||
1909 | DEBUG_LOCKS_WARN_ON(!softirq_count()); | ||
1910 | } else | ||
1911 | debug_atomic_inc(&redundant_softirqs_off); | ||
1912 | } | ||
1913 | |||
1914 | #endif | ||
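The hardirq/softirq hooks above maintain a per-task shadow of the irq state: an enabled flag, the ip and an event stamp of the last ON and OFF transition, and counters for redundant calls. A single-threaded sketch of that bookkeeping (the kernel keeps these as task_struct fields and also re-marks the held locks on each OFF -> ON transition):

#include <stdio.h>

struct irq_shadow {
        int enabled;
        unsigned int events, enable_event, disable_event;
        unsigned int redundant_on, redundant_off;
};

static void irqs_on(struct irq_shadow *s)
{
        if (s->enabled) {
                s->redundant_on++;      /* ON -> ON: just count it */
                return;
        }
        s->enabled = 1;
        s->enable_event = ++s->events;
}

static void irqs_off(struct irq_shadow *s)
{
        if (!s->enabled) {
                s->redundant_off++;     /* OFF -> OFF: just count it */
                return;
        }
        s->enabled = 0;
        s->disable_event = ++s->events;
}

int main(void)
{
        struct irq_shadow s = { .enabled = 1 };

        irqs_off(&s);
        irqs_off(&s);                   /* redundant */
        irqs_on(&s);
        printf("stamp=%u on@%u off@%u redundant_off=%u\n",
               s.events, s.enable_event, s.disable_event, s.redundant_off);
        return 0;
}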
1915 | |||
1916 | /* | ||
1917 | * Initialize a lock instance's lock-class mapping info: | ||
1918 | */ | ||
1919 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | ||
1920 | struct lock_class_key *key) | ||
1921 | { | ||
1922 | if (unlikely(!debug_locks)) | ||
1923 | return; | ||
1924 | |||
1925 | if (DEBUG_LOCKS_WARN_ON(!key)) | ||
1926 | return; | ||
1927 | if (DEBUG_LOCKS_WARN_ON(!name)) | ||
1928 | return; | ||
1929 | /* | ||
1930 | * Sanity check, the lock-class key must be persistent: | ||
1931 | */ | ||
1932 | if (!static_obj(key)) { | ||
1933 | printk("BUG: key %p not in .data!\n", key); | ||
1934 | DEBUG_LOCKS_WARN_ON(1); | ||
1935 | return; | ||
1936 | } | ||
1937 | lock->name = name; | ||
1938 | lock->key = key; | ||
1939 | lock->class_cache = NULL; | ||
1940 | } | ||
1941 | |||
1942 | EXPORT_SYMBOL_GPL(lockdep_init_map); | ||
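lockdep_init_map() insists on a persistent key; the lock-init macros typically provide one by declaring a static lock_class_key at each invocation site, so every distinct init site maps to its own class (the exact kernel macros may differ). A user-space analogue of that trick, with hypothetical names:

#include <stdio.h>

struct key { char dummy; };
struct map { const char *name; struct key *key; };

static void map_init(struct map *m, const char *name, struct key *k)
{
        m->name = name;
        m->key = k;
}

/* One static key per invocation site - each macro expansion gets its own
 * block-scope static object, and hence its own class identity. */
#define MAP_INIT(m) do {                                \
        static struct key __key;                        \
        map_init((m), #m, &__key);                      \
} while (0)

int main(void)
{
        struct map a, b;

        MAP_INIT(&a);
        MAP_INIT(&b);
        printf("%s key=%p\n", a.name, (void *)a.key);   /* distinct ...   */
        printf("%s key=%p\n", b.name, (void *)b.key);   /* ... addresses  */
        return 0;
}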
1943 | |||
1944 | /* | ||
1945 | * This gets called for every mutex_lock*()/spin_lock*() operation. | ||
1946 | * We maintain the dependency maps and validate the locking attempt: | ||
1947 | */ | ||
1948 | static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | ||
1949 | int trylock, int read, int check, int hardirqs_off, | ||
1950 | unsigned long ip) | ||
1951 | { | ||
1952 | struct task_struct *curr = current; | ||
1953 | struct lock_class *class = NULL; | ||
1954 | struct held_lock *hlock; | ||
1955 | unsigned int depth, id; | ||
1956 | int chain_head = 0; | ||
1957 | u64 chain_key; | ||
1958 | |||
1959 | if (unlikely(!debug_locks)) | ||
1960 | return 0; | ||
1961 | |||
1962 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
1963 | return 0; | ||
1964 | |||
1965 | if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { | ||
1966 | debug_locks_off(); | ||
1967 | printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); | ||
1968 | printk("turning off the locking correctness validator.\n"); | ||
1969 | return 0; | ||
1970 | } | ||
1971 | |||
1972 | if (!subclass) | ||
1973 | class = lock->class_cache; | ||
1974 | /* | ||
1975 | * Not cached yet, or a non-zero subclass? | ||
1976 | */ | ||
1977 | if (unlikely(!class)) { | ||
1978 | class = register_lock_class(lock, subclass); | ||
1979 | if (!class) | ||
1980 | return 0; | ||
1981 | } | ||
1982 | debug_atomic_inc((atomic_t *)&class->ops); | ||
1983 | if (very_verbose(class)) { | ||
1984 | printk("\nacquire class [%p] %s", class->key, class->name); | ||
1985 | if (class->name_version > 1) | ||
1986 | printk("#%d", class->name_version); | ||
1987 | printk("\n"); | ||
1988 | dump_stack(); | ||
1989 | } | ||
1990 | |||
1991 | /* | ||
1992 | * Add the lock to the list of currently held locks. | ||
1993 | * (we don't increase the depth just yet, up until the | ||
1994 | * dependency checks are done) | ||
1995 | */ | ||
1996 | depth = curr->lockdep_depth; | ||
1997 | if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) | ||
1998 | return 0; | ||
1999 | |||
2000 | hlock = curr->held_locks + depth; | ||
2001 | |||
2002 | hlock->class = class; | ||
2003 | hlock->acquire_ip = ip; | ||
2004 | hlock->instance = lock; | ||
2005 | hlock->trylock = trylock; | ||
2006 | hlock->read = read; | ||
2007 | hlock->check = check; | ||
2008 | hlock->hardirqs_off = hardirqs_off; | ||
2009 | |||
2010 | if (check != 2) | ||
2011 | goto out_calc_hash; | ||
2012 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
2013 | /* | ||
2014 | * If non-trylock use in a hardirq or softirq context, then | ||
2015 | * mark the lock as used in these contexts: | ||
2016 | */ | ||
2017 | if (!trylock) { | ||
2018 | if (read) { | ||
2019 | if (curr->hardirq_context) | ||
2020 | if (!mark_lock(curr, hlock, | ||
2021 | LOCK_USED_IN_HARDIRQ_READ, ip)) | ||
2022 | return 0; | ||
2023 | if (curr->softirq_context) | ||
2024 | if (!mark_lock(curr, hlock, | ||
2025 | LOCK_USED_IN_SOFTIRQ_READ, ip)) | ||
2026 | return 0; | ||
2027 | } else { | ||
2028 | if (curr->hardirq_context) | ||
2029 | if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip)) | ||
2030 | return 0; | ||
2031 | if (curr->softirq_context) | ||
2032 | if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip)) | ||
2033 | return 0; | ||
2034 | } | ||
2035 | } | ||
2036 | if (!hardirqs_off) { | ||
2037 | if (read) { | ||
2038 | if (!mark_lock(curr, hlock, | ||
2039 | LOCK_ENABLED_HARDIRQS_READ, ip)) | ||
2040 | return 0; | ||
2041 | if (curr->softirqs_enabled) | ||
2042 | if (!mark_lock(curr, hlock, | ||
2043 | LOCK_ENABLED_SOFTIRQS_READ, ip)) | ||
2044 | return 0; | ||
2045 | } else { | ||
2046 | if (!mark_lock(curr, hlock, | ||
2047 | LOCK_ENABLED_HARDIRQS, ip)) | ||
2048 | return 0; | ||
2049 | if (curr->softirqs_enabled) | ||
2050 | if (!mark_lock(curr, hlock, | ||
2051 | LOCK_ENABLED_SOFTIRQS, ip)) | ||
2052 | return 0; | ||
2053 | } | ||
2054 | } | ||
2055 | #endif | ||
2056 | /* mark it as used: */ | ||
2057 | if (!mark_lock(curr, hlock, LOCK_USED, ip)) | ||
2058 | return 0; | ||
2059 | out_calc_hash: | ||
2060 | /* | ||
2061 | * Calculate the chain hash: it's the combined hash of all the | ||
2062 | * lock keys along the dependency chain. We save the hash value | ||
2063 | * at every step so that we can get the current hash easily | ||
2064 | * after unlock. The chain hash is then used to cache dependency | ||
2065 | * results. | ||
2066 | * | ||
2067 | * The 'key ID' (the class index) is the most compact key value | ||
2068 | * we can use to drive the hash - not class->key itself. | ||
2069 | */ | ||
2070 | id = class - lock_classes; | ||
2071 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) | ||
2072 | return 0; | ||
2073 | |||
2074 | chain_key = curr->curr_chain_key; | ||
2075 | if (!depth) { | ||
2076 | if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) | ||
2077 | return 0; | ||
2078 | chain_head = 1; | ||
2079 | } | ||
2080 | |||
2081 | hlock->prev_chain_key = chain_key; | ||
2082 | |||
2083 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
2084 | /* | ||
2085 | * Keep track of points where we cross into an interrupt context: | ||
2086 | */ | ||
2087 | hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + | ||
2088 | curr->softirq_context; | ||
2089 | if (depth) { | ||
2090 | struct held_lock *prev_hlock; | ||
2091 | |||
2092 | prev_hlock = curr->held_locks + depth-1; | ||
2093 | /* | ||
2094 | * If we cross into another context, reset the | ||
2095 | * hash key (this also prevents the checking and the | ||
2096 | * adding of the dependency to 'prev'): | ||
2097 | */ | ||
2098 | if (prev_hlock->irq_context != hlock->irq_context) { | ||
2099 | chain_key = 0; | ||
2100 | chain_head = 1; | ||
2101 | } | ||
2102 | } | ||
2103 | #endif | ||
2104 | chain_key = iterate_chain_key(chain_key, id); | ||
2105 | curr->curr_chain_key = chain_key; | ||
2106 | |||
2107 | /* | ||
2108 | * Trylock needs to maintain the stack of held locks, but it | ||
2109 | * does not add new dependencies, because trylock can be done | ||
2110 | * in any order. | ||
2111 | * | ||
2112 | * We look up the chain_key and do the O(N^2) check and update of | ||
2113 | * the dependencies only if this is a new dependency chain. | ||
2114 | * (If lookup_chain_cache() returns with 1 it acquires | ||
2115 | * hash_lock for us) | ||
2116 | */ | ||
2117 | if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) { | ||
2118 | /* | ||
2119 | * Check whether last held lock: | ||
2120 | * | ||
2121 | * - is irq-safe, if this lock is irq-unsafe | ||
2122 | * - is softirq-safe, if this lock is hardirq-unsafe | ||
2123 | * | ||
2124 | * And check whether the new lock's dependency graph | ||
2125 | * could lead back to the previous lock. | ||
2126 | * | ||
2127 | * any of these scenarios could lead to a deadlock. If | ||
2128 | * all validations pass, the new dependencies get added to the graph. | ||
2129 | */ | ||
2130 | int ret = check_deadlock(curr, hlock, lock, read); | ||
2131 | |||
2132 | if (!ret) | ||
2133 | return 0; | ||
2134 | /* | ||
2135 | * Mark recursive read, as we jump over it when | ||
2136 | * building dependencies (just like we jump over | ||
2137 | * trylock entries): | ||
2138 | */ | ||
2139 | if (ret == 2) | ||
2140 | hlock->read = 2; | ||
2141 | /* | ||
2142 | * Add dependency only if this lock is not the head | ||
2143 | * of the chain, and if it's not a secondary read-lock: | ||
2144 | */ | ||
2145 | if (!chain_head && ret != 2) | ||
2146 | if (!check_prevs_add(curr, hlock)) | ||
2147 | return 0; | ||
2148 | __raw_spin_unlock(&hash_lock); | ||
2149 | } | ||
2150 | curr->lockdep_depth++; | ||
2151 | check_chain_key(curr); | ||
2152 | if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { | ||
2153 | debug_locks_off(); | ||
2154 | printk("BUG: MAX_LOCK_DEPTH too low!\n"); | ||
2155 | printk("turning off the locking correctness validator.\n"); | ||
2156 | return 0; | ||
2157 | } | ||
2158 | if (unlikely(curr->lockdep_depth > max_lockdep_depth)) | ||
2159 | max_lockdep_depth = curr->lockdep_depth; | ||
2160 | |||
2161 | return 1; | ||
2162 | } | ||
2163 | |||
2164 | static int | ||
2165 | print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | ||
2166 | unsigned long ip) | ||
2167 | { | ||
2168 | if (!debug_locks_off()) | ||
2169 | return 0; | ||
2170 | if (debug_locks_silent) | ||
2171 | return 0; | ||
2172 | |||
2173 | printk("\n=====================================\n"); | ||
2174 | printk( "[ BUG: bad unlock balance detected! ]\n"); | ||
2175 | printk( "-------------------------------------\n"); | ||
2176 | printk("%s/%d is trying to release lock (", | ||
2177 | curr->comm, curr->pid); | ||
2178 | print_lockdep_cache(lock); | ||
2179 | printk(") at:\n"); | ||
2180 | print_ip_sym(ip); | ||
2181 | printk("but there are no more locks to release!\n"); | ||
2182 | printk("\nother info that might help us debug this:\n"); | ||
2183 | lockdep_print_held_locks(curr); | ||
2184 | |||
2185 | printk("\nstack backtrace:\n"); | ||
2186 | dump_stack(); | ||
2187 | |||
2188 | return 0; | ||
2189 | } | ||
2190 | |||
2191 | /* | ||
2192 | * Common debugging checks for both nested and non-nested unlock: | ||
2193 | */ | ||
2194 | static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, | ||
2195 | unsigned long ip) | ||
2196 | { | ||
2197 | if (unlikely(!debug_locks)) | ||
2198 | return 0; | ||
2199 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
2200 | return 0; | ||
2201 | |||
2202 | if (curr->lockdep_depth <= 0) | ||
2203 | return print_unlock_inbalance_bug(curr, lock, ip); | ||
2204 | |||
2205 | return 1; | ||
2206 | } | ||
2207 | |||
2208 | /* | ||
2209 | * Remove the lock from the list of currently held locks in a | ||
2210 | * potentially non-nested (out of order) manner. This is a | ||
2211 | * relatively rare operation, as all the unlock APIs default | ||
2212 | * to nested mode (which uses lock_release()): | ||
2213 | */ | ||
2214 | static int | ||
2215 | lock_release_non_nested(struct task_struct *curr, | ||
2216 | struct lockdep_map *lock, unsigned long ip) | ||
2217 | { | ||
2218 | struct held_lock *hlock, *prev_hlock; | ||
2219 | unsigned int depth; | ||
2220 | int i; | ||
2221 | |||
2222 | /* | ||
2223 | * Check whether the lock exists in the current stack | ||
2224 | * of held locks: | ||
2225 | */ | ||
2226 | depth = curr->lockdep_depth; | ||
2227 | if (DEBUG_LOCKS_WARN_ON(!depth)) | ||
2228 | return 0; | ||
2229 | |||
2230 | prev_hlock = NULL; | ||
2231 | for (i = depth-1; i >= 0; i--) { | ||
2232 | hlock = curr->held_locks + i; | ||
2233 | /* | ||
2234 | * We must not cross into another context: | ||
2235 | */ | ||
2236 | if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) | ||
2237 | break; | ||
2238 | if (hlock->instance == lock) | ||
2239 | goto found_it; | ||
2240 | prev_hlock = hlock; | ||
2241 | } | ||
2242 | return print_unlock_inbalance_bug(curr, lock, ip); | ||
2243 | |||
2244 | found_it: | ||
2245 | /* | ||
2246 | * We have the right lock to unlock, 'hlock' points to it. | ||
2247 | * Now we remove it from the stack, and add back the other | ||
2248 | * entries (if any), recalculating the hash along the way: | ||
2249 | */ | ||
2250 | curr->lockdep_depth = i; | ||
2251 | curr->curr_chain_key = hlock->prev_chain_key; | ||
2252 | |||
2253 | for (i++; i < depth; i++) { | ||
2254 | hlock = curr->held_locks + i; | ||
2255 | if (!__lock_acquire(hlock->instance, | ||
2256 | hlock->class->subclass, hlock->trylock, | ||
2257 | hlock->read, hlock->check, hlock->hardirqs_off, | ||
2258 | hlock->acquire_ip)) | ||
2259 | return 0; | ||
2260 | } | ||
2261 | |||
2262 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) | ||
2263 | return 0; | ||
2264 | return 1; | ||
2265 | } | ||
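Reduced to its essentials, the non-nested path pops everything above the released lock, drops it, and pushes the survivors back on (which re-runs the dependency bookkeeping for them via __lock_acquire). A simplified standalone sketch, ignoring chain keys, irq contexts and error reporting:

#include <stdio.h>

static int stack[8], depth;

static void acquire(int id) { stack[depth++] = id; }

static int release(int id)
{
        int i, saved[8], nsaved = 0;

        for (i = depth - 1; i >= 0 && stack[i] != id; i--)
                saved[nsaved++] = stack[i];     /* remember locks above it      */
        if (i < 0)
                return 0;                       /* unlock balance bug           */
        depth = i;                              /* drop id and everything above */
        while (nsaved--)
                acquire(saved[nsaved]);         /* re-add the rest, in order    */
        return 1;
}

int main(void)
{
        acquire(1); acquire(2); acquire(3);
        release(2);                             /* out of order */
        for (int i = 0; i < depth; i++)
                printf("%d ", stack[i]);        /* prints: 1 3 */
        printf("\n");
        return 0;
}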
2266 | |||
2267 | /* | ||
2268 | * Remove the lock from the list of currently held locks - this gets | ||
2269 | * called on mutex_unlock()/spin_unlock*() (or on a failed | ||
2270 | * mutex_lock_interruptible()). This is done for unlocks that nest | ||
2271 | * perfectly. (i.e. the current top of the lock-stack is unlocked) | ||
2272 | */ | ||
2273 | static int lock_release_nested(struct task_struct *curr, | ||
2274 | struct lockdep_map *lock, unsigned long ip) | ||
2275 | { | ||
2276 | struct held_lock *hlock; | ||
2277 | unsigned int depth; | ||
2278 | |||
2279 | /* | ||
2280 | * Pop off the top of the lock stack: | ||
2281 | */ | ||
2282 | depth = curr->lockdep_depth - 1; | ||
2283 | hlock = curr->held_locks + depth; | ||
2284 | |||
2285 | /* | ||
2286 | * Is the unlock non-nested: | ||
2287 | */ | ||
2288 | if (hlock->instance != lock) | ||
2289 | return lock_release_non_nested(curr, lock, ip); | ||
2290 | curr->lockdep_depth--; | ||
2291 | |||
2292 | if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) | ||
2293 | return 0; | ||
2294 | |||
2295 | curr->curr_chain_key = hlock->prev_chain_key; | ||
2296 | |||
2297 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
2298 | hlock->prev_chain_key = 0; | ||
2299 | hlock->class = NULL; | ||
2300 | hlock->acquire_ip = 0; | ||
2301 | hlock->irq_context = 0; | ||
2302 | #endif | ||
2303 | return 1; | ||
2304 | } | ||
2305 | |||
2306 | /* | ||
2307 | * Remove the lock from the list of currently held locks - this gets | ||
2308 | * called on mutex_unlock()/spin_unlock*() (or on a failed | ||
2309 | * mutex_lock_interruptible()). Nested unlocks (the common case) just | ||
2310 | * pop the top of the lock-stack; others take the non-nested path above. | ||
2311 | */ | ||
2312 | static void | ||
2313 | __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | ||
2314 | { | ||
2315 | struct task_struct *curr = current; | ||
2316 | |||
2317 | if (!check_unlock(curr, lock, ip)) | ||
2318 | return; | ||
2319 | |||
2320 | if (nested) { | ||
2321 | if (!lock_release_nested(curr, lock, ip)) | ||
2322 | return; | ||
2323 | } else { | ||
2324 | if (!lock_release_non_nested(curr, lock, ip)) | ||
2325 | return; | ||
2326 | } | ||
2327 | |||
2328 | check_chain_key(curr); | ||
2329 | } | ||
2330 | |||
2331 | /* | ||
2332 | * Check whether we follow the irq-flags state precisely: | ||
2333 | */ | ||
2334 | static void check_flags(unsigned long flags) | ||
2335 | { | ||
2336 | #if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) | ||
2337 | if (!debug_locks) | ||
2338 | return; | ||
2339 | |||
2340 | if (irqs_disabled_flags(flags)) | ||
2341 | DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled); | ||
2342 | else | ||
2343 | DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled); | ||
2344 | |||
2345 | /* | ||
2346 | * We don't accurately track softirq state in e.g. | ||
2347 | * hardirq contexts (such as on 4KSTACKS), so only | ||
2348 | * check if not in hardirq contexts: | ||
2349 | */ | ||
2350 | if (!hardirq_count()) { | ||
2351 | if (softirq_count()) | ||
2352 | DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); | ||
2353 | else | ||
2354 | DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); | ||
2355 | } | ||
2356 | |||
2357 | if (!debug_locks) | ||
2358 | print_irqtrace_events(current); | ||
2359 | #endif | ||
2360 | } | ||
2361 | |||
2362 | /* | ||
2363 | * We are not always called with irqs disabled - do that here, | ||
2364 | * and also avoid lockdep recursion: | ||
2365 | */ | ||
2366 | void lock_acquire(struct lockdep_map *lock, unsigned int subclass, | ||
2367 | int trylock, int read, int check, unsigned long ip) | ||
2368 | { | ||
2369 | unsigned long flags; | ||
2370 | |||
2371 | if (unlikely(current->lockdep_recursion)) | ||
2372 | return; | ||
2373 | |||
2374 | raw_local_irq_save(flags); | ||
2375 | check_flags(flags); | ||
2376 | |||
2377 | current->lockdep_recursion = 1; | ||
2378 | __lock_acquire(lock, subclass, trylock, read, check, | ||
2379 | irqs_disabled_flags(flags), ip); | ||
2380 | current->lockdep_recursion = 0; | ||
2381 | raw_local_irq_restore(flags); | ||
2382 | } | ||
2383 | |||
2384 | EXPORT_SYMBOL_GPL(lock_acquire); | ||
2385 | |||
2386 | void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | ||
2387 | { | ||
2388 | unsigned long flags; | ||
2389 | |||
2390 | if (unlikely(current->lockdep_recursion)) | ||
2391 | return; | ||
2392 | |||
2393 | raw_local_irq_save(flags); | ||
2394 | check_flags(flags); | ||
2395 | current->lockdep_recursion = 1; | ||
2396 | __lock_release(lock, nested, ip); | ||
2397 | current->lockdep_recursion = 0; | ||
2398 | raw_local_irq_restore(flags); | ||
2399 | } | ||
2400 | |||
2401 | EXPORT_SYMBOL_GPL(lock_release); | ||
2402 | |||
2403 | /* | ||
2404 | * Used by the testsuite, sanitize the validator state | ||
2405 | * after a simulated failure: | ||
2406 | */ | ||
2407 | |||
2408 | void lockdep_reset(void) | ||
2409 | { | ||
2410 | unsigned long flags; | ||
2411 | |||
2412 | raw_local_irq_save(flags); | ||
2413 | current->curr_chain_key = 0; | ||
2414 | current->lockdep_depth = 0; | ||
2415 | current->lockdep_recursion = 0; | ||
2416 | memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); | ||
2417 | nr_hardirq_chains = 0; | ||
2418 | nr_softirq_chains = 0; | ||
2419 | nr_process_chains = 0; | ||
2420 | debug_locks = 1; | ||
2421 | raw_local_irq_restore(flags); | ||
2422 | } | ||
2423 | |||
2424 | static void zap_class(struct lock_class *class) | ||
2425 | { | ||
2426 | int i; | ||
2427 | |||
2428 | /* | ||
2429 | * Remove all dependencies this lock is | ||
2430 | * involved in: | ||
2431 | */ | ||
2432 | for (i = 0; i < nr_list_entries; i++) { | ||
2433 | if (list_entries[i].class == class) | ||
2434 | list_del_rcu(&list_entries[i].entry); | ||
2435 | } | ||
2436 | /* | ||
2437 | * Unhash the class and remove it from the all_lock_classes list: | ||
2438 | */ | ||
2439 | list_del_rcu(&class->hash_entry); | ||
2440 | list_del_rcu(&class->lock_entry); | ||
2441 | |||
2442 | } | ||
2443 | |||
2444 | static inline int within(void *addr, void *start, unsigned long size) | ||
2445 | { | ||
2446 | return addr >= start && addr < start + size; | ||
2447 | } | ||
2448 | |||
2449 | void lockdep_free_key_range(void *start, unsigned long size) | ||
2450 | { | ||
2451 | struct lock_class *class, *next; | ||
2452 | struct list_head *head; | ||
2453 | unsigned long flags; | ||
2454 | int i; | ||
2455 | |||
2456 | raw_local_irq_save(flags); | ||
2457 | __raw_spin_lock(&hash_lock); | ||
2458 | |||
2459 | /* | ||
2460 | * Unhash all classes that were created by this module: | ||
2461 | */ | ||
2462 | for (i = 0; i < CLASSHASH_SIZE; i++) { | ||
2463 | head = classhash_table + i; | ||
2464 | if (list_empty(head)) | ||
2465 | continue; | ||
2466 | list_for_each_entry_safe(class, next, head, hash_entry) | ||
2467 | if (within(class->key, start, size)) | ||
2468 | zap_class(class); | ||
2469 | } | ||
2470 | |||
2471 | __raw_spin_unlock(&hash_lock); | ||
2472 | raw_local_irq_restore(flags); | ||
2473 | } | ||
2474 | |||
2475 | void lockdep_reset_lock(struct lockdep_map *lock) | ||
2476 | { | ||
2477 | struct lock_class *class, *next; | ||
2478 | struct list_head *head; | ||
2479 | unsigned long flags; | ||
2480 | int i, j; | ||
2481 | |||
2482 | raw_local_irq_save(flags); | ||
2483 | |||
2484 | /* | ||
2485 | * Remove all classes this lock might have: | ||
2486 | */ | ||
2487 | for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { | ||
2488 | /* | ||
2489 | * If the class exists we look it up and zap it: | ||
2490 | */ | ||
2491 | class = look_up_lock_class(lock, j); | ||
2492 | if (class) | ||
2493 | zap_class(class); | ||
2494 | } | ||
2495 | /* | ||
2496 | * Debug check: in the end all mapped classes should | ||
2497 | * be gone. | ||
2498 | */ | ||
2499 | __raw_spin_lock(&hash_lock); | ||
2500 | for (i = 0; i < CLASSHASH_SIZE; i++) { | ||
2501 | head = classhash_table + i; | ||
2502 | if (list_empty(head)) | ||
2503 | continue; | ||
2504 | list_for_each_entry_safe(class, next, head, hash_entry) { | ||
2505 | if (unlikely(class == lock->class_cache)) { | ||
2506 | __raw_spin_unlock(&hash_lock); | ||
2507 | DEBUG_LOCKS_WARN_ON(1); | ||
2508 | goto out_restore; | ||
2509 | } | ||
2510 | } | ||
2511 | } | ||
2512 | __raw_spin_unlock(&hash_lock); | ||
2513 | |||
2514 | out_restore: | ||
2515 | raw_local_irq_restore(flags); | ||
2516 | } | ||
2517 | |||
2518 | void __init lockdep_init(void) | ||
2519 | { | ||
2520 | int i; | ||
2521 | |||
2522 | /* | ||
2523 | * Some architectures have their own start_kernel() | ||
2524 | * code which calls lockdep_init(), while we also | ||
2525 | * call lockdep_init() from start_kernel() itself, | ||
2526 | * and we want to initialize the hashes only once: | ||
2527 | */ | ||
2528 | if (lockdep_initialized) | ||
2529 | return; | ||
2530 | |||
2531 | for (i = 0; i < CLASSHASH_SIZE; i++) | ||
2532 | INIT_LIST_HEAD(classhash_table + i); | ||
2533 | |||
2534 | for (i = 0; i < CHAINHASH_SIZE; i++) | ||
2535 | INIT_LIST_HEAD(chainhash_table + i); | ||
2536 | |||
2537 | lockdep_initialized = 1; | ||
2538 | } | ||
2539 | |||
2540 | void __init lockdep_info(void) | ||
2541 | { | ||
2542 | printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); | ||
2543 | |||
2544 | printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); | ||
2545 | printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); | ||
2546 | printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); | ||
2547 | printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); | ||
2548 | printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); | ||
2549 | printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); | ||
2550 | printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); | ||
2551 | |||
2552 | printk(" memory used by lock dependency info: %lu kB\n", | ||
2553 | (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + | ||
2554 | sizeof(struct list_head) * CLASSHASH_SIZE + | ||
2555 | sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + | ||
2556 | sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + | ||
2557 | sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); | ||
2558 | |||
2559 | printk(" per task-struct memory footprint: %lu bytes\n", | ||
2560 | sizeof(struct held_lock) * MAX_LOCK_DEPTH); | ||
2561 | |||
2562 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
2563 | if (lockdep_init_error) | ||
2564 | printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); | ||
2565 | #endif | ||
2566 | } | ||
2567 | |||
2568 | static inline int in_range(const void *start, const void *addr, const void *end) | ||
2569 | { | ||
2570 | return addr >= start && addr <= end; | ||
2571 | } | ||
2572 | |||
2573 | static void | ||
2574 | print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | ||
2575 | const void *mem_to, struct held_lock *hlock) | ||
2576 | { | ||
2577 | if (!debug_locks_off()) | ||
2578 | return; | ||
2579 | if (debug_locks_silent) | ||
2580 | return; | ||
2581 | |||
2582 | printk("\n=========================\n"); | ||
2583 | printk( "[ BUG: held lock freed! ]\n"); | ||
2584 | printk( "-------------------------\n"); | ||
2585 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | ||
2586 | curr->comm, curr->pid, mem_from, mem_to-1); | ||
2587 | print_lock(hlock); | ||
2588 | lockdep_print_held_locks(curr); | ||
2589 | |||
2590 | printk("\nstack backtrace:\n"); | ||
2591 | dump_stack(); | ||
2592 | } | ||
2593 | |||
2594 | /* | ||
2595 | * Called when kernel memory is freed (or unmapped), or if a lock | ||
2596 | * is destroyed or reinitialized - this code checks whether there is | ||
2597 | * any held lock in the memory range of <from> to <to>: | ||
2598 | */ | ||
2599 | void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) | ||
2600 | { | ||
2601 | const void *mem_to = mem_from + mem_len, *lock_from, *lock_to; | ||
2602 | struct task_struct *curr = current; | ||
2603 | struct held_lock *hlock; | ||
2604 | unsigned long flags; | ||
2605 | int i; | ||
2606 | |||
2607 | if (unlikely(!debug_locks)) | ||
2608 | return; | ||
2609 | |||
2610 | local_irq_save(flags); | ||
2611 | for (i = 0; i < curr->lockdep_depth; i++) { | ||
2612 | hlock = curr->held_locks + i; | ||
2613 | |||
2614 | lock_from = (void *)hlock->instance; | ||
2615 | lock_to = (void *)(hlock->instance + 1); | ||
2616 | |||
2617 | if (!in_range(mem_from, lock_from, mem_to) && | ||
2618 | !in_range(mem_from, lock_to, mem_to)) | ||
2619 | continue; | ||
2620 | |||
2621 | print_freed_lock_bug(curr, mem_from, mem_to, hlock); | ||
2622 | break; | ||
2623 | } | ||
2624 | local_irq_restore(flags); | ||
2625 | } | ||
2626 | |||
2627 | static void print_held_locks_bug(struct task_struct *curr) | ||
2628 | { | ||
2629 | if (!debug_locks_off()) | ||
2630 | return; | ||
2631 | if (debug_locks_silent) | ||
2632 | return; | ||
2633 | |||
2634 | printk("\n=====================================\n"); | ||
2635 | printk( "[ BUG: lock held at task exit time! ]\n"); | ||
2636 | printk( "-------------------------------------\n"); | ||
2637 | printk("%s/%d is exiting with locks still held!\n", | ||
2638 | curr->comm, curr->pid); | ||
2639 | lockdep_print_held_locks(curr); | ||
2640 | |||
2641 | printk("\nstack backtrace:\n"); | ||
2642 | dump_stack(); | ||
2643 | } | ||
2644 | |||
2645 | void debug_check_no_locks_held(struct task_struct *task) | ||
2646 | { | ||
2647 | if (unlikely(task->lockdep_depth > 0)) | ||
2648 | print_held_locks_bug(task); | ||
2649 | } | ||
2650 | |||
2651 | void debug_show_all_locks(void) | ||
2652 | { | ||
2653 | struct task_struct *g, *p; | ||
2654 | int count = 10; | ||
2655 | int unlock = 1; | ||
2656 | |||
2657 | printk("\nShowing all locks held in the system:\n"); | ||
2658 | |||
2659 | /* | ||
2660 | * Here we try to get the tasklist_lock as hard as possible, | ||
2661 | * if not successful after 2 seconds we ignore it (but keep | ||
2662 | * trying). This is to enable a debug printout even if a | ||
2663 | * tasklist_lock-holding task deadlocks or crashes. | ||
2664 | */ | ||
2665 | retry: | ||
2666 | if (!read_trylock(&tasklist_lock)) { | ||
2667 | if (count == 10) | ||
2668 | printk("hm, tasklist_lock locked, retrying... "); | ||
2669 | if (count) { | ||
2670 | count--; | ||
2671 | printk(" #%d", 10-count); | ||
2672 | mdelay(200); | ||
2673 | goto retry; | ||
2674 | } | ||
2675 | printk(" ignoring it.\n"); | ||
2676 | unlock = 0; | ||
2677 | } | ||
2678 | if (count != 10) | ||
2679 | printk(" locked it.\n"); | ||
2680 | |||
2681 | do_each_thread(g, p) { | ||
2682 | if (p->lockdep_depth) | ||
2683 | lockdep_print_held_locks(p); | ||
2684 | if (!unlock) | ||
2685 | if (read_trylock(&tasklist_lock)) | ||
2686 | unlock = 1; | ||
2687 | } while_each_thread(g, p); | ||
2688 | |||
2689 | printk("\n"); | ||
2690 | printk("=============================================\n\n"); | ||
2691 | |||
2692 | if (unlock) | ||
2693 | read_unlock(&tasklist_lock); | ||
2694 | } | ||
2695 | |||
2696 | EXPORT_SYMBOL_GPL(debug_show_all_locks); | ||
2697 | |||
2698 | void debug_show_held_locks(struct task_struct *task) | ||
2699 | { | ||
2700 | lockdep_print_held_locks(task); | ||
2701 | } | ||
2702 | |||
2703 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | ||
2704 | |||
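The lock_acquire()/lock_release() entry points above are normally reached through the ordinary spin_lock()/mutex_lock() wrappers rather than called directly. As a minimal sketch of how a subsystem interacts with the validator, assuming the lockdep_set_class() helper and struct lock_class_key from the accompanying include/linux/lockdep.h changes, and with the structure and key names invented purely for illustration, giving one lock its own class looks roughly like this:

	#include <linux/spinlock.h>
	#include <linux/lockdep.h>

	struct my_queue {				/* hypothetical example structure */
		spinlock_t lock;
	};

	/* One static key per distinct lock class: */
	static struct lock_class_key my_queue_lock_key;

	static void my_queue_init(struct my_queue *q)
	{
		spin_lock_init(&q->lock);
		/*
		 * Re-key the lock so the validator tracks it as its own class
		 * rather than sharing the class of the spin_lock_init() call site:
		 */
		lockdep_set_class(&q->lock, &my_queue_lock_key);
	}

Every subsequent spin_lock(&q->lock)/spin_unlock(&q->lock) then funnels into lock_acquire()/lock_release() above with this class attached.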
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h new file mode 100644 index 000000000000..0d355f24fe04 --- /dev/null +++ b/kernel/lockdep_internals.h | |||
@@ -0,0 +1,78 @@ | |||
1 | /* | ||
2 | * kernel/lockdep_internals.h | ||
3 | * | ||
4 | * Runtime locking correctness validator | ||
5 | * | ||
6 | * lockdep subsystem internal functions and variables. | ||
7 | */ | ||
8 | |||
9 | /* | ||
10 | * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies | ||
11 | * we track. | ||
12 | * | ||
13 | * We use the per-lock dependency maps in two ways: we grow them by adding | ||
14 | * every to-be-taken lock to each currently held lock's own dependency | ||
15 | * table (if it's not there yet), and we check them for lock-order | ||
16 | * conflicts and deadlocks. | ||
17 | */ | ||
18 | #define MAX_LOCKDEP_ENTRIES 8192UL | ||
19 | |||
20 | #define MAX_LOCKDEP_KEYS_BITS 11 | ||
21 | #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) | ||
22 | |||
23 | #define MAX_LOCKDEP_CHAINS_BITS 13 | ||
24 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) | ||
25 | |||
26 | /* | ||
27 | * Stack-trace: tightly packed array of stack backtrace | ||
28 | * addresses. Protected by the hash_lock. | ||
29 | */ | ||
30 | #define MAX_STACK_TRACE_ENTRIES 131072UL | ||
31 | |||
32 | extern struct list_head all_lock_classes; | ||
33 | |||
34 | extern void | ||
35 | get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); | ||
36 | |||
37 | extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); | ||
38 | |||
39 | extern unsigned long nr_lock_classes; | ||
40 | extern unsigned long nr_list_entries; | ||
41 | extern unsigned long nr_lock_chains; | ||
42 | extern unsigned long nr_stack_trace_entries; | ||
43 | |||
44 | extern unsigned int nr_hardirq_chains; | ||
45 | extern unsigned int nr_softirq_chains; | ||
46 | extern unsigned int nr_process_chains; | ||
47 | extern unsigned int max_lockdep_depth; | ||
48 | extern unsigned int max_recursion_depth; | ||
49 | |||
50 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
51 | /* | ||
52 | * Various lockdep statistics: | ||
53 | */ | ||
54 | extern atomic_t chain_lookup_hits; | ||
55 | extern atomic_t chain_lookup_misses; | ||
56 | extern atomic_t hardirqs_on_events; | ||
57 | extern atomic_t hardirqs_off_events; | ||
58 | extern atomic_t redundant_hardirqs_on; | ||
59 | extern atomic_t redundant_hardirqs_off; | ||
60 | extern atomic_t softirqs_on_events; | ||
61 | extern atomic_t softirqs_off_events; | ||
62 | extern atomic_t redundant_softirqs_on; | ||
63 | extern atomic_t redundant_softirqs_off; | ||
64 | extern atomic_t nr_unused_locks; | ||
65 | extern atomic_t nr_cyclic_checks; | ||
66 | extern atomic_t nr_cyclic_check_recursions; | ||
67 | extern atomic_t nr_find_usage_forwards_checks; | ||
68 | extern atomic_t nr_find_usage_forwards_recursions; | ||
69 | extern atomic_t nr_find_usage_backwards_checks; | ||
70 | extern atomic_t nr_find_usage_backwards_recursions; | ||
71 | # define debug_atomic_inc(ptr) atomic_inc(ptr) | ||
72 | # define debug_atomic_dec(ptr) atomic_dec(ptr) | ||
73 | # define debug_atomic_read(ptr) atomic_read(ptr) | ||
74 | #else | ||
75 | # define debug_atomic_inc(ptr) do { } while (0) | ||
76 | # define debug_atomic_dec(ptr) do { } while (0) | ||
77 | # define debug_atomic_read(ptr) 0 | ||
78 | #endif | ||
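For orientation, the derived limits above are plain arithmetic on the stated constants: MAX_LOCKDEP_KEYS = 1UL << 11 = 2048 lock classes, and MAX_LOCKDEP_CHAINS = 1UL << 13 = 8192 lock chains. Together with MAX_LOCKDEP_ENTRIES (8192 dependency entries) and MAX_STACK_TRACE_ENTRIES (131072 saved backtrace entries), these are the hard caps the validator works within; lockdep_info() in kernel/lockdep.c prints the resulting memory footprint at boot.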
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c new file mode 100644 index 000000000000..f6e72eaab3fa --- /dev/null +++ b/kernel/lockdep_proc.c | |||
@@ -0,0 +1,345 @@ | |||
1 | /* | ||
2 | * kernel/lockdep_proc.c | ||
3 | * | ||
4 | * Runtime locking correctness validator | ||
5 | * | ||
6 | * Started by Ingo Molnar: | ||
7 | * | ||
8 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
9 | * | ||
10 | * Code for /proc/lockdep and /proc/lockdep_stats: | ||
11 | * | ||
12 | */ | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/proc_fs.h> | ||
16 | #include <linux/seq_file.h> | ||
17 | #include <linux/kallsyms.h> | ||
18 | #include <linux/debug_locks.h> | ||
19 | |||
20 | #include "lockdep_internals.h" | ||
21 | |||
22 | static void *l_next(struct seq_file *m, void *v, loff_t *pos) | ||
23 | { | ||
24 | struct lock_class *class = v; | ||
25 | |||
26 | (*pos)++; | ||
27 | |||
28 | if (class->lock_entry.next != &all_lock_classes) | ||
29 | class = list_entry(class->lock_entry.next, struct lock_class, | ||
30 | lock_entry); | ||
31 | else | ||
32 | class = NULL; | ||
33 | m->private = class; | ||
34 | |||
35 | return class; | ||
36 | } | ||
37 | |||
38 | static void *l_start(struct seq_file *m, loff_t *pos) | ||
39 | { | ||
40 | struct lock_class *class = m->private; | ||
41 | |||
42 | if (&class->lock_entry == all_lock_classes.next) | ||
43 | seq_printf(m, "all lock classes:\n"); | ||
44 | |||
45 | return class; | ||
46 | } | ||
47 | |||
48 | static void l_stop(struct seq_file *m, void *v) | ||
49 | { | ||
50 | } | ||
51 | |||
52 | static unsigned long count_forward_deps(struct lock_class *class) | ||
53 | { | ||
54 | struct lock_list *entry; | ||
55 | unsigned long ret = 1; | ||
56 | |||
57 | /* | ||
58 | * Recurse this class's dependency list: | ||
59 | */ | ||
60 | list_for_each_entry(entry, &class->locks_after, entry) | ||
61 | ret += count_forward_deps(entry->class); | ||
62 | |||
63 | return ret; | ||
64 | } | ||
65 | |||
66 | static unsigned long count_backward_deps(struct lock_class *class) | ||
67 | { | ||
68 | struct lock_list *entry; | ||
69 | unsigned long ret = 1; | ||
70 | |||
71 | /* | ||
72 | * Recurse this class's dependency list: | ||
73 | */ | ||
74 | list_for_each_entry(entry, &class->locks_before, entry) | ||
75 | ret += count_backward_deps(entry->class); | ||
76 | |||
77 | return ret; | ||
78 | } | ||
79 | |||
80 | static int l_show(struct seq_file *m, void *v) | ||
81 | { | ||
82 | unsigned long nr_forward_deps, nr_backward_deps; | ||
83 | struct lock_class *class = m->private; | ||
84 | char str[128], c1, c2, c3, c4; | ||
85 | const char *name; | ||
86 | |||
87 | seq_printf(m, "%p", class->key); | ||
88 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
89 | seq_printf(m, " OPS:%8ld", class->ops); | ||
90 | #endif | ||
91 | nr_forward_deps = count_forward_deps(class); | ||
92 | seq_printf(m, " FD:%5ld", nr_forward_deps); | ||
93 | |||
94 | nr_backward_deps = count_backward_deps(class); | ||
95 | seq_printf(m, " BD:%5ld", nr_backward_deps); | ||
96 | |||
97 | get_usage_chars(class, &c1, &c2, &c3, &c4); | ||
98 | seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); | ||
99 | |||
100 | name = class->name; | ||
101 | if (!name) { | ||
102 | name = __get_key_name(class->key, str); | ||
103 | seq_printf(m, ": %s", name); | ||
104 | } else { | ||
105 | seq_printf(m, ": %s", name); | ||
106 | if (class->name_version > 1) | ||
107 | seq_printf(m, "#%d", class->name_version); | ||
108 | if (class->subclass) | ||
109 | seq_printf(m, "/%d", class->subclass); | ||
110 | } | ||
111 | seq_puts(m, "\n"); | ||
112 | |||
113 | return 0; | ||
114 | } | ||
115 | |||
116 | static struct seq_operations lockdep_ops = { | ||
117 | .start = l_start, | ||
118 | .next = l_next, | ||
119 | .stop = l_stop, | ||
120 | .show = l_show, | ||
121 | }; | ||
122 | |||
123 | static int lockdep_open(struct inode *inode, struct file *file) | ||
124 | { | ||
125 | int res = seq_open(file, &lockdep_ops); | ||
126 | if (!res) { | ||
127 | struct seq_file *m = file->private_data; | ||
128 | |||
129 | if (!list_empty(&all_lock_classes)) | ||
130 | m->private = list_entry(all_lock_classes.next, | ||
131 | struct lock_class, lock_entry); | ||
132 | else | ||
133 | m->private = NULL; | ||
134 | } | ||
135 | return res; | ||
136 | } | ||
137 | |||
138 | static struct file_operations proc_lockdep_operations = { | ||
139 | .open = lockdep_open, | ||
140 | .read = seq_read, | ||
141 | .llseek = seq_lseek, | ||
142 | .release = seq_release, | ||
143 | }; | ||
144 | |||
145 | static void lockdep_stats_debug_show(struct seq_file *m) | ||
146 | { | ||
147 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
148 | unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), | ||
149 | hi2 = debug_atomic_read(&hardirqs_off_events), | ||
150 | hr1 = debug_atomic_read(&redundant_hardirqs_on), | ||
151 | hr2 = debug_atomic_read(&redundant_hardirqs_off), | ||
152 | si1 = debug_atomic_read(&softirqs_on_events), | ||
153 | si2 = debug_atomic_read(&softirqs_off_events), | ||
154 | sr1 = debug_atomic_read(&redundant_softirqs_on), | ||
155 | sr2 = debug_atomic_read(&redundant_softirqs_off); | ||
156 | |||
157 | seq_printf(m, " chain lookup misses: %11u\n", | ||
158 | debug_atomic_read(&chain_lookup_misses)); | ||
159 | seq_printf(m, " chain lookup hits: %11u\n", | ||
160 | debug_atomic_read(&chain_lookup_hits)); | ||
161 | seq_printf(m, " cyclic checks: %11u\n", | ||
162 | debug_atomic_read(&nr_cyclic_checks)); | ||
163 | seq_printf(m, " cyclic-check recursions: %11u\n", | ||
164 | debug_atomic_read(&nr_cyclic_check_recursions)); | ||
165 | seq_printf(m, " find-mask forwards checks: %11u\n", | ||
166 | debug_atomic_read(&nr_find_usage_forwards_checks)); | ||
167 | seq_printf(m, " find-mask forwards recursions: %11u\n", | ||
168 | debug_atomic_read(&nr_find_usage_forwards_recursions)); | ||
169 | seq_printf(m, " find-mask backwards checks: %11u\n", | ||
170 | debug_atomic_read(&nr_find_usage_backwards_checks)); | ||
171 | seq_printf(m, " find-mask backwards recursions:%11u\n", | ||
172 | debug_atomic_read(&nr_find_usage_backwards_recursions)); | ||
173 | |||
174 | seq_printf(m, " hardirq on events: %11u\n", hi1); | ||
175 | seq_printf(m, " hardirq off events: %11u\n", hi2); | ||
176 | seq_printf(m, " redundant hardirq ons: %11u\n", hr1); | ||
177 | seq_printf(m, " redundant hardirq offs: %11u\n", hr2); | ||
178 | seq_printf(m, " softirq on events: %11u\n", si1); | ||
179 | seq_printf(m, " softirq off events: %11u\n", si2); | ||
180 | seq_printf(m, " redundant softirq ons: %11u\n", sr1); | ||
181 | seq_printf(m, " redundant softirq offs: %11u\n", sr2); | ||
182 | #endif | ||
183 | } | ||
184 | |||
185 | static int lockdep_stats_show(struct seq_file *m, void *v) | ||
186 | { | ||
187 | struct lock_class *class; | ||
188 | unsigned long nr_unused = 0, nr_uncategorized = 0, | ||
189 | nr_irq_safe = 0, nr_irq_unsafe = 0, | ||
190 | nr_softirq_safe = 0, nr_softirq_unsafe = 0, | ||
191 | nr_hardirq_safe = 0, nr_hardirq_unsafe = 0, | ||
192 | nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, | ||
193 | nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, | ||
194 | nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, | ||
195 | sum_forward_deps = 0, factor = 0; | ||
196 | |||
197 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | ||
198 | |||
199 | if (class->usage_mask == 0) | ||
200 | nr_unused++; | ||
201 | if (class->usage_mask == LOCKF_USED) | ||
202 | nr_uncategorized++; | ||
203 | if (class->usage_mask & LOCKF_USED_IN_IRQ) | ||
204 | nr_irq_safe++; | ||
205 | if (class->usage_mask & LOCKF_ENABLED_IRQS) | ||
206 | nr_irq_unsafe++; | ||
207 | if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) | ||
208 | nr_softirq_safe++; | ||
209 | if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS) | ||
210 | nr_softirq_unsafe++; | ||
211 | if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) | ||
212 | nr_hardirq_safe++; | ||
213 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS) | ||
214 | nr_hardirq_unsafe++; | ||
215 | if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) | ||
216 | nr_irq_read_safe++; | ||
217 | if (class->usage_mask & LOCKF_ENABLED_IRQS_READ) | ||
218 | nr_irq_read_unsafe++; | ||
219 | if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) | ||
220 | nr_softirq_read_safe++; | ||
221 | if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ) | ||
222 | nr_softirq_read_unsafe++; | ||
223 | if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) | ||
224 | nr_hardirq_read_safe++; | ||
225 | if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) | ||
226 | nr_hardirq_read_unsafe++; | ||
227 | |||
228 | sum_forward_deps += count_forward_deps(class); | ||
229 | } | ||
230 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
231 | DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); | ||
232 | #endif | ||
233 | seq_printf(m, " lock-classes: %11lu [max: %lu]\n", | ||
234 | nr_lock_classes, MAX_LOCKDEP_KEYS); | ||
235 | seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", | ||
236 | nr_list_entries, MAX_LOCKDEP_ENTRIES); | ||
237 | seq_printf(m, " indirect dependencies: %11lu\n", | ||
238 | sum_forward_deps); | ||
239 | |||
240 | /* | ||
241 | * Total number of dependencies: | ||
242 | * | ||
243 | * Every irq-safe lock may nest inside every irq-unsafe lock, | ||
244 | * plus all the other known dependencies: | ||
245 | */ | ||
246 | seq_printf(m, " all direct dependencies: %11lu\n", | ||
247 | nr_irq_unsafe * nr_irq_safe + | ||
248 | nr_hardirq_unsafe * nr_hardirq_safe + | ||
249 | nr_list_entries); | ||
250 | |||
251 | /* | ||
252 | * Estimated factor between direct and indirect | ||
253 | * dependencies: | ||
254 | */ | ||
255 | if (nr_list_entries) | ||
256 | factor = sum_forward_deps / nr_list_entries; | ||
257 | |||
258 | seq_printf(m, " dependency chains: %11lu [max: %lu]\n", | ||
259 | nr_lock_chains, MAX_LOCKDEP_CHAINS); | ||
260 | |||
261 | #ifdef CONFIG_TRACE_IRQFLAGS | ||
262 | seq_printf(m, " in-hardirq chains: %11u\n", | ||
263 | nr_hardirq_chains); | ||
264 | seq_printf(m, " in-softirq chains: %11u\n", | ||
265 | nr_softirq_chains); | ||
266 | #endif | ||
267 | seq_printf(m, " in-process chains: %11u\n", | ||
268 | nr_process_chains); | ||
269 | seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", | ||
270 | nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); | ||
271 | seq_printf(m, " combined max dependencies: %11u\n", | ||
272 | (nr_hardirq_chains + 1) * | ||
273 | (nr_softirq_chains + 1) * | ||
274 | (nr_process_chains + 1) | ||
275 | ); | ||
276 | seq_printf(m, " hardirq-safe locks: %11lu\n", | ||
277 | nr_hardirq_safe); | ||
278 | seq_printf(m, " hardirq-unsafe locks: %11lu\n", | ||
279 | nr_hardirq_unsafe); | ||
280 | seq_printf(m, " softirq-safe locks: %11lu\n", | ||
281 | nr_softirq_safe); | ||
282 | seq_printf(m, " softirq-unsafe locks: %11lu\n", | ||
283 | nr_softirq_unsafe); | ||
284 | seq_printf(m, " irq-safe locks: %11lu\n", | ||
285 | nr_irq_safe); | ||
286 | seq_printf(m, " irq-unsafe locks: %11lu\n", | ||
287 | nr_irq_unsafe); | ||
288 | |||
289 | seq_printf(m, " hardirq-read-safe locks: %11lu\n", | ||
290 | nr_hardirq_read_safe); | ||
291 | seq_printf(m, " hardirq-read-unsafe locks: %11lu\n", | ||
292 | nr_hardirq_read_unsafe); | ||
293 | seq_printf(m, " softirq-read-safe locks: %11lu\n", | ||
294 | nr_softirq_read_safe); | ||
295 | seq_printf(m, " softirq-read-unsafe locks: %11lu\n", | ||
296 | nr_softirq_read_unsafe); | ||
297 | seq_printf(m, " irq-read-safe locks: %11lu\n", | ||
298 | nr_irq_read_safe); | ||
299 | seq_printf(m, " irq-read-unsafe locks: %11lu\n", | ||
300 | nr_irq_read_unsafe); | ||
301 | |||
302 | seq_printf(m, " uncategorized locks: %11lu\n", | ||
303 | nr_uncategorized); | ||
304 | seq_printf(m, " unused locks: %11lu\n", | ||
305 | nr_unused); | ||
306 | seq_printf(m, " max locking depth: %11u\n", | ||
307 | max_lockdep_depth); | ||
308 | seq_printf(m, " max recursion depth: %11u\n", | ||
309 | max_recursion_depth); | ||
310 | lockdep_stats_debug_show(m); | ||
311 | seq_printf(m, " debug_locks: %11u\n", | ||
312 | debug_locks); | ||
313 | |||
314 | return 0; | ||
315 | } | ||
316 | |||
317 | static int lockdep_stats_open(struct inode *inode, struct file *file) | ||
318 | { | ||
319 | return single_open(file, lockdep_stats_show, NULL); | ||
320 | } | ||
321 | |||
322 | static struct file_operations proc_lockdep_stats_operations = { | ||
323 | .open = lockdep_stats_open, | ||
324 | .read = seq_read, | ||
325 | .llseek = seq_lseek, | ||
326 | .release = seq_release, | ||
327 | }; | ||
328 | |||
329 | static int __init lockdep_proc_init(void) | ||
330 | { | ||
331 | struct proc_dir_entry *entry; | ||
332 | |||
333 | entry = create_proc_entry("lockdep", S_IRUSR, NULL); | ||
334 | if (entry) | ||
335 | entry->proc_fops = &proc_lockdep_operations; | ||
336 | |||
337 | entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL); | ||
338 | if (entry) | ||
339 | entry->proc_fops = &proc_lockdep_stats_operations; | ||
340 | |||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | __initcall(lockdep_proc_init); | ||
345 | |||
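A small worked example of the two estimates printed by lockdep_stats_show() above, with numbers invented purely for illustration: given 50 irq-safe and 120 irq-unsafe classes, 40 hardirq-safe and 100 hardirq-unsafe classes, and 3000 recorded list entries, "all direct dependencies" is reported as 120*50 + 100*40 + 3000 = 6000 + 4000 + 3000 = 13000. Likewise, with 10 in-hardirq, 20 in-softirq and 300 in-process chains, "combined max dependencies" comes out as (10+1) * (20+1) * (300+1) = 11 * 21 * 301 = 69531.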
diff --git a/kernel/module.c b/kernel/module.c index bbe04862e1b0..2a19cd47c046 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* Rewritten by Rusty Russell, on the backs of many others... | 1 | /* |
2 | Copyright (C) 2002 Richard Henderson | 2 | Copyright (C) 2002 Richard Henderson |
3 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. | 3 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. |
4 | 4 | ||
@@ -16,7 +16,6 @@ | |||
16 | along with this program; if not, write to the Free Software | 16 | along with this program; if not, write to the Free Software |
17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
18 | */ | 18 | */ |
19 | #include <linux/config.h> | ||
20 | #include <linux/module.h> | 19 | #include <linux/module.h> |
21 | #include <linux/moduleloader.h> | 20 | #include <linux/moduleloader.h> |
22 | #include <linux/init.h> | 21 | #include <linux/init.h> |
@@ -40,9 +39,11 @@ | |||
40 | #include <linux/string.h> | 39 | #include <linux/string.h> |
41 | #include <linux/sched.h> | 40 | #include <linux/sched.h> |
42 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/unwind.h> | ||
43 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
44 | #include <asm/semaphore.h> | 44 | #include <asm/semaphore.h> |
45 | #include <asm/cacheflush.h> | 45 | #include <asm/cacheflush.h> |
46 | #include <linux/license.h> | ||
46 | 47 | ||
47 | #if 0 | 48 | #if 0 |
48 | #define DEBUGP printk | 49 | #define DEBUGP printk |
@@ -120,9 +121,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; | |||
120 | extern const struct kernel_symbol __stop___ksymtab_gpl[]; | 121 | extern const struct kernel_symbol __stop___ksymtab_gpl[]; |
121 | extern const struct kernel_symbol __start___ksymtab_gpl_future[]; | 122 | extern const struct kernel_symbol __start___ksymtab_gpl_future[]; |
122 | extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; | 123 | extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; |
124 | extern const struct kernel_symbol __start___ksymtab_unused[]; | ||
125 | extern const struct kernel_symbol __stop___ksymtab_unused[]; | ||
126 | extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; | ||
127 | extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; | ||
128 | extern const struct kernel_symbol __start___ksymtab_gpl_future[]; | ||
129 | extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; | ||
123 | extern const unsigned long __start___kcrctab[]; | 130 | extern const unsigned long __start___kcrctab[]; |
124 | extern const unsigned long __start___kcrctab_gpl[]; | 131 | extern const unsigned long __start___kcrctab_gpl[]; |
125 | extern const unsigned long __start___kcrctab_gpl_future[]; | 132 | extern const unsigned long __start___kcrctab_gpl_future[]; |
133 | extern const unsigned long __start___kcrctab_unused[]; | ||
134 | extern const unsigned long __start___kcrctab_unused_gpl[]; | ||
126 | 135 | ||
127 | #ifndef CONFIG_MODVERSIONS | 136 | #ifndef CONFIG_MODVERSIONS |
128 | #define symversion(base, idx) NULL | 137 | #define symversion(base, idx) NULL |
@@ -142,6 +151,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name, | |||
142 | return NULL; | 151 | return NULL; |
143 | } | 152 | } |
144 | 153 | ||
154 | static void printk_unused_warning(const char *name) | ||
155 | { | ||
156 | printk(KERN_WARNING "Symbol %s is marked as UNUSED, " | ||
157 | "however this module is using it.\n", name); | ||
158 | printk(KERN_WARNING "This symbol will go away in the future.\n"); | ||
159 | printk(KERN_WARNING "Please evalute if this is the right api to use, " | ||
160 | "and if it really is, submit a report the linux kernel " | ||
161 | "mailinglist together with submitting your code for " | ||
162 | "inclusion.\n"); | ||
163 | } | ||
164 | |||
145 | /* Find a symbol, return value, crc and module which owns it */ | 165 | /* Find a symbol, return value, crc and module which owns it */ |
146 | static unsigned long __find_symbol(const char *name, | 166 | static unsigned long __find_symbol(const char *name, |
147 | struct module **owner, | 167 | struct module **owner, |
@@ -184,6 +204,25 @@ static unsigned long __find_symbol(const char *name, | |||
184 | return ks->value; | 204 | return ks->value; |
185 | } | 205 | } |
186 | 206 | ||
207 | ks = lookup_symbol(name, __start___ksymtab_unused, | ||
208 | __stop___ksymtab_unused); | ||
209 | if (ks) { | ||
210 | printk_unused_warning(name); | ||
211 | *crc = symversion(__start___kcrctab_unused, | ||
212 | (ks - __start___ksymtab_unused)); | ||
213 | return ks->value; | ||
214 | } | ||
215 | |||
216 | if (gplok) | ||
217 | ks = lookup_symbol(name, __start___ksymtab_unused_gpl, | ||
218 | __stop___ksymtab_unused_gpl); | ||
219 | if (ks) { | ||
220 | printk_unused_warning(name); | ||
221 | *crc = symversion(__start___kcrctab_unused_gpl, | ||
222 | (ks - __start___ksymtab_unused_gpl)); | ||
223 | return ks->value; | ||
224 | } | ||
225 | |||
187 | /* Now try modules. */ | 226 | /* Now try modules. */ |
188 | list_for_each_entry(mod, &modules, list) { | 227 | list_for_each_entry(mod, &modules, list) { |
189 | *owner = mod; | 228 | *owner = mod; |
@@ -202,6 +241,23 @@ static unsigned long __find_symbol(const char *name, | |||
202 | return ks->value; | 241 | return ks->value; |
203 | } | 242 | } |
204 | } | 243 | } |
244 | ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms); | ||
245 | if (ks) { | ||
246 | printk_unused_warning(name); | ||
247 | *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms)); | ||
248 | return ks->value; | ||
249 | } | ||
250 | |||
251 | if (gplok) { | ||
252 | ks = lookup_symbol(name, mod->unused_gpl_syms, | ||
253 | mod->unused_gpl_syms + mod->num_unused_gpl_syms); | ||
254 | if (ks) { | ||
255 | printk_unused_warning(name); | ||
256 | *crc = symversion(mod->unused_gpl_crcs, | ||
257 | (ks - mod->unused_gpl_syms)); | ||
258 | return ks->value; | ||
259 | } | ||
260 | } | ||
205 | ks = lookup_symbol(name, mod->gpl_future_syms, | 261 | ks = lookup_symbol(name, mod->gpl_future_syms, |
206 | (mod->gpl_future_syms + | 262 | (mod->gpl_future_syms + |
207 | mod->num_gpl_future_syms)); | 263 | mod->num_gpl_future_syms)); |
@@ -1051,6 +1107,8 @@ static void free_module(struct module *mod) | |||
1051 | remove_sect_attrs(mod); | 1107 | remove_sect_attrs(mod); |
1052 | mod_kobject_remove(mod); | 1108 | mod_kobject_remove(mod); |
1053 | 1109 | ||
1110 | unwind_remove_table(mod->unwind_info, 0); | ||
1111 | |||
1054 | /* Arch-specific cleanup. */ | 1112 | /* Arch-specific cleanup. */ |
1055 | module_arch_cleanup(mod); | 1113 | module_arch_cleanup(mod); |
1056 | 1114 | ||
@@ -1063,6 +1121,9 @@ static void free_module(struct module *mod) | |||
1063 | if (mod->percpu) | 1121 | if (mod->percpu) |
1064 | percpu_modfree(mod->percpu); | 1122 | percpu_modfree(mod->percpu); |
1065 | 1123 | ||
1124 | /* Free lock-classes: */ | ||
1125 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
1126 | |||
1066 | /* Finally, free the core (containing the module structure) */ | 1127 | /* Finally, free the core (containing the module structure) */ |
1067 | module_free(mod, mod->module_core); | 1128 | module_free(mod, mod->module_core); |
1068 | } | 1129 | } |
@@ -1248,16 +1309,6 @@ static void layout_sections(struct module *mod, | |||
1248 | } | 1309 | } |
1249 | } | 1310 | } |
1250 | 1311 | ||
1251 | static inline int license_is_gpl_compatible(const char *license) | ||
1252 | { | ||
1253 | return (strcmp(license, "GPL") == 0 | ||
1254 | || strcmp(license, "GPL v2") == 0 | ||
1255 | || strcmp(license, "GPL and additional rights") == 0 | ||
1256 | || strcmp(license, "Dual BSD/GPL") == 0 | ||
1257 | || strcmp(license, "Dual MIT/GPL") == 0 | ||
1258 | || strcmp(license, "Dual MPL/GPL") == 0); | ||
1259 | } | ||
1260 | |||
1261 | static void set_license(struct module *mod, const char *license) | 1312 | static void set_license(struct module *mod, const char *license) |
1262 | { | 1313 | { |
1263 | if (!license) | 1314 | if (!license) |
@@ -1326,7 +1377,7 @@ int is_exported(const char *name, const struct module *mod) | |||
1326 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) | 1377 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) |
1327 | return 1; | 1378 | return 1; |
1328 | else | 1379 | else |
1329 | if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) | 1380 | if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) |
1330 | return 1; | 1381 | return 1; |
1331 | else | 1382 | else |
1332 | return 0; | 1383 | return 0; |
@@ -1409,10 +1460,27 @@ static struct module *load_module(void __user *umod, | |||
1409 | Elf_Ehdr *hdr; | 1460 | Elf_Ehdr *hdr; |
1410 | Elf_Shdr *sechdrs; | 1461 | Elf_Shdr *sechdrs; |
1411 | char *secstrings, *args, *modmagic, *strtab = NULL; | 1462 | char *secstrings, *args, *modmagic, *strtab = NULL; |
1412 | unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, | 1463 | unsigned int i; |
1413 | exportindex, modindex, obsparmindex, infoindex, gplindex, | 1464 | unsigned int symindex = 0; |
1414 | crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, | 1465 | unsigned int strindex = 0; |
1415 | gplfuturecrcindex; | 1466 | unsigned int setupindex; |
1467 | unsigned int exindex; | ||
1468 | unsigned int exportindex; | ||
1469 | unsigned int modindex; | ||
1470 | unsigned int obsparmindex; | ||
1471 | unsigned int infoindex; | ||
1472 | unsigned int gplindex; | ||
1473 | unsigned int crcindex; | ||
1474 | unsigned int gplcrcindex; | ||
1475 | unsigned int versindex; | ||
1476 | unsigned int pcpuindex; | ||
1477 | unsigned int gplfutureindex; | ||
1478 | unsigned int gplfuturecrcindex; | ||
1479 | unsigned int unwindex = 0; | ||
1480 | unsigned int unusedindex; | ||
1481 | unsigned int unusedcrcindex; | ||
1482 | unsigned int unusedgplindex; | ||
1483 | unsigned int unusedgplcrcindex; | ||
1416 | struct module *mod; | 1484 | struct module *mod; |
1417 | long err = 0; | 1485 | long err = 0; |
1418 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ | 1486 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ |
@@ -1493,15 +1561,22 @@ static struct module *load_module(void __user *umod, | |||
1493 | exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); | 1561 | exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); |
1494 | gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); | 1562 | gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); |
1495 | gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); | 1563 | gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); |
1564 | unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); | ||
1565 | unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); | ||
1496 | crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); | 1566 | crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); |
1497 | gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); | 1567 | gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); |
1498 | gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); | 1568 | gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); |
1569 | unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); | ||
1570 | unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); | ||
1499 | setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); | 1571 | setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); |
1500 | exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); | 1572 | exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); |
1501 | obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); | 1573 | obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); |
1502 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); | 1574 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); |
1503 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); | 1575 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); |
1504 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); | 1576 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); |
1577 | #ifdef ARCH_UNWIND_SECTION_NAME | ||
1578 | unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); | ||
1579 | #endif | ||
1505 | 1580 | ||
1506 | /* Don't keep modinfo section */ | 1581 | /* Don't keep modinfo section */ |
1507 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | 1582 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; |
@@ -1510,6 +1585,8 @@ static struct module *load_module(void __user *umod, | |||
1510 | sechdrs[symindex].sh_flags |= SHF_ALLOC; | 1585 | sechdrs[symindex].sh_flags |= SHF_ALLOC; |
1511 | sechdrs[strindex].sh_flags |= SHF_ALLOC; | 1586 | sechdrs[strindex].sh_flags |= SHF_ALLOC; |
1512 | #endif | 1587 | #endif |
1588 | if (unwindex) | ||
1589 | sechdrs[unwindex].sh_flags |= SHF_ALLOC; | ||
1513 | 1590 | ||
1514 | /* Check module struct version now, before we try to use module. */ | 1591 | /* Check module struct version now, before we try to use module. */ |
1515 | if (!check_modstruct_version(sechdrs, versindex, mod)) { | 1592 | if (!check_modstruct_version(sechdrs, versindex, mod)) { |
@@ -1639,14 +1716,27 @@ static struct module *load_module(void __user *umod, | |||
1639 | mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; | 1716 | mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; |
1640 | mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / | 1717 | mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / |
1641 | sizeof(*mod->gpl_future_syms); | 1718 | sizeof(*mod->gpl_future_syms); |
1719 | mod->num_unused_syms = sechdrs[unusedindex].sh_size / | ||
1720 | sizeof(*mod->unused_syms); | ||
1721 | mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / | ||
1722 | sizeof(*mod->unused_gpl_syms); | ||
1642 | mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; | 1723 | mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; |
1643 | if (gplfuturecrcindex) | 1724 | if (gplfuturecrcindex) |
1644 | mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; | 1725 | mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; |
1645 | 1726 | ||
1727 | mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; | ||
1728 | if (unusedcrcindex) | ||
1729 | mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; | ||
1730 | mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; | ||
1731 | if (unusedgplcrcindex) | ||
1732 | mod->unused_gpl_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; | ||
1733 | |||
1646 | #ifdef CONFIG_MODVERSIONS | 1734 | #ifdef CONFIG_MODVERSIONS |
1647 | if ((mod->num_syms && !crcindex) || | 1735 | if ((mod->num_syms && !crcindex) || |
1648 | (mod->num_gpl_syms && !gplcrcindex) || | 1736 | (mod->num_gpl_syms && !gplcrcindex) || |
1649 | (mod->num_gpl_future_syms && !gplfuturecrcindex)) { | 1737 | (mod->num_gpl_future_syms && !gplfuturecrcindex) || |
1738 | (mod->num_unused_syms && !unusedcrcindex) || | ||
1739 | (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { | ||
1650 | printk(KERN_WARNING "%s: No versions for exported symbols." | 1740 | printk(KERN_WARNING "%s: No versions for exported symbols." |
1651 | " Tainting kernel.\n", mod->name); | 1741 | " Tainting kernel.\n", mod->name); |
1652 | add_taint(TAINT_FORCED_MODULE); | 1742 | add_taint(TAINT_FORCED_MODULE); |
@@ -1738,6 +1828,11 @@ static struct module *load_module(void __user *umod, | |||
1738 | goto arch_cleanup; | 1828 | goto arch_cleanup; |
1739 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 1829 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
1740 | 1830 | ||
1831 | /* Size of section 0 is 0, so this works well if no unwind info. */ | ||
1832 | mod->unwind_info = unwind_add_table(mod, | ||
1833 | (void *)sechdrs[unwindex].sh_addr, | ||
1834 | sechdrs[unwindex].sh_size); | ||
1835 | |||
1741 | /* Get rid of temporary copy */ | 1836 | /* Get rid of temporary copy */ |
1742 | vfree(hdr); | 1837 | vfree(hdr); |
1743 | 1838 | ||
@@ -1836,6 +1931,7 @@ sys_init_module(void __user *umod, | |||
1836 | mod->state = MODULE_STATE_LIVE; | 1931 | mod->state = MODULE_STATE_LIVE; |
1837 | /* Drop initial reference. */ | 1932 | /* Drop initial reference. */ |
1838 | module_put(mod); | 1933 | module_put(mod); |
1934 | unwind_remove_table(mod->unwind_info, 1); | ||
1839 | module_free(mod, mod->module_init); | 1935 | module_free(mod, mod->module_init); |
1840 | mod->module_init = NULL; | 1936 | mod->module_init = NULL; |
1841 | mod->init_size = 0; | 1937 | mod->init_size = 0; |
@@ -1923,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr, | |||
1923 | return NULL; | 2019 | return NULL; |
1924 | } | 2020 | } |
1925 | 2021 | ||
1926 | struct module *module_get_kallsym(unsigned int symnum, | 2022 | struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, |
1927 | unsigned long *value, | 2023 | char *type, char *name, size_t namelen) |
1928 | char *type, | ||
1929 | char namebuf[128]) | ||
1930 | { | 2024 | { |
1931 | struct module *mod; | 2025 | struct module *mod; |
1932 | 2026 | ||
@@ -1935,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum, | |||
1935 | if (symnum < mod->num_symtab) { | 2029 | if (symnum < mod->num_symtab) { |
1936 | *value = mod->symtab[symnum].st_value; | 2030 | *value = mod->symtab[symnum].st_value; |
1937 | *type = mod->symtab[symnum].st_info; | 2031 | *type = mod->symtab[symnum].st_info; |
1938 | strncpy(namebuf, | 2032 | strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, |
1939 | mod->strtab + mod->symtab[symnum].st_name, | 2033 | namelen); |
1940 | 127); | ||
1941 | mutex_unlock(&module_mutex); | 2034 | mutex_unlock(&module_mutex); |
1942 | return mod; | 2035 | return mod; |
1943 | } | 2036 | } |
@@ -2066,6 +2159,29 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) | |||
2066 | return e; | 2159 | return e; |
2067 | } | 2160 | } |
2068 | 2161 | ||
2162 | /* | ||
2163 | * Is this a valid module address? | ||
2164 | */ | ||
2165 | int is_module_address(unsigned long addr) | ||
2166 | { | ||
2167 | unsigned long flags; | ||
2168 | struct module *mod; | ||
2169 | |||
2170 | spin_lock_irqsave(&modlist_lock, flags); | ||
2171 | |||
2172 | list_for_each_entry(mod, &modules, list) { | ||
2173 | if (within(addr, mod->module_core, mod->core_size)) { | ||
2174 | spin_unlock_irqrestore(&modlist_lock, flags); | ||
2175 | return 1; | ||
2176 | } | ||
2177 | } | ||
2178 | |||
2179 | spin_unlock_irqrestore(&modlist_lock, flags); | ||
2180 | |||
2181 | return 0; | ||
2182 | } | ||
2183 | |||
2184 | |||
2069 | /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ | 2185 | /* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ |
2070 | struct module *__module_text_address(unsigned long addr) | 2186 | struct module *__module_text_address(unsigned long addr) |
2071 | { | 2187 | { |
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index f4913c376950..e3203c654dda 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -16,395 +16,48 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/poison.h> | ||
19 | #include <linux/spinlock.h> | 20 | #include <linux/spinlock.h> |
20 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
21 | #include <linux/interrupt.h> | 22 | #include <linux/interrupt.h> |
23 | #include <linux/debug_locks.h> | ||
22 | 24 | ||
23 | #include "mutex-debug.h" | 25 | #include "mutex-debug.h" |
24 | 26 | ||
25 | /* | 27 | /* |
26 | * We need a global lock when we walk through the multi-process | ||
27 | * lock tree. Only used in the deadlock-debugging case. | ||
28 | */ | ||
29 | DEFINE_SPINLOCK(debug_mutex_lock); | ||
30 | |||
31 | /* | ||
32 | * All locks held by all tasks, in a single global list: | ||
33 | */ | ||
34 | LIST_HEAD(debug_mutex_held_locks); | ||
35 | |||
36 | /* | ||
37 | * In the debug case we carry the caller's instruction pointer into | ||
38 | * other functions, but we dont want the function argument overhead | ||
39 | * in the nondebug case - hence these macros: | ||
40 | */ | ||
41 | #define __IP_DECL__ , unsigned long ip | ||
42 | #define __IP__ , ip | ||
43 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
44 | |||
45 | /* | ||
46 | * "mutex debugging enabled" flag. We turn it off when we detect | ||
47 | * the first problem because we dont want to recurse back | ||
48 | * into the tracing code when doing error printk or | ||
49 | * executing a BUG(): | ||
50 | */ | ||
51 | int debug_mutex_on = 1; | ||
52 | |||
53 | static void printk_task(struct task_struct *p) | ||
54 | { | ||
55 | if (p) | ||
56 | printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
57 | else | ||
58 | printk("<none>"); | ||
59 | } | ||
60 | |||
61 | static void printk_ti(struct thread_info *ti) | ||
62 | { | ||
63 | if (ti) | ||
64 | printk_task(ti->task); | ||
65 | else | ||
66 | printk("<none>"); | ||
67 | } | ||
68 | |||
69 | static void printk_task_short(struct task_struct *p) | ||
70 | { | ||
71 | if (p) | ||
72 | printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
73 | else | ||
74 | printk("<none>"); | ||
75 | } | ||
76 | |||
77 | static void printk_lock(struct mutex *lock, int print_owner) | ||
78 | { | ||
79 | printk(" [%p] {%s}\n", lock, lock->name); | ||
80 | |||
81 | if (print_owner && lock->owner) { | ||
82 | printk(".. held by: "); | ||
83 | printk_ti(lock->owner); | ||
84 | printk("\n"); | ||
85 | } | ||
86 | if (lock->owner) { | ||
87 | printk("... acquired at: "); | ||
88 | print_symbol("%s\n", lock->acquire_ip); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * printk locks held by a task: | ||
94 | */ | ||
95 | static void show_task_locks(struct task_struct *p) | ||
96 | { | ||
97 | switch (p->state) { | ||
98 | case TASK_RUNNING: printk("R"); break; | ||
99 | case TASK_INTERRUPTIBLE: printk("S"); break; | ||
100 | case TASK_UNINTERRUPTIBLE: printk("D"); break; | ||
101 | case TASK_STOPPED: printk("T"); break; | ||
102 | case EXIT_ZOMBIE: printk("Z"); break; | ||
103 | case EXIT_DEAD: printk("X"); break; | ||
104 | default: printk("?"); break; | ||
105 | } | ||
106 | printk_task(p); | ||
107 | if (p->blocked_on) { | ||
108 | struct mutex *lock = p->blocked_on->lock; | ||
109 | |||
110 | printk(" blocked on mutex:"); | ||
111 | printk_lock(lock, 1); | ||
112 | } else | ||
113 | printk(" (not blocked on mutex)\n"); | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * printk all locks held in the system (if filter == NULL), | ||
118 | * or all locks belonging to a single task (if filter != NULL): | ||
119 | */ | ||
120 | void show_held_locks(struct task_struct *filter) | ||
121 | { | ||
122 | struct list_head *curr, *cursor = NULL; | ||
123 | struct mutex *lock; | ||
124 | struct thread_info *t; | ||
125 | unsigned long flags; | ||
126 | int count = 0; | ||
127 | |||
128 | if (filter) { | ||
129 | printk("------------------------------\n"); | ||
130 | printk("| showing all locks held by: | ("); | ||
131 | printk_task_short(filter); | ||
132 | printk("):\n"); | ||
133 | printk("------------------------------\n"); | ||
134 | } else { | ||
135 | printk("---------------------------\n"); | ||
136 | printk("| showing all locks held: |\n"); | ||
137 | printk("---------------------------\n"); | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Play safe and acquire the global trace lock. We | ||
142 | * cannot printk with that lock held so we iterate | ||
143 | * very carefully: | ||
144 | */ | ||
145 | next: | ||
146 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
147 | list_for_each(curr, &debug_mutex_held_locks) { | ||
148 | if (cursor && curr != cursor) | ||
149 | continue; | ||
150 | lock = list_entry(curr, struct mutex, held_list); | ||
151 | t = lock->owner; | ||
152 | if (filter && (t != filter->thread_info)) | ||
153 | continue; | ||
154 | count++; | ||
155 | cursor = curr->next; | ||
156 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
157 | |||
158 | printk("\n#%03d: ", count); | ||
159 | printk_lock(lock, filter ? 0 : 1); | ||
160 | goto next; | ||
161 | } | ||
162 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
163 | printk("\n"); | ||
164 | } | ||
165 | |||
166 | void mutex_debug_show_all_locks(void) | ||
167 | { | ||
168 | struct task_struct *g, *p; | ||
169 | int count = 10; | ||
170 | int unlock = 1; | ||
171 | |||
172 | printk("\nShowing all blocking locks in the system:\n"); | ||
173 | |||
174 | /* | ||
175 | * Here we try to get the tasklist_lock as hard as possible, | ||
176 | * if not successful after 2 seconds we ignore it (but keep | ||
177 | * trying). This is to enable a debug printout even if a | ||
178 | * tasklist_lock-holding task deadlocks or crashes. | ||
179 | */ | ||
180 | retry: | ||
181 | if (!read_trylock(&tasklist_lock)) { | ||
182 | if (count == 10) | ||
183 | printk("hm, tasklist_lock locked, retrying... "); | ||
184 | if (count) { | ||
185 | count--; | ||
186 | printk(" #%d", 10-count); | ||
187 | mdelay(200); | ||
188 | goto retry; | ||
189 | } | ||
190 | printk(" ignoring it.\n"); | ||
191 | unlock = 0; | ||
192 | } | ||
193 | if (count != 10) | ||
194 | printk(" locked it.\n"); | ||
195 | |||
196 | do_each_thread(g, p) { | ||
197 | show_task_locks(p); | ||
198 | if (!unlock) | ||
199 | if (read_trylock(&tasklist_lock)) | ||
200 | unlock = 1; | ||
201 | } while_each_thread(g, p); | ||
202 | |||
203 | printk("\n"); | ||
204 | show_held_locks(NULL); | ||
205 | printk("=============================================\n\n"); | ||
206 | |||
207 | if (unlock) | ||
208 | read_unlock(&tasklist_lock); | ||
209 | } | ||
210 | |||
211 | static void report_deadlock(struct task_struct *task, struct mutex *lock, | ||
212 | struct mutex *lockblk, unsigned long ip) | ||
213 | { | ||
214 | printk("\n%s/%d is trying to acquire this lock:\n", | ||
215 | current->comm, current->pid); | ||
216 | printk_lock(lock, 1); | ||
217 | printk("... trying at: "); | ||
218 | print_symbol("%s\n", ip); | ||
219 | show_held_locks(current); | ||
220 | |||
221 | if (lockblk) { | ||
222 | printk("but %s/%d is deadlocking current task %s/%d!\n\n", | ||
223 | task->comm, task->pid, current->comm, current->pid); | ||
224 | printk("\n%s/%d is blocked on this lock:\n", | ||
225 | task->comm, task->pid); | ||
226 | printk_lock(lockblk, 1); | ||
227 | |||
228 | show_held_locks(task); | ||
229 | |||
230 | printk("\n%s/%d's [blocked] stackdump:\n\n", | ||
231 | task->comm, task->pid); | ||
232 | show_stack(task, NULL); | ||
233 | } | ||
234 | |||
235 | printk("\n%s/%d's [current] stackdump:\n\n", | ||
236 | current->comm, current->pid); | ||
237 | dump_stack(); | ||
238 | mutex_debug_show_all_locks(); | ||
239 | printk("[ turning off deadlock detection. Please report this. ]\n\n"); | ||
240 | local_irq_disable(); | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Recursively check for mutex deadlocks: | ||
245 | */ | ||
246 | static int check_deadlock(struct mutex *lock, int depth, | ||
247 | struct thread_info *ti, unsigned long ip) | ||
248 | { | ||
249 | struct mutex *lockblk; | ||
250 | struct task_struct *task; | ||
251 | |||
252 | if (!debug_mutex_on) | ||
253 | return 0; | ||
254 | |||
255 | ti = lock->owner; | ||
256 | if (!ti) | ||
257 | return 0; | ||
258 | |||
259 | task = ti->task; | ||
260 | lockblk = NULL; | ||
261 | if (task->blocked_on) | ||
262 | lockblk = task->blocked_on->lock; | ||
263 | |||
264 | /* Self-deadlock: */ | ||
265 | if (current == task) { | ||
266 | DEBUG_OFF(); | ||
267 | if (depth) | ||
268 | return 1; | ||
269 | printk("\n==========================================\n"); | ||
270 | printk( "[ BUG: lock recursion deadlock detected! |\n"); | ||
271 | printk( "------------------------------------------\n"); | ||
272 | report_deadlock(task, lock, NULL, ip); | ||
273 | return 0; | ||
274 | } | ||
275 | |||
276 | /* Ugh, something corrupted the lock data structure? */ | ||
277 | if (depth > 20) { | ||
278 | DEBUG_OFF(); | ||
279 | printk("\n===========================================\n"); | ||
280 | printk( "[ BUG: infinite lock dependency detected!? |\n"); | ||
281 | printk( "-------------------------------------------\n"); | ||
282 | report_deadlock(task, lock, lockblk, ip); | ||
283 | return 0; | ||
284 | } | ||
285 | |||
286 | /* Recursively check for dependencies: */ | ||
287 | if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) { | ||
288 | printk("\n============================================\n"); | ||
289 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | ||
290 | printk( "--------------------------------------------\n"); | ||
291 | report_deadlock(task, lock, lockblk, ip); | ||
292 | return 0; | ||
293 | } | ||
294 | return 0; | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * Called when a task exits, this function checks whether the | ||
299 | * task is holding any locks, and reports the first one if so: | ||
300 | */ | ||
301 | void mutex_debug_check_no_locks_held(struct task_struct *task) | ||
302 | { | ||
303 | struct list_head *curr, *next; | ||
304 | struct thread_info *t; | ||
305 | unsigned long flags; | ||
306 | struct mutex *lock; | ||
307 | |||
308 | if (!debug_mutex_on) | ||
309 | return; | ||
310 | |||
311 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
312 | list_for_each_safe(curr, next, &debug_mutex_held_locks) { | ||
313 | lock = list_entry(curr, struct mutex, held_list); | ||
314 | t = lock->owner; | ||
315 | if (t != task->thread_info) | ||
316 | continue; | ||
317 | list_del_init(curr); | ||
318 | DEBUG_OFF(); | ||
319 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
320 | |||
321 | printk("BUG: %s/%d, lock held at task exit time!\n", | ||
322 | task->comm, task->pid); | ||
323 | printk_lock(lock, 1); | ||
324 | if (lock->owner != task->thread_info) | ||
325 | printk("exiting task is not even the owner??\n"); | ||
326 | return; | ||
327 | } | ||
328 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
329 | } | ||
330 | |||
331 | /* | ||
332 | * Called when kernel memory is freed (or unmapped), or if a mutex | ||
333 | * is destroyed or reinitialized - this code checks whether there is | ||
334 | * any held lock in the memory range of <from> to <to>: | ||
335 | */ | ||
336 | void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | ||
337 | { | ||
338 | struct list_head *curr, *next; | ||
339 | const void *to = from + len; | ||
340 | unsigned long flags; | ||
341 | struct mutex *lock; | ||
342 | void *lock_addr; | ||
343 | |||
344 | if (!debug_mutex_on) | ||
345 | return; | ||
346 | |||
347 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
348 | list_for_each_safe(curr, next, &debug_mutex_held_locks) { | ||
349 | lock = list_entry(curr, struct mutex, held_list); | ||
350 | lock_addr = lock; | ||
351 | if (lock_addr < from || lock_addr >= to) | ||
352 | continue; | ||
353 | list_del_init(curr); | ||
354 | DEBUG_OFF(); | ||
355 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
356 | |||
357 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", | ||
358 | current->comm, current->pid, lock, from, to); | ||
359 | dump_stack(); | ||
360 | printk_lock(lock, 1); | ||
361 | if (lock->owner != current_thread_info()) | ||
362 | printk("freeing task is not even the owner??\n"); | ||
363 | return; | ||
364 | } | ||
365 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
366 | } | ||
367 | |||
368 | /* | ||
369 | * Must be called with lock->wait_lock held. | 28 | * Must be called with lock->wait_lock held. |
370 | */ | 29 | */ |
371 | void debug_mutex_set_owner(struct mutex *lock, | 30 | void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner) |
372 | struct thread_info *new_owner __IP_DECL__) | ||
373 | { | 31 | { |
374 | lock->owner = new_owner; | 32 | lock->owner = new_owner; |
375 | DEBUG_WARN_ON(!list_empty(&lock->held_list)); | ||
376 | if (debug_mutex_on) { | ||
377 | list_add_tail(&lock->held_list, &debug_mutex_held_locks); | ||
378 | lock->acquire_ip = ip; | ||
379 | } | ||
380 | } | 33 | } |
381 | 34 | ||
382 | void debug_mutex_init_waiter(struct mutex_waiter *waiter) | 35 | void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) |
383 | { | 36 | { |
384 | memset(waiter, 0x11, sizeof(*waiter)); | 37 | memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); |
385 | waiter->magic = waiter; | 38 | waiter->magic = waiter; |
386 | INIT_LIST_HEAD(&waiter->list); | 39 | INIT_LIST_HEAD(&waiter->list); |
387 | } | 40 | } |
388 | 41 | ||
389 | void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) | 42 | void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) |
390 | { | 43 | { |
391 | SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); | 44 | SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); |
392 | DEBUG_WARN_ON(list_empty(&lock->wait_list)); | 45 | DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); |
393 | DEBUG_WARN_ON(waiter->magic != waiter); | 46 | DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); |
394 | DEBUG_WARN_ON(list_empty(&waiter->list)); | 47 | DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); |
395 | } | 48 | } |
396 | 49 | ||
397 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) | 50 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) |
398 | { | 51 | { |
399 | DEBUG_WARN_ON(!list_empty(&waiter->list)); | 52 | DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list)); |
400 | memset(waiter, 0x22, sizeof(*waiter)); | 53 | memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); |
401 | } | 54 | } |
402 | 55 | ||
403 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 56 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
404 | struct thread_info *ti __IP_DECL__) | 57 | struct thread_info *ti) |
405 | { | 58 | { |
406 | SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); | 59 | SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); |
407 | check_deadlock(lock, 0, ti, ip); | 60 | |
408 | /* Mark the current thread as blocked on the lock: */ | 61 | /* Mark the current thread as blocked on the lock: */ |
409 | ti->task->blocked_on = waiter; | 62 | ti->task->blocked_on = waiter; |
410 | waiter->lock = lock; | 63 | waiter->lock = lock; |
@@ -413,9 +66,9 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, | |||
413 | void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 66 | void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
414 | struct thread_info *ti) | 67 | struct thread_info *ti) |
415 | { | 68 | { |
416 | DEBUG_WARN_ON(list_empty(&waiter->list)); | 69 | DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); |
417 | DEBUG_WARN_ON(waiter->task != ti->task); | 70 | DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); |
418 | DEBUG_WARN_ON(ti->task->blocked_on != waiter); | 71 | DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); |
419 | ti->task->blocked_on = NULL; | 72 | ti->task->blocked_on = NULL; |
420 | 73 | ||
421 | list_del_init(&waiter->list); | 74 | list_del_init(&waiter->list); |
@@ -424,24 +77,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | |||
424 | 77 | ||
425 | void debug_mutex_unlock(struct mutex *lock) | 78 | void debug_mutex_unlock(struct mutex *lock) |
426 | { | 79 | { |
427 | DEBUG_WARN_ON(lock->magic != lock); | 80 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); |
428 | DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 81 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
429 | DEBUG_WARN_ON(lock->owner != current_thread_info()); | 82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
430 | if (debug_mutex_on) { | 83 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); |
431 | DEBUG_WARN_ON(list_empty(&lock->held_list)); | ||
432 | list_del_init(&lock->held_list); | ||
433 | } | ||
434 | } | 84 | } |
435 | 85 | ||
436 | void debug_mutex_init(struct mutex *lock, const char *name) | 86 | void debug_mutex_init(struct mutex *lock, const char *name, |
87 | struct lock_class_key *key) | ||
437 | { | 88 | { |
89 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
438 | /* | 90 | /* |
439 | * Make sure we are not reinitializing a held lock: | 91 | * Make sure we are not reinitializing a held lock: |
440 | */ | 92 | */ |
441 | mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | 93 | debug_check_no_locks_freed((void *)lock, sizeof(*lock)); |
94 | lockdep_init_map(&lock->dep_map, name, key); | ||
95 | #endif | ||
442 | lock->owner = NULL; | 96 | lock->owner = NULL; |
443 | INIT_LIST_HEAD(&lock->held_list); | ||
444 | lock->name = name; | ||
445 | lock->magic = lock; | 97 | lock->magic = lock; |
446 | } | 98 | } |
447 | 99 | ||
@@ -455,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name) | |||
455 | */ | 107 | */ |
456 | void fastcall mutex_destroy(struct mutex *lock) | 108 | void fastcall mutex_destroy(struct mutex *lock) |
457 | { | 109 | { |
458 | DEBUG_WARN_ON(mutex_is_locked(lock)); | 110 | DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); |
459 | lock->magic = NULL; | 111 | lock->magic = NULL; |
460 | } | 112 | } |
461 | 113 | ||
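The lock_class_key argument threaded into debug_mutex_init() above is supplied by the mutex_init() wrapper in the mutex headers: each initialization site gets its own static key, and the address of that key is what lockdep later uses to group every mutex initialized there into a single lock class. A minimal sketch of that pattern, assuming the wrapper looks roughly like the one in this tree (the exact macro is not shown in this hunk):

        #define mutex_init(mutex)                                       \
        do {                                                            \
                /* one static key per call site = one lock class */     \
                static struct lock_class_key __key;                     \
                                                                        \
                __mutex_init((mutex), #mutex, &__key);                  \
        } while (0)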
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index fd384050acb1..babfbdfc534b 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h | |||
@@ -10,125 +10,44 @@ | |||
10 | * More details are in kernel/mutex-debug.c. | 10 | * More details are in kernel/mutex-debug.c. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | extern spinlock_t debug_mutex_lock; | ||
14 | extern struct list_head debug_mutex_held_locks; | ||
15 | extern int debug_mutex_on; | ||
16 | |||
17 | /* | ||
18 | * In the debug case we carry the caller's instruction pointer into | ||
19 | * other functions, but we dont want the function argument overhead | ||
20 | * in the nondebug case - hence these macros: | ||
21 | */ | ||
22 | #define __IP_DECL__ , unsigned long ip | ||
23 | #define __IP__ , ip | ||
24 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
25 | |||
26 | /* | 13 | /* |
27 | * This must be called with lock->wait_lock held. | 14 | * This must be called with lock->wait_lock held. |
28 | */ | 15 | */ |
29 | extern void debug_mutex_set_owner(struct mutex *lock, | 16 | extern void |
30 | struct thread_info *new_owner __IP_DECL__); | 17 | debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner); |
31 | 18 | ||
32 | static inline void debug_mutex_clear_owner(struct mutex *lock) | 19 | static inline void debug_mutex_clear_owner(struct mutex *lock) |
33 | { | 20 | { |
34 | lock->owner = NULL; | 21 | lock->owner = NULL; |
35 | } | 22 | } |
36 | 23 | ||
37 | extern void debug_mutex_init_waiter(struct mutex_waiter *waiter); | 24 | extern void debug_mutex_lock_common(struct mutex *lock, |
25 | struct mutex_waiter *waiter); | ||
38 | extern void debug_mutex_wake_waiter(struct mutex *lock, | 26 | extern void debug_mutex_wake_waiter(struct mutex *lock, |
39 | struct mutex_waiter *waiter); | 27 | struct mutex_waiter *waiter); |
40 | extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); | 28 | extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); |
41 | extern void debug_mutex_add_waiter(struct mutex *lock, | 29 | extern void debug_mutex_add_waiter(struct mutex *lock, |
42 | struct mutex_waiter *waiter, | 30 | struct mutex_waiter *waiter, |
43 | struct thread_info *ti __IP_DECL__); | 31 | struct thread_info *ti); |
44 | extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 32 | extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
45 | struct thread_info *ti); | 33 | struct thread_info *ti); |
46 | extern void debug_mutex_unlock(struct mutex *lock); | 34 | extern void debug_mutex_unlock(struct mutex *lock); |
47 | extern void debug_mutex_init(struct mutex *lock, const char *name); | 35 | extern void debug_mutex_init(struct mutex *lock, const char *name, |
48 | 36 | struct lock_class_key *key); | |
49 | #define debug_spin_lock(lock) \ | ||
50 | do { \ | ||
51 | local_irq_disable(); \ | ||
52 | if (debug_mutex_on) \ | ||
53 | spin_lock(lock); \ | ||
54 | } while (0) | ||
55 | 37 | ||
56 | #define debug_spin_unlock(lock) \ | 38 | #define spin_lock_mutex(lock, flags) \ |
57 | do { \ | ||
58 | if (debug_mutex_on) \ | ||
59 | spin_unlock(lock); \ | ||
60 | local_irq_enable(); \ | ||
61 | preempt_check_resched(); \ | ||
62 | } while (0) | ||
63 | |||
64 | #define debug_spin_lock_save(lock, flags) \ | ||
65 | do { \ | 39 | do { \ |
40 | struct mutex *l = container_of(lock, struct mutex, wait_lock); \ | ||
41 | \ | ||
42 | DEBUG_LOCKS_WARN_ON(in_interrupt()); \ | ||
66 | local_irq_save(flags); \ | 43 | local_irq_save(flags); \ |
67 | if (debug_mutex_on) \ | 44 | __raw_spin_lock(&(lock)->raw_lock); \ |
68 | spin_lock(lock); \ | 45 | DEBUG_LOCKS_WARN_ON(l->magic != l); \ |
69 | } while (0) | 46 | } while (0) |
70 | 47 | ||
71 | #define debug_spin_lock_restore(lock, flags) \ | 48 | #define spin_unlock_mutex(lock, flags) \ |
72 | do { \ | 49 | do { \ |
73 | if (debug_mutex_on) \ | 50 | __raw_spin_unlock(&(lock)->raw_lock); \ |
74 | spin_unlock(lock); \ | ||
75 | local_irq_restore(flags); \ | 51 | local_irq_restore(flags); \ |
76 | preempt_check_resched(); \ | 52 | preempt_check_resched(); \ |
77 | } while (0) | 53 | } while (0) |
78 | |||
79 | #define spin_lock_mutex(lock) \ | ||
80 | do { \ | ||
81 | struct mutex *l = container_of(lock, struct mutex, wait_lock); \ | ||
82 | \ | ||
83 | DEBUG_WARN_ON(in_interrupt()); \ | ||
84 | debug_spin_lock(&debug_mutex_lock); \ | ||
85 | spin_lock(lock); \ | ||
86 | DEBUG_WARN_ON(l->magic != l); \ | ||
87 | } while (0) | ||
88 | |||
89 | #define spin_unlock_mutex(lock) \ | ||
90 | do { \ | ||
91 | spin_unlock(lock); \ | ||
92 | debug_spin_unlock(&debug_mutex_lock); \ | ||
93 | } while (0) | ||
94 | |||
95 | #define DEBUG_OFF() \ | ||
96 | do { \ | ||
97 | if (debug_mutex_on) { \ | ||
98 | debug_mutex_on = 0; \ | ||
99 | console_verbose(); \ | ||
100 | if (spin_is_locked(&debug_mutex_lock)) \ | ||
101 | spin_unlock(&debug_mutex_lock); \ | ||
102 | } \ | ||
103 | } while (0) | ||
104 | |||
105 | #define DEBUG_BUG() \ | ||
106 | do { \ | ||
107 | if (debug_mutex_on) { \ | ||
108 | DEBUG_OFF(); \ | ||
109 | BUG(); \ | ||
110 | } \ | ||
111 | } while (0) | ||
112 | |||
113 | #define DEBUG_WARN_ON(c) \ | ||
114 | do { \ | ||
115 | if (unlikely(c && debug_mutex_on)) { \ | ||
116 | DEBUG_OFF(); \ | ||
117 | WARN_ON(1); \ | ||
118 | } \ | ||
119 | } while (0) | ||
120 | |||
121 | # define DEBUG_BUG_ON(c) \ | ||
122 | do { \ | ||
123 | if (unlikely(c)) \ | ||
124 | DEBUG_BUG(); \ | ||
125 | } while (0) | ||
126 | |||
127 | #ifdef CONFIG_SMP | ||
128 | # define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c) | ||
129 | # define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c) | ||
130 | #else | ||
131 | # define SMP_DEBUG_WARN_ON(c) do { } while (0) | ||
132 | # define SMP_DEBUG_BUG_ON(c) do { } while (0) | ||
133 | #endif | ||
134 | |||
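The DEBUG_OFF()/DEBUG_WARN_ON() machinery deleted above is superseded by the generic lock-debugging helpers behind <linux/debug_locks.h>, which kernel/mutex.c starts including just below. The idea is the same, but centralized: a single global flag gates all lock debugging and is cleared on the first failed assertion, so one corrupted lock does not trigger a cascade of follow-up warnings. A behavioural sketch only, not the exact implementation (bad_condition stands for whatever invariant is being checked):

        if (unlikely(bad_condition)) {
                /* debug_locks_off() returns nonzero only when it actually
                 * turns lock debugging off, so the splat is printed once */
                if (debug_locks_off())
                        WARN_ON(1);
        }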
diff --git a/kernel/mutex.c b/kernel/mutex.c index 5449b210d9ed..8c71cf72a497 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/spinlock.h> | 18 | #include <linux/spinlock.h> |
19 | #include <linux/interrupt.h> | 19 | #include <linux/interrupt.h> |
20 | #include <linux/debug_locks.h> | ||
20 | 21 | ||
21 | /* | 22 | /* |
22 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, | 23 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, |
@@ -38,13 +39,14 @@ | |||
38 | * | 39 | * |
39 | * It is not allowed to initialize an already locked mutex. | 40 | * It is not allowed to initialize an already locked mutex. |
40 | */ | 41 | */ |
41 | void fastcall __mutex_init(struct mutex *lock, const char *name) | 42 | void |
43 | __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) | ||
42 | { | 44 | { |
43 | atomic_set(&lock->count, 1); | 45 | atomic_set(&lock->count, 1); |
44 | spin_lock_init(&lock->wait_lock); | 46 | spin_lock_init(&lock->wait_lock); |
45 | INIT_LIST_HEAD(&lock->wait_list); | 47 | INIT_LIST_HEAD(&lock->wait_list); |
46 | 48 | ||
47 | debug_mutex_init(lock, name); | 49 | debug_mutex_init(lock, name, key); |
48 | } | 50 | } |
49 | 51 | ||
50 | EXPORT_SYMBOL(__mutex_init); | 52 | EXPORT_SYMBOL(__mutex_init); |
@@ -56,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init); | |||
56 | * branch is predicted by the CPU as default-untaken. | 58 | * branch is predicted by the CPU as default-untaken. |
57 | */ | 59 | */ |
58 | static void fastcall noinline __sched | 60 | static void fastcall noinline __sched |
59 | __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); | 61 | __mutex_lock_slowpath(atomic_t *lock_count); |
60 | 62 | ||
61 | /*** | 63 | /*** |
62 | * mutex_lock - acquire the mutex | 64 | * mutex_lock - acquire the mutex |
@@ -79,7 +81,7 @@ __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); | |||
79 | * | 81 | * |
80 | * This function is similar to (but not equivalent to) down(). | 82 | * This function is similar to (but not equivalent to) down(). |
81 | */ | 83 | */ |
82 | void fastcall __sched mutex_lock(struct mutex *lock) | 84 | void inline fastcall __sched mutex_lock(struct mutex *lock) |
83 | { | 85 | { |
84 | might_sleep(); | 86 | might_sleep(); |
85 | /* | 87 | /* |
@@ -92,7 +94,7 @@ void fastcall __sched mutex_lock(struct mutex *lock) | |||
92 | EXPORT_SYMBOL(mutex_lock); | 94 | EXPORT_SYMBOL(mutex_lock); |
93 | 95 | ||
94 | static void fastcall noinline __sched | 96 | static void fastcall noinline __sched |
95 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__); | 97 | __mutex_unlock_slowpath(atomic_t *lock_count); |
96 | 98 | ||
97 | /*** | 99 | /*** |
98 | * mutex_unlock - release the mutex | 100 | * mutex_unlock - release the mutex |
@@ -120,17 +122,18 @@ EXPORT_SYMBOL(mutex_unlock); | |||
120 | * Lock a mutex (possibly interruptible), slowpath: | 122 | * Lock a mutex (possibly interruptible), slowpath: |
121 | */ | 123 | */ |
122 | static inline int __sched | 124 | static inline int __sched |
123 | __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | 125 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) |
124 | { | 126 | { |
125 | struct task_struct *task = current; | 127 | struct task_struct *task = current; |
126 | struct mutex_waiter waiter; | 128 | struct mutex_waiter waiter; |
127 | unsigned int old_val; | 129 | unsigned int old_val; |
130 | unsigned long flags; | ||
128 | 131 | ||
129 | debug_mutex_init_waiter(&waiter); | 132 | spin_lock_mutex(&lock->wait_lock, flags); |
130 | 133 | ||
131 | spin_lock_mutex(&lock->wait_lock); | 134 | debug_mutex_lock_common(lock, &waiter); |
132 | 135 | mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | |
133 | debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); | 136 | debug_mutex_add_waiter(lock, &waiter, task->thread_info); |
134 | 137 | ||
135 | /* add waiting tasks to the end of the waitqueue (FIFO): */ | 138 | /* add waiting tasks to the end of the waitqueue (FIFO): */ |
136 | list_add_tail(&waiter.list, &lock->wait_list); | 139 | list_add_tail(&waiter.list, &lock->wait_list); |
@@ -157,7 +160,8 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
157 | if (unlikely(state == TASK_INTERRUPTIBLE && | 160 | if (unlikely(state == TASK_INTERRUPTIBLE && |
158 | signal_pending(task))) { | 161 | signal_pending(task))) { |
159 | mutex_remove_waiter(lock, &waiter, task->thread_info); | 162 | mutex_remove_waiter(lock, &waiter, task->thread_info); |
160 | spin_unlock_mutex(&lock->wait_lock); | 163 | mutex_release(&lock->dep_map, 1, _RET_IP_); |
164 | spin_unlock_mutex(&lock->wait_lock, flags); | ||
161 | 165 | ||
162 | debug_mutex_free_waiter(&waiter); | 166 | debug_mutex_free_waiter(&waiter); |
163 | return -EINTR; | 167 | return -EINTR; |
@@ -165,48 +169,57 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
165 | __set_task_state(task, state); | 169 | __set_task_state(task, state); |
166 | 170 | ||
167 | /* didn't get the lock, go to sleep: */ | 171 | /* didn't get the lock, go to sleep: */ |
168 | spin_unlock_mutex(&lock->wait_lock); | 172 | spin_unlock_mutex(&lock->wait_lock, flags); |
169 | schedule(); | 173 | schedule(); |
170 | spin_lock_mutex(&lock->wait_lock); | 174 | spin_lock_mutex(&lock->wait_lock, flags); |
171 | } | 175 | } |
172 | 176 | ||
173 | /* got the lock - rejoice! */ | 177 | /* got the lock - rejoice! */ |
174 | mutex_remove_waiter(lock, &waiter, task->thread_info); | 178 | mutex_remove_waiter(lock, &waiter, task->thread_info); |
175 | debug_mutex_set_owner(lock, task->thread_info __IP__); | 179 | debug_mutex_set_owner(lock, task->thread_info); |
176 | 180 | ||
177 | /* set it to 0 if there are no waiters left: */ | 181 | /* set it to 0 if there are no waiters left: */ |
178 | if (likely(list_empty(&lock->wait_list))) | 182 | if (likely(list_empty(&lock->wait_list))) |
179 | atomic_set(&lock->count, 0); | 183 | atomic_set(&lock->count, 0); |
180 | 184 | ||
181 | spin_unlock_mutex(&lock->wait_lock); | 185 | spin_unlock_mutex(&lock->wait_lock, flags); |
182 | 186 | ||
183 | debug_mutex_free_waiter(&waiter); | 187 | debug_mutex_free_waiter(&waiter); |
184 | 188 | ||
185 | DEBUG_WARN_ON(list_empty(&lock->held_list)); | ||
186 | DEBUG_WARN_ON(lock->owner != task->thread_info); | ||
187 | |||
188 | return 0; | 189 | return 0; |
189 | } | 190 | } |
190 | 191 | ||
191 | static void fastcall noinline __sched | 192 | static void fastcall noinline __sched |
192 | __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__) | 193 | __mutex_lock_slowpath(atomic_t *lock_count) |
193 | { | 194 | { |
194 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 195 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
195 | 196 | ||
196 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__); | 197 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); |
198 | } | ||
199 | |||
200 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
201 | void __sched | ||
202 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) | ||
203 | { | ||
204 | might_sleep(); | ||
205 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); | ||
197 | } | 206 | } |
198 | 207 | ||
208 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | ||
209 | #endif | ||
210 | |||
199 | /* | 211 | /* |
200 | * Release the lock, slowpath: | 212 | * Release the lock, slowpath: |
201 | */ | 213 | */ |
202 | static fastcall noinline void | 214 | static fastcall inline void |
203 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | 215 | __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) |
204 | { | 216 | { |
205 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 217 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
218 | unsigned long flags; | ||
206 | 219 | ||
207 | DEBUG_WARN_ON(lock->owner != current_thread_info()); | 220 | spin_lock_mutex(&lock->wait_lock, flags); |
208 | 221 | mutex_release(&lock->dep_map, nested, _RET_IP_); | |
209 | spin_lock_mutex(&lock->wait_lock); | 222 | debug_mutex_unlock(lock); |
210 | 223 | ||
211 | /* | 224 | /* |
212 | * some architectures leave the lock unlocked in the fastpath failure | 225 | * some architectures leave the lock unlocked in the fastpath failure |
@@ -216,8 +229,6 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | |||
216 | if (__mutex_slowpath_needs_to_unlock()) | 229 | if (__mutex_slowpath_needs_to_unlock()) |
217 | atomic_set(&lock->count, 1); | 230 | atomic_set(&lock->count, 1); |
218 | 231 | ||
219 | debug_mutex_unlock(lock); | ||
220 | |||
221 | if (!list_empty(&lock->wait_list)) { | 232 | if (!list_empty(&lock->wait_list)) { |
222 | /* get the first entry from the wait-list: */ | 233 | /* get the first entry from the wait-list: */ |
223 | struct mutex_waiter *waiter = | 234 | struct mutex_waiter *waiter = |
@@ -231,7 +242,16 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | |||
231 | 242 | ||
232 | debug_mutex_clear_owner(lock); | 243 | debug_mutex_clear_owner(lock); |
233 | 244 | ||
234 | spin_unlock_mutex(&lock->wait_lock); | 245 | spin_unlock_mutex(&lock->wait_lock, flags); |
246 | } | ||
247 | |||
248 | /* | ||
249 | * Release the lock, slowpath: | ||
250 | */ | ||
251 | static fastcall noinline void | ||
252 | __mutex_unlock_slowpath(atomic_t *lock_count) | ||
253 | { | ||
254 | __mutex_unlock_common_slowpath(lock_count, 1); | ||
235 | } | 255 | } |
236 | 256 | ||
237 | /* | 257 | /* |
@@ -239,7 +259,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | |||
239 | * mutex_lock_interruptible() and mutex_trylock(). | 259 | * mutex_lock_interruptible() and mutex_trylock(). |
240 | */ | 260 | */ |
241 | static int fastcall noinline __sched | 261 | static int fastcall noinline __sched |
242 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__); | 262 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); |
243 | 263 | ||
244 | /*** | 264 | /*** |
245 | * mutex_lock_interruptible - acquire the mutex, interruptible | 265 | * mutex_lock_interruptible - acquire the mutex, interruptible |
@@ -262,11 +282,11 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock) | |||
262 | EXPORT_SYMBOL(mutex_lock_interruptible); | 282 | EXPORT_SYMBOL(mutex_lock_interruptible); |
263 | 283 | ||
264 | static int fastcall noinline __sched | 284 | static int fastcall noinline __sched |
265 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) | 285 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count) |
266 | { | 286 | { |
267 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 287 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
268 | 288 | ||
269 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__); | 289 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); |
270 | } | 290 | } |
271 | 291 | ||
272 | /* | 292 | /* |
@@ -276,18 +296,21 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) | |||
276 | static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | 296 | static inline int __mutex_trylock_slowpath(atomic_t *lock_count) |
277 | { | 297 | { |
278 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 298 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
299 | unsigned long flags; | ||
279 | int prev; | 300 | int prev; |
280 | 301 | ||
281 | spin_lock_mutex(&lock->wait_lock); | 302 | spin_lock_mutex(&lock->wait_lock, flags); |
282 | 303 | ||
283 | prev = atomic_xchg(&lock->count, -1); | 304 | prev = atomic_xchg(&lock->count, -1); |
284 | if (likely(prev == 1)) | 305 | if (likely(prev == 1)) { |
285 | debug_mutex_set_owner(lock, current_thread_info() __RET_IP__); | 306 | debug_mutex_set_owner(lock, current_thread_info()); |
307 | mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); | ||
308 | } | ||
286 | /* Set it back to 0 if there are no waiters: */ | 309 | /* Set it back to 0 if there are no waiters: */ |
287 | if (likely(list_empty(&lock->wait_list))) | 310 | if (likely(list_empty(&lock->wait_list))) |
288 | atomic_set(&lock->count, 0); | 311 | atomic_set(&lock->count, 0); |
289 | 312 | ||
290 | spin_unlock_mutex(&lock->wait_lock); | 313 | spin_unlock_mutex(&lock->wait_lock, flags); |
291 | 314 | ||
292 | return prev == 1; | 315 | return prev == 1; |
293 | } | 316 | } |
@@ -306,7 +329,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | |||
306 | * This function must not be used in interrupt context. The | 329 | * This function must not be used in interrupt context. The |
307 | * mutex must be released by the same task that acquired it. | 330 | * mutex must be released by the same task that acquired it. |
308 | */ | 331 | */ |
309 | int fastcall mutex_trylock(struct mutex *lock) | 332 | int fastcall __sched mutex_trylock(struct mutex *lock) |
310 | { | 333 | { |
311 | return __mutex_fastpath_trylock(&lock->count, | 334 | return __mutex_fastpath_trylock(&lock->count, |
312 | __mutex_trylock_slowpath); | 335 | __mutex_trylock_slowpath); |
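mutex_lock_nested(), added above under CONFIG_DEBUG_LOCK_ALLOC, lets code that legitimately holds two mutexes of the same lock class annotate the second acquisition with a subclass instead of tripping lockdep's recursive-locking check. A hypothetical usage sketch (struct foo and lock_parent_and_child() are made up for illustration; SINGLE_DEPTH_NESTING is the conventional subclass for one level of nesting):

        struct foo {
                struct mutex lock;
                /* ... */
        };

        static void lock_parent_and_child(struct foo *parent, struct foo *child)
        {
                mutex_lock(&parent->lock);
                /* same lock class, but a distinct lockdep subclass */
                mutex_lock_nested(&child->lock, SINGLE_DEPTH_NESTING);
        }

Without CONFIG_DEBUG_LOCK_ALLOC the nested variant is expected to collapse to a plain mutex_lock(), so the annotation carries no cost in production builds.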
diff --git a/kernel/mutex.h b/kernel/mutex.h index 00fe84e7b672..a075dafbb290 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h | |||
@@ -9,27 +9,22 @@ | |||
9 | * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: | 9 | * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #define spin_lock_mutex(lock) spin_lock(lock) | 12 | #define spin_lock_mutex(lock, flags) \ |
13 | #define spin_unlock_mutex(lock) spin_unlock(lock) | 13 | do { spin_lock(lock); (void)(flags); } while (0) |
14 | #define spin_unlock_mutex(lock, flags) \ | ||
15 | do { spin_unlock(lock); (void)(flags); } while (0) | ||
14 | #define mutex_remove_waiter(lock, waiter, ti) \ | 16 | #define mutex_remove_waiter(lock, waiter, ti) \ |
15 | __list_del((waiter)->list.prev, (waiter)->list.next) | 17 | __list_del((waiter)->list.prev, (waiter)->list.next) |
16 | 18 | ||
17 | #define DEBUG_WARN_ON(c) do { } while (0) | ||
18 | #define debug_mutex_set_owner(lock, new_owner) do { } while (0) | 19 | #define debug_mutex_set_owner(lock, new_owner) do { } while (0) |
19 | #define debug_mutex_clear_owner(lock) do { } while (0) | 20 | #define debug_mutex_clear_owner(lock) do { } while (0) |
20 | #define debug_mutex_init_waiter(waiter) do { } while (0) | ||
21 | #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) | 21 | #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) |
22 | #define debug_mutex_free_waiter(waiter) do { } while (0) | 22 | #define debug_mutex_free_waiter(waiter) do { } while (0) |
23 | #define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0) | 23 | #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) |
24 | #define debug_mutex_unlock(lock) do { } while (0) | 24 | #define debug_mutex_unlock(lock) do { } while (0) |
25 | #define debug_mutex_init(lock, name) do { } while (0) | 25 | #define debug_mutex_init(lock, name, key) do { } while (0) |
26 | |||
27 | /* | ||
28 | * Return-address parameters/declarations. They are very useful for | ||
29 | * debugging, but add overhead in the !DEBUG case - so we go the | ||
30 | * trouble of using this not too elegant but zero-cost solution: | ||
31 | */ | ||
32 | #define __IP_DECL__ | ||
33 | #define __IP__ | ||
34 | #define __RET_IP__ | ||
35 | 26 | ||
27 | static inline void | ||
28 | debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) | ||
29 | { | ||
30 | } | ||
diff --git a/kernel/panic.c b/kernel/panic.c index cc2a4c9c36ac..d8a0bca21233 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -8,7 +8,6 @@ | |||
8 | * This function is used through-out the kernel (including mm and fs) | 8 | * This function is used through-out the kernel (including mm and fs) |
9 | * to indicate a major problem. | 9 | * to indicate a major problem. |
10 | */ | 10 | */ |
11 | #include <linux/config.h> | ||
12 | #include <linux/module.h> | 11 | #include <linux/module.h> |
13 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
14 | #include <linux/delay.h> | 13 | #include <linux/delay.h> |
@@ -173,6 +172,7 @@ const char *print_tainted(void) | |||
173 | 172 | ||
174 | void add_taint(unsigned flag) | 173 | void add_taint(unsigned flag) |
175 | { | 174 | { |
175 | debug_locks_off(); /* can't trust the integrity of the kernel anymore */ | ||
176 | tainted |= flag; | 176 | tainted |= flag; |
177 | } | 177 | } |
178 | EXPORT_SYMBOL(add_taint); | 178 | EXPORT_SYMBOL(add_taint); |
@@ -257,6 +257,7 @@ int oops_may_print(void) | |||
257 | */ | 257 | */ |
258 | void oops_enter(void) | 258 | void oops_enter(void) |
259 | { | 259 | { |
260 | debug_locks_off(); /* can't trust the integrity of the kernel anymore */ | ||
260 | do_oops_enter_exit(); | 261 | do_oops_enter_exit(); |
261 | } | 262 | } |
262 | 263 | ||
diff --git a/kernel/params.c b/kernel/params.c index af43ecdc8d9b..91aea7aa532e 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -15,7 +15,6 @@ | |||
15 | along with this program; if not, write to the Free Software | 15 | along with this program; if not, write to the Free Software |
16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
17 | */ | 17 | */ |
18 | #include <linux/config.h> | ||
19 | #include <linux/moduleparam.h> | 18 | #include <linux/moduleparam.h> |
20 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
21 | #include <linux/string.h> | 20 | #include <linux/string.h> |
diff --git a/kernel/pid.c b/kernel/pid.c index eeb836b65ca4..93e212f20671 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -218,7 +218,7 @@ struct pid * fastcall find_pid(int nr) | |||
218 | return NULL; | 218 | return NULL; |
219 | } | 219 | } |
220 | 220 | ||
221 | int fastcall attach_pid(task_t *task, enum pid_type type, int nr) | 221 | int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) |
222 | { | 222 | { |
223 | struct pid_link *link; | 223 | struct pid_link *link; |
224 | struct pid *pid; | 224 | struct pid *pid; |
@@ -233,7 +233,7 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) | |||
233 | return 0; | 233 | return 0; |
234 | } | 234 | } |
235 | 235 | ||
236 | void fastcall detach_pid(task_t *task, enum pid_type type) | 236 | void fastcall detach_pid(struct task_struct *task, enum pid_type type) |
237 | { | 237 | { |
238 | struct pid_link *link; | 238 | struct pid_link *link; |
239 | struct pid *pid; | 239 | struct pid *pid; |
@@ -267,7 +267,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) | |||
267 | /* | 267 | /* |
268 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. | 268 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. |
269 | */ | 269 | */ |
270 | task_t *find_task_by_pid_type(int type, int nr) | 270 | struct task_struct *find_task_by_pid_type(int type, int nr) |
271 | { | 271 | { |
272 | return pid_task(find_pid(nr), type); | 272 | return pid_task(find_pid(nr), type); |
273 | } | 273 | } |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 520f6c59948d..d38d9ec3276c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -555,9 +555,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) | |||
555 | struct cpu_timer_list *next; | 555 | struct cpu_timer_list *next; |
556 | unsigned long i; | 556 | unsigned long i; |
557 | 557 | ||
558 | if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING)) | ||
559 | return; | ||
560 | |||
561 | head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? | 558 | head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? |
562 | p->cpu_timers : p->signal->cpu_timers); | 559 | p->cpu_timers : p->signal->cpu_timers); |
563 | head += CPUCLOCK_WHICH(timer->it_clock); | 560 | head += CPUCLOCK_WHICH(timer->it_clock); |
@@ -1173,6 +1170,9 @@ static void check_process_timers(struct task_struct *tsk, | |||
1173 | } | 1170 | } |
1174 | t = tsk; | 1171 | t = tsk; |
1175 | do { | 1172 | do { |
1173 | if (unlikely(t->flags & PF_EXITING)) | ||
1174 | continue; | ||
1175 | |||
1176 | ticks = cputime_add(cputime_add(t->utime, t->stime), | 1176 | ticks = cputime_add(cputime_add(t->utime, t->stime), |
1177 | prof_left); | 1177 | prof_left); |
1178 | if (!cputime_eq(prof_expires, cputime_zero) && | 1178 | if (!cputime_eq(prof_expires, cputime_zero) && |
@@ -1193,11 +1193,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1193 | t->it_sched_expires > sched)) { | 1193 | t->it_sched_expires > sched)) { |
1194 | t->it_sched_expires = sched; | 1194 | t->it_sched_expires = sched; |
1195 | } | 1195 | } |
1196 | 1196 | } while ((t = next_thread(t)) != tsk); | |
1197 | do { | ||
1198 | t = next_thread(t); | ||
1199 | } while (unlikely(t->flags & PF_EXITING)); | ||
1200 | } while (t != tsk); | ||
1201 | } | 1197 | } |
1202 | } | 1198 | } |
1203 | 1199 | ||
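The rewritten walk above filters exiting threads at the top of the loop body instead of skipping them while advancing. Note that continue inside a do/while jumps to the controlling while test, so the iteration still advances through next_thread() and still terminates when it wraps back to tsk. A generic sketch of the shape (account() is a made-up stand-in for the per-thread bookkeeping in the hunk):

        t = tsk;
        do {
                if (unlikely(t->flags & PF_EXITING))
                        continue;       /* falls through to the loop test */
                account(t);             /* hypothetical per-thread work */
        } while ((t = next_thread(t)) != tsk);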
@@ -1289,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1289 | 1285 | ||
1290 | #undef UNEXPIRED | 1286 | #undef UNEXPIRED |
1291 | 1287 | ||
1292 | BUG_ON(tsk->exit_state); | ||
1293 | |||
1294 | /* | 1288 | /* |
1295 | * Double-check with locks held. | 1289 | * Double-check with locks held. |
1296 | */ | 1290 | */ |
1297 | read_lock(&tasklist_lock); | 1291 | read_lock(&tasklist_lock); |
1298 | spin_lock(&tsk->sighand->siglock); | 1292 | if (likely(tsk->signal != NULL)) { |
1293 | spin_lock(&tsk->sighand->siglock); | ||
1299 | 1294 | ||
1300 | /* | 1295 | /* |
1301 | * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] | 1296 | * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] |
1302 | * all the timers that are firing, and put them on the firing list. | 1297 | * all the timers that are firing, and put them on the firing list. |
1303 | */ | 1298 | */ |
1304 | check_thread_timers(tsk, &firing); | 1299 | check_thread_timers(tsk, &firing); |
1305 | check_process_timers(tsk, &firing); | 1300 | check_process_timers(tsk, &firing); |
1306 | 1301 | ||
1307 | /* | 1302 | /* |
1308 | * We must release these locks before taking any timer's lock. | 1303 | * We must release these locks before taking any timer's lock. |
1309 | * There is a potential race with timer deletion here, as the | 1304 | * There is a potential race with timer deletion here, as the |
1310 | * siglock now protects our private firing list. We have set | 1305 | * siglock now protects our private firing list. We have set |
1311 | * the firing flag in each timer, so that a deletion attempt | 1306 | * the firing flag in each timer, so that a deletion attempt |
1312 | * that gets the timer lock before we do will give it up and | 1307 | * that gets the timer lock before we do will give it up and |
1313 | * spin until we've taken care of that timer below. | 1308 | * spin until we've taken care of that timer below. |
1314 | */ | 1309 | */ |
1315 | spin_unlock(&tsk->sighand->siglock); | 1310 | spin_unlock(&tsk->sighand->siglock); |
1311 | } | ||
1316 | read_unlock(&tasklist_lock); | 1312 | read_unlock(&tasklist_lock); |
1317 | 1313 | ||
1318 | /* | 1314 | /* |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ce0dfb8f4a4e..ae44a70aae8a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -36,6 +36,24 @@ config PM_DEBUG | |||
36 | code. This is helpful when debugging and reporting various PM bugs, | 36 | code. This is helpful when debugging and reporting various PM bugs, |
37 | like suspend support. | 37 | like suspend support. |
38 | 38 | ||
39 | config PM_TRACE | ||
40 | bool "Suspend/resume event tracing" | ||
41 | depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL | ||
42 | default n | ||
43 | ---help--- | ||
44 | This enables some cheesy code to save the last PM event point in the | ||
45 | RTC across reboots, so that you can debug a machine that just hangs | ||
46 | during suspend (or more commonly, during resume). | ||
47 | |||
48 | To use this debugging feature you should attempt to suspend the machine, | ||
49 | then reboot it, then run | ||
50 | |||
51 | dmesg -s 1000000 | grep 'hash matches' | ||
52 | |||
53 | CAUTION: this option will cause your machine's real-time clock to be | ||
54 | set to an invalid time after a resume. | ||
55 | |||
56 | |||
39 | config SOFTWARE_SUSPEND | 57 | config SOFTWARE_SUSPEND |
40 | bool "Software Suspend" | 58 | bool "Software Suspend" |
41 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) | 59 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) |
@@ -82,18 +100,6 @@ config PM_STD_PARTITION | |||
82 | suspended image to. It will simply pick the first available swap | 100 | suspended image to. It will simply pick the first available swap |
83 | device. | 101 | device. |
84 | 102 | ||
85 | config SWSUSP_ENCRYPT | ||
86 | bool "Encrypt suspend image" | ||
87 | depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y) | ||
88 | default "" | ||
89 | ---help--- | ||
90 | To prevent data gathering from swap after resume you can encrypt | ||
91 | the suspend image with a temporary key that is deleted on | ||
92 | resume. | ||
93 | |||
94 | Note that the temporary key is stored unencrypted on disk while the | ||
95 | system is suspended. | ||
96 | |||
97 | config SUSPEND_SMP | 103 | config SUSPEND_SMP |
98 | bool | 104 | bool |
99 | depends on HOTPLUG_CPU && X86 && PM | 105 | depends on HOTPLUG_CPU && X86 && PM |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 81d4d982f3f0..e13e74067845 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -231,7 +231,7 @@ static int software_resume(void) | |||
231 | late_initcall(software_resume); | 231 | late_initcall(software_resume); |
232 | 232 | ||
233 | 233 | ||
234 | static char * pm_disk_modes[] = { | 234 | static const char * const pm_disk_modes[] = { |
235 | [PM_DISK_FIRMWARE] = "firmware", | 235 | [PM_DISK_FIRMWARE] = "firmware", |
236 | [PM_DISK_PLATFORM] = "platform", | 236 | [PM_DISK_PLATFORM] = "platform", |
237 | [PM_DISK_SHUTDOWN] = "shutdown", | 237 | [PM_DISK_SHUTDOWN] = "shutdown", |
diff --git a/kernel/power/main.c b/kernel/power/main.c index a6d9ef46009e..6d295c776794 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -15,7 +15,7 @@ | |||
15 | #include <linux/errno.h> | 15 | #include <linux/errno.h> |
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
17 | #include <linux/pm.h> | 17 | #include <linux/pm.h> |
18 | 18 | #include <linux/console.h> | |
19 | 19 | ||
20 | #include "power.h" | 20 | #include "power.h" |
21 | 21 | ||
@@ -86,6 +86,7 @@ static int suspend_prepare(suspend_state_t state) | |||
86 | goto Thaw; | 86 | goto Thaw; |
87 | } | 87 | } |
88 | 88 | ||
89 | suspend_console(); | ||
89 | if ((error = device_suspend(PMSG_SUSPEND))) { | 90 | if ((error = device_suspend(PMSG_SUSPEND))) { |
90 | printk(KERN_ERR "Some devices failed to suspend\n"); | 91 | printk(KERN_ERR "Some devices failed to suspend\n"); |
91 | goto Finish; | 92 | goto Finish; |
@@ -133,6 +134,7 @@ int suspend_enter(suspend_state_t state) | |||
133 | static void suspend_finish(suspend_state_t state) | 134 | static void suspend_finish(suspend_state_t state) |
134 | { | 135 | { |
135 | device_resume(); | 136 | device_resume(); |
137 | resume_console(); | ||
136 | thaw_processes(); | 138 | thaw_processes(); |
137 | enable_nonboot_cpus(); | 139 | enable_nonboot_cpus(); |
138 | if (pm_ops && pm_ops->finish) | 140 | if (pm_ops && pm_ops->finish) |
@@ -143,7 +145,7 @@ static void suspend_finish(suspend_state_t state) | |||
143 | 145 | ||
144 | 146 | ||
145 | 147 | ||
146 | static char *pm_states[PM_SUSPEND_MAX] = { | 148 | static const char * const pm_states[PM_SUSPEND_MAX] = { |
147 | [PM_SUSPEND_STANDBY] = "standby", | 149 | [PM_SUSPEND_STANDBY] = "standby", |
148 | [PM_SUSPEND_MEM] = "mem", | 150 | [PM_SUSPEND_MEM] = "mem", |
149 | #ifdef CONFIG_SOFTWARE_SUSPEND | 151 | #ifdef CONFIG_SOFTWARE_SUSPEND |
@@ -260,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) | |||
260 | static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) | 262 | static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) |
261 | { | 263 | { |
262 | suspend_state_t state = PM_SUSPEND_STANDBY; | 264 | suspend_state_t state = PM_SUSPEND_STANDBY; |
263 | char ** s; | 265 | const char * const *s; |
264 | char *p; | 266 | char *p; |
265 | int error; | 267 | int error; |
266 | int len; | 268 | int len; |
diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 84063ac8fcfc..c50d15266c10 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c | |||
@@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type, | |||
75 | return dev; | 75 | return dev; |
76 | } | 76 | } |
77 | 77 | ||
78 | static void __pm_unregister(struct pm_dev *dev) | ||
79 | { | ||
80 | if (dev) { | ||
81 | list_del(&dev->entry); | ||
82 | kfree(dev); | ||
83 | } | ||
84 | } | ||
85 | |||
86 | /** | ||
87 | * pm_unregister_all - unregister all devices with matching callback | ||
88 | * @callback: callback function pointer | ||
89 | * | ||
90 | * Unregister every device that would call the callback passed. This | ||
91 | * is primarily meant as a helper function for loadable modules. It | ||
92 | * enables a module to give up all its managed devices without keeping | ||
93 | * its own private list. | ||
94 | */ | ||
95 | |||
96 | void pm_unregister_all(pm_callback callback) | ||
97 | { | ||
98 | struct list_head *entry; | ||
99 | |||
100 | if (!callback) | ||
101 | return; | ||
102 | |||
103 | mutex_lock(&pm_devs_lock); | ||
104 | entry = pm_devs.next; | ||
105 | while (entry != &pm_devs) { | ||
106 | struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); | ||
107 | entry = entry->next; | ||
108 | if (dev->callback == callback) | ||
109 | __pm_unregister(dev); | ||
110 | } | ||
111 | mutex_unlock(&pm_devs_lock); | ||
112 | } | ||
113 | |||
114 | /** | 78 | /** |
115 | * pm_send - send request to a single device | 79 | * pm_send - send request to a single device |
116 | * @dev: device to send to | 80 | * @dev: device to send to |
@@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data) | |||
239 | } | 203 | } |
240 | 204 | ||
241 | EXPORT_SYMBOL(pm_register); | 205 | EXPORT_SYMBOL(pm_register); |
242 | EXPORT_SYMBOL(pm_unregister_all); | ||
243 | EXPORT_SYMBOL(pm_send_all); | 206 | EXPORT_SYMBOL(pm_send_all); |
244 | EXPORT_SYMBOL(pm_active); | 207 | EXPORT_SYMBOL(pm_active); |
245 | 208 | ||
diff --git a/kernel/power/power.h b/kernel/power/power.h index f06f12f21767..57a792982fb9 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -55,7 +55,7 @@ struct snapshot_handle { | |||
55 | unsigned int page; | 55 | unsigned int page; |
56 | unsigned int page_offset; | 56 | unsigned int page_offset; |
57 | unsigned int prev; | 57 | unsigned int prev; |
58 | struct pbe *pbe; | 58 | struct pbe *pbe, *last_pbe; |
59 | void *buffer; | 59 | void *buffer; |
60 | unsigned int buf_offset; | 60 | unsigned int buf_offset; |
61 | }; | 61 | }; |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3eeedbb13b78..75d4886e648e 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -150,6 +150,10 @@ int restore_highmem(void) | |||
150 | } | 150 | } |
151 | return 0; | 151 | return 0; |
152 | } | 152 | } |
153 | #else | ||
154 | static inline unsigned int count_highmem_pages(void) {return 0;} | ||
155 | static inline int save_highmem(void) {return 0;} | ||
156 | static inline int restore_highmem(void) {return 0;} | ||
153 | #endif | 157 | #endif |
154 | 158 | ||
155 | static int pfn_is_nosave(unsigned long pfn) | 159 | static int pfn_is_nosave(unsigned long pfn) |
@@ -223,11 +227,17 @@ static void copy_data_pages(struct pbe *pblist) | |||
223 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | 227 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { |
224 | if (saveable(zone, &zone_pfn)) { | 228 | if (saveable(zone, &zone_pfn)) { |
225 | struct page *page; | 229 | struct page *page; |
230 | long *src, *dst; | ||
231 | int n; | ||
232 | |||
226 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | 233 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); |
227 | BUG_ON(!pbe); | 234 | BUG_ON(!pbe); |
228 | pbe->orig_address = (unsigned long)page_address(page); | 235 | pbe->orig_address = (unsigned long)page_address(page); |
229 | /* copy_page is not usable for copying task structs. */ | 236 | /* copy_page and memcpy are not usable for copying task structs. */ |
230 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | 237 | dst = (long *)pbe->address; |
238 | src = (long *)pbe->orig_address; | ||
239 | for (n = PAGE_SIZE / sizeof(long); n; n--) | ||
240 | *dst++ = *src++; | ||
231 | pbe = pbe->next; | 241 | pbe = pbe->next; |
232 | } | 242 | } |
233 | } | 243 | } |
@@ -293,62 +303,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) | |||
293 | } | 303 | } |
294 | } | 304 | } |
295 | 305 | ||
296 | /** | 306 | static unsigned int unsafe_pages; |
297 | * On resume it is necessary to trace and eventually free the unsafe | ||
298 | * pages that have been allocated, because they are needed for I/O | ||
299 | * (on x86-64 we likely will "eat" these pages once again while | ||
300 | * creating the temporary page translation tables) | ||
301 | */ | ||
302 | |||
303 | struct eaten_page { | ||
304 | struct eaten_page *next; | ||
305 | char padding[PAGE_SIZE - sizeof(void *)]; | ||
306 | }; | ||
307 | |||
308 | static struct eaten_page *eaten_pages = NULL; | ||
309 | |||
310 | static void release_eaten_pages(void) | ||
311 | { | ||
312 | struct eaten_page *p, *q; | ||
313 | |||
314 | p = eaten_pages; | ||
315 | while (p) { | ||
316 | q = p->next; | ||
317 | /* We don't want swsusp_free() to free this page again */ | ||
318 | ClearPageNosave(virt_to_page(p)); | ||
319 | free_page((unsigned long)p); | ||
320 | p = q; | ||
321 | } | ||
322 | eaten_pages = NULL; | ||
323 | } | ||
324 | 307 | ||
325 | /** | 308 | /** |
326 | * @safe_needed - on resume, for storing the PBE list and the image, | 309 | * @safe_needed - on resume, for storing the PBE list and the image, |
327 | * we can only use memory pages that do not conflict with the pages | 310 | * we can only use memory pages that do not conflict with the pages |
328 | * which had been used before suspend. | 311 | * used before suspend. |
329 | * | 312 | * |
330 | * The unsafe pages are marked with the PG_nosave_free flag | 313 | * The unsafe pages are marked with the PG_nosave_free flag |
331 | * | 314 | * and we count them using unsafe_pages |
332 | * Allocated but unusable (ie eaten) memory pages should be marked | ||
333 | * so that swsusp_free() can release them | ||
334 | */ | 315 | */ |
335 | 316 | ||
336 | static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | 317 | static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) |
337 | { | 318 | { |
338 | void *res; | 319 | void *res; |
339 | 320 | ||
321 | res = (void *)get_zeroed_page(gfp_mask); | ||
340 | if (safe_needed) | 322 | if (safe_needed) |
341 | do { | 323 | while (res && PageNosaveFree(virt_to_page(res))) { |
324 | /* The page is unsafe, mark it for swsusp_free() */ | ||
325 | SetPageNosave(virt_to_page(res)); | ||
326 | unsafe_pages++; | ||
342 | res = (void *)get_zeroed_page(gfp_mask); | 327 | res = (void *)get_zeroed_page(gfp_mask); |
343 | if (res && PageNosaveFree(virt_to_page(res))) { | 328 | } |
344 | /* This is for swsusp_free() */ | ||
345 | SetPageNosave(virt_to_page(res)); | ||
346 | ((struct eaten_page *)res)->next = eaten_pages; | ||
347 | eaten_pages = res; | ||
348 | } | ||
349 | } while (res && PageNosaveFree(virt_to_page(res))); | ||
350 | else | ||
351 | res = (void *)get_zeroed_page(gfp_mask); | ||
352 | if (res) { | 329 | if (res) { |
353 | SetPageNosave(virt_to_page(res)); | 330 | SetPageNosave(virt_to_page(res)); |
354 | SetPageNosaveFree(virt_to_page(res)); | 331 | SetPageNosaveFree(virt_to_page(res)); |
@@ -374,7 +351,8 @@ unsigned long get_safe_page(gfp_t gfp_mask) | |||
374 | * On each page we set up a list of struct_pbe elements. | 351 | * On each page we set up a list of struct_pbe elements. |
375 | */ | 352 | */ |
376 | 353 | ||
377 | struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) | 354 | static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, |
355 | int safe_needed) | ||
378 | { | 356 | { |
379 | unsigned int num; | 357 | unsigned int num; |
380 | struct pbe *pblist, *pbe; | 358 | struct pbe *pblist, *pbe; |
@@ -642,6 +620,8 @@ static int mark_unsafe_pages(struct pbe *pblist) | |||
642 | return -EFAULT; | 620 | return -EFAULT; |
643 | } | 621 | } |
644 | 622 | ||
623 | unsafe_pages = 0; | ||
624 | |||
645 | return 0; | 625 | return 0; |
646 | } | 626 | } |
647 | 627 | ||
@@ -719,42 +699,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf, | |||
719 | } | 699 | } |
720 | 700 | ||
721 | /** | 701 | /** |
722 | * create_image - use metadata contained in the PBE list | 702 | * prepare_image - use metadata contained in the PBE list |
723 | * pointed to by pagedir_nosave to mark the pages that will | 703 | * pointed to by pagedir_nosave to mark the pages that will |
724 | * be overwritten in the process of restoring the system | 704 | * be overwritten in the process of restoring the system |
725 | * memory state from the image and allocate memory for | 705 | * memory state from the image ("unsafe" pages) and allocate |
726 | * the image avoiding these pages | 706 | * memory for the image |
707 | * | ||
708 | * The idea is to allocate the PBE list first and then | ||
709 | * allocate as many pages as are needed for the image data, | ||
710 | * but not to assign these pages to the PBEs initially. | ||
711 | * Instead, we just mark them as allocated and create a list | ||
712 | * of "safe" which will be used later | ||
727 | */ | 713 | */ |
728 | 714 | ||
729 | static int create_image(struct snapshot_handle *handle) | 715 | struct safe_page { |
716 | struct safe_page *next; | ||
717 | char padding[PAGE_SIZE - sizeof(void *)]; | ||
718 | }; | ||
719 | |||
720 | static struct safe_page *safe_pages; | ||
721 | |||
722 | static int prepare_image(struct snapshot_handle *handle) | ||
730 | { | 723 | { |
731 | int error = 0; | 724 | int error = 0; |
732 | struct pbe *p, *pblist; | 725 | unsigned int nr_pages = nr_copy_pages; |
726 | struct pbe *p, *pblist = NULL; | ||
733 | 727 | ||
734 | p = pagedir_nosave; | 728 | p = pagedir_nosave; |
735 | error = mark_unsafe_pages(p); | 729 | error = mark_unsafe_pages(p); |
736 | if (!error) { | 730 | if (!error) { |
737 | pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); | 731 | pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); |
738 | if (pblist) | 732 | if (pblist) |
739 | copy_page_backup_list(pblist, p); | 733 | copy_page_backup_list(pblist, p); |
740 | free_pagedir(p, 0); | 734 | free_pagedir(p, 0); |
741 | if (!pblist) | 735 | if (!pblist) |
742 | error = -ENOMEM; | 736 | error = -ENOMEM; |
743 | } | 737 | } |
744 | if (!error) | 738 | safe_pages = NULL; |
745 | error = alloc_data_pages(pblist, GFP_ATOMIC, 1); | 739 | if (!error && nr_pages > unsafe_pages) { |
740 | nr_pages -= unsafe_pages; | ||
741 | while (nr_pages--) { | ||
742 | struct safe_page *ptr; | ||
743 | |||
744 | ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); | ||
745 | if (!ptr) { | ||
746 | error = -ENOMEM; | ||
747 | break; | ||
748 | } | ||
749 | if (!PageNosaveFree(virt_to_page(ptr))) { | ||
750 | /* The page is "safe", add it to the list */ | ||
751 | ptr->next = safe_pages; | ||
752 | safe_pages = ptr; | ||
753 | } | ||
754 | /* Mark the page as allocated */ | ||
755 | SetPageNosave(virt_to_page(ptr)); | ||
756 | SetPageNosaveFree(virt_to_page(ptr)); | ||
757 | } | ||
758 | } | ||
746 | if (!error) { | 759 | if (!error) { |
747 | release_eaten_pages(); | ||
748 | pagedir_nosave = pblist; | 760 | pagedir_nosave = pblist; |
749 | } else { | 761 | } else { |
750 | pagedir_nosave = NULL; | ||
751 | handle->pbe = NULL; | 762 | handle->pbe = NULL; |
752 | nr_copy_pages = 0; | 763 | swsusp_free(); |
753 | nr_meta_pages = 0; | ||
754 | } | 764 | } |
755 | return error; | 765 | return error; |
756 | } | 766 | } |
757 | 767 | ||
768 | static void *get_buffer(struct snapshot_handle *handle) | ||
769 | { | ||
770 | struct pbe *pbe = handle->pbe, *last = handle->last_pbe; | ||
771 | struct page *page = virt_to_page(pbe->orig_address); | ||
772 | |||
773 | if (PageNosave(page) && PageNosaveFree(page)) { | ||
774 | /* | ||
775 | * We have allocated the "original" page frame and we can | ||
776 | * use it directly to store the read page | ||
777 | */ | ||
778 | pbe->address = 0; | ||
779 | if (last && last->next) | ||
780 | last->next = NULL; | ||
781 | return (void *)pbe->orig_address; | ||
782 | } | ||
783 | /* | ||
784 | * The "original" page frame has not been allocated and we have to | ||
785 | * use a "safe" page frame to store the read page | ||
786 | */ | ||
787 | pbe->address = (unsigned long)safe_pages; | ||
788 | safe_pages = safe_pages->next; | ||
789 | if (last) | ||
790 | last->next = pbe; | ||
791 | handle->last_pbe = pbe; | ||
792 | return (void *)pbe->address; | ||
793 | } | ||
794 | |||
758 | /** | 795 | /** |
759 | * snapshot_write_next - used for writing the system memory snapshot. | 796 | * snapshot_write_next - used for writing the system memory snapshot. |
760 | * | 797 | * |
@@ -799,15 +836,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) | |||
799 | } else if (handle->prev <= nr_meta_pages) { | 836 | } else if (handle->prev <= nr_meta_pages) { |
800 | handle->pbe = unpack_orig_addresses(buffer, handle->pbe); | 837 | handle->pbe = unpack_orig_addresses(buffer, handle->pbe); |
801 | if (!handle->pbe) { | 838 | if (!handle->pbe) { |
802 | error = create_image(handle); | 839 | error = prepare_image(handle); |
803 | if (error) | 840 | if (error) |
804 | return error; | 841 | return error; |
805 | handle->pbe = pagedir_nosave; | 842 | handle->pbe = pagedir_nosave; |
806 | handle->buffer = (void *)handle->pbe->address; | 843 | handle->last_pbe = NULL; |
844 | handle->buffer = get_buffer(handle); | ||
807 | } | 845 | } |
808 | } else { | 846 | } else { |
809 | handle->pbe = handle->pbe->next; | 847 | handle->pbe = handle->pbe->next; |
810 | handle->buffer = (void *)handle->pbe->address; | 848 | handle->buffer = get_buffer(handle); |
811 | } | 849 | } |
812 | handle->prev = handle->page; | 850 | handle->prev = handle->page; |
813 | } | 851 | } |
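The prepare_image()/get_buffer() pair above replaces create_image(): rather than allocating one data page per PBE up front, resume now pre-allocates a pool of "safe" page frames (frames that do not collide with the image being restored) and hands them out lazily as snapshot_write_next() walks the PBE list; a page whose original frame is already safe is simply read straight into place. A condensed sketch of that per-page decision, using the names from the hunk above (dst is a local stand-in for the returned buffer):

        if (PageNosave(page) && PageNosaveFree(page)) {
                /* original frame already allocated: restore in place */
                dst = (void *)pbe->orig_address;
        } else {
                /* borrow the next safe frame; relocate during resume */
                dst = safe_pages;
                safe_pages = safe_pages->next;
                pbe->address = (unsigned long)dst;
        }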
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 044b8e0c1025..f1dd146bd64d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -263,7 +263,6 @@ int swsusp_write(void) | |||
263 | struct swap_map_handle handle; | 263 | struct swap_map_handle handle; |
264 | struct snapshot_handle snapshot; | 264 | struct snapshot_handle snapshot; |
265 | struct swsusp_info *header; | 265 | struct swsusp_info *header; |
266 | unsigned long start; | ||
267 | int error; | 266 | int error; |
268 | 267 | ||
269 | if ((error = swsusp_swap_check())) { | 268 | if ((error = swsusp_swap_check())) { |
@@ -281,16 +280,17 @@ int swsusp_write(void) | |||
281 | } | 280 | } |
282 | error = get_swap_writer(&handle); | 281 | error = get_swap_writer(&handle); |
283 | if (!error) { | 282 | if (!error) { |
284 | start = handle.cur_swap; | 283 | unsigned long start = handle.cur_swap; |
285 | error = swap_write_page(&handle, header); | 284 | error = swap_write_page(&handle, header); |
286 | } | 285 | if (!error) |
287 | if (!error) | 286 | error = save_image(&handle, &snapshot, |
288 | error = save_image(&handle, &snapshot, header->pages - 1); | 287 | header->pages - 1); |
289 | if (!error) { | 288 | if (!error) { |
290 | flush_swap_writer(&handle); | 289 | flush_swap_writer(&handle); |
291 | printk("S"); | 290 | printk("S"); |
292 | error = mark_swapfiles(swp_entry(root_swap, start)); | 291 | error = mark_swapfiles(swp_entry(root_swap, start)); |
293 | printk("|\n"); | 292 | printk("|\n"); |
293 | } | ||
294 | } | 294 | } |
295 | if (error) | 295 | if (error) |
296 | free_all_swap_pages(root_swap, handle.bitmap); | 296 | free_all_swap_pages(root_swap, handle.bitmap); |
@@ -311,8 +311,10 @@ static atomic_t io_done = ATOMIC_INIT(0); | |||
311 | 311 | ||
312 | static int end_io(struct bio *bio, unsigned int num, int err) | 312 | static int end_io(struct bio *bio, unsigned int num, int err) |
313 | { | 313 | { |
314 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | 314 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { |
315 | panic("I/O error reading memory image"); | 315 | printk(KERN_ERR "I/O error reading swsusp image.\n"); |
316 | return -EIO; | ||
317 | } | ||
316 | atomic_set(&io_done, 0); | 318 | atomic_set(&io_done, 0); |
317 | return 0; | 319 | return 0; |
318 | } | 320 | } |
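In the swap.c hunk above, swsusp_write() now scopes start to the branch where get_swap_writer() succeeded and chains every later stage behind an if (!error) test, so a failure in any stage skips the rest and falls through to the shared cleanup, while end_io() reports a read error instead of panicking. The control-flow shape is sketched below with made-up step_*() helpers standing in for the real calls.

#include <stdio.h>

/* Hypothetical stages standing in for get_swap_writer(), swap_write_page(),
 * save_image() and mark_swapfiles(); one of them fails on purpose. */
static int step_open(void)  { return 0; }
static int step_write(void) { return 0; }
static int step_save(void)  { return -5; }	/* simulated -EIO */
static int step_mark(void)  { return 0; }

int main(void)
{
	int error = step_open();

	if (!error) {
		error = step_write();
		if (!error)
			error = step_save();
		if (!error)
			error = step_mark();
	}
	if (error)
		printf("stage failed (%d), running common cleanup\n", error);
	return error ? 1 : 0;
}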
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c4016cbbd3e0..17f669c83012 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void); | |||
67 | int save_highmem(void); | 67 | int save_highmem(void); |
68 | int restore_highmem(void); | 68 | int restore_highmem(void); |
69 | #else | 69 | #else |
70 | static int save_highmem(void) { return 0; } | 70 | static inline int save_highmem(void) { return 0; } |
71 | static int restore_highmem(void) { return 0; } | 71 | static inline int restore_highmem(void) { return 0; } |
72 | static unsigned int count_highmem_pages(void) { return 0; } | 72 | static inline unsigned int count_highmem_pages(void) { return 0; } |
73 | #endif | 73 | #endif |
74 | 74 | ||
75 | /** | 75 | /** |
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) | |||
175 | */ | 175 | */ |
176 | 176 | ||
177 | #define SHRINK_BITE 10000 | 177 | #define SHRINK_BITE 10000 |
178 | static inline unsigned long __shrink_memory(long tmp) | ||
179 | { | ||
180 | if (tmp > SHRINK_BITE) | ||
181 | tmp = SHRINK_BITE; | ||
182 | return shrink_all_memory(tmp); | ||
183 | } | ||
178 | 184 | ||
179 | int swsusp_shrink_memory(void) | 185 | int swsusp_shrink_memory(void) |
180 | { | 186 | { |
@@ -192,15 +198,17 @@ int swsusp_shrink_memory(void) | |||
192 | PAGES_FOR_IO; | 198 | PAGES_FOR_IO; |
193 | tmp = size; | 199 | tmp = size; |
194 | for_each_zone (zone) | 200 | for_each_zone (zone) |
195 | if (!is_highmem(zone)) | 201 | if (!is_highmem(zone) && populated_zone(zone)) { |
196 | tmp -= zone->free_pages; | 202 | tmp -= zone->free_pages; |
203 | tmp += zone->lowmem_reserve[ZONE_NORMAL]; | ||
204 | } | ||
197 | if (tmp > 0) { | 205 | if (tmp > 0) { |
198 | tmp = shrink_all_memory(SHRINK_BITE); | 206 | tmp = __shrink_memory(tmp); |
199 | if (!tmp) | 207 | if (!tmp) |
200 | return -ENOMEM; | 208 | return -ENOMEM; |
201 | pages += tmp; | 209 | pages += tmp; |
202 | } else if (size > image_size / PAGE_SIZE) { | 210 | } else if (size > image_size / PAGE_SIZE) { |
203 | tmp = shrink_all_memory(SHRINK_BITE); | 211 | tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); |
204 | pages += tmp; | 212 | pages += tmp; |
205 | } | 213 | } |
206 | printk("\b%c", p[i++%4]); | 214 | printk("\b%c", p[i++%4]); |
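In the swsusp.c hunk above, __shrink_memory() caps every request handed to shrink_all_memory() at SHRINK_BITE pages, and the second branch now asks only for the pages in excess of the target image size instead of a fixed bite. The clamping loop is easy to see in isolation; in the stand-alone sketch below, shrink_some() is a made-up stand-in for shrink_all_memory().

#include <stdio.h>

#define SHRINK_BITE 10000L

/* Stand-in for shrink_all_memory(): pretend every request succeeds. */
static long shrink_some(long nr)
{
	return nr;
}

/* Mirror of __shrink_memory(): never ask for more than one bite. */
static long shrink_clamped(long want)
{
	if (want > SHRINK_BITE)
		want = SHRINK_BITE;
	return shrink_some(want);
}

int main(void)
{
	long excess = 23456;	/* pages above the target image size */
	long freed = 0;

	while (excess > 0) {
		long got = shrink_clamped(excess);

		if (!got)
			break;		/* nothing reclaimable left */
		freed += got;
		excess -= got;
	}
	printf("reclaimed %ld pages in bites of at most %ld\n",
	       freed, SHRINK_BITE);
	return 0;
}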
diff --git a/kernel/printk.c b/kernel/printk.c index c056f3324432..65ca0688f86f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -24,8 +24,8 @@ | |||
24 | #include <linux/console.h> | 24 | #include <linux/console.h> |
25 | #include <linux/init.h> | 25 | #include <linux/init.h> |
26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
27 | #include <linux/moduleparam.h> | ||
27 | #include <linux/interrupt.h> /* For in_interrupt() */ | 28 | #include <linux/interrupt.h> /* For in_interrupt() */ |
28 | #include <linux/config.h> | ||
29 | #include <linux/delay.h> | 29 | #include <linux/delay.h> |
30 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
@@ -52,7 +52,7 @@ int console_printk[4] = { | |||
52 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | 52 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ |
53 | }; | 53 | }; |
54 | 54 | ||
55 | EXPORT_SYMBOL(console_printk); | 55 | EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */ |
56 | 56 | ||
57 | /* | 57 | /* |
58 | * Low level drivers may need that to know if they can schedule in | 58 | * Low level drivers may need that to know if they can schedule in |
@@ -67,6 +67,7 @@ EXPORT_SYMBOL(oops_in_progress); | |||
67 | * driver system. | 67 | * driver system. |
68 | */ | 68 | */ |
69 | static DECLARE_MUTEX(console_sem); | 69 | static DECLARE_MUTEX(console_sem); |
70 | static DECLARE_MUTEX(secondary_console_sem); | ||
70 | struct console *console_drivers; | 71 | struct console *console_drivers; |
71 | /* | 72 | /* |
72 | * This is used for debugging the mess that is the VT code by | 73 | * This is used for debugging the mess that is the VT code by |
@@ -76,7 +77,7 @@ struct console *console_drivers; | |||
76 | * path in the console code where we end up in places I want | 77 | * path in the console code where we end up in places I want |
77 | * locked without the console semaphore held | 78 | * locked without the console semaphore held |
78 | */ | 79 | */ |
79 | static int console_locked; | 80 | static int console_locked, console_suspended; |
80 | 81 | ||
81 | /* | 82 | /* |
82 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars | 83 | * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars |
@@ -326,7 +327,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end) | |||
326 | struct console *con; | 327 | struct console *con; |
327 | 328 | ||
328 | for (con = console_drivers; con; con = con->next) { | 329 | for (con = console_drivers; con; con = con->next) { |
329 | if ((con->flags & CON_ENABLED) && con->write) | 330 | if ((con->flags & CON_ENABLED) && con->write && |
331 | (cpu_online(smp_processor_id()) || | ||
332 | (con->flags & CON_ANYTIME))) | ||
330 | con->write(con, &LOG_BUF(start), end - start); | 333 | con->write(con, &LOG_BUF(start), end - start); |
331 | } | 334 | } |
332 | } | 335 | } |
@@ -436,6 +439,7 @@ static int printk_time = 1; | |||
436 | #else | 439 | #else |
437 | static int printk_time = 0; | 440 | static int printk_time = 0; |
438 | #endif | 441 | #endif |
442 | module_param(printk_time, int, S_IRUGO | S_IWUSR); | ||
439 | 443 | ||
440 | static int __init printk_time_setup(char *str) | 444 | static int __init printk_time_setup(char *str) |
441 | { | 445 | { |
@@ -452,6 +456,18 @@ __attribute__((weak)) unsigned long long printk_clock(void) | |||
452 | return sched_clock(); | 456 | return sched_clock(); |
453 | } | 457 | } |
454 | 458 | ||
459 | /* Check if we have any console registered that can be called early in boot. */ | ||
460 | static int have_callable_console(void) | ||
461 | { | ||
462 | struct console *con; | ||
463 | |||
464 | for (con = console_drivers; con; con = con->next) | ||
465 | if (con->flags & CON_ANYTIME) | ||
466 | return 1; | ||
467 | |||
468 | return 0; | ||
469 | } | ||
470 | |||
455 | /** | 471 | /** |
456 | * printk - print a kernel message | 472 | * printk - print a kernel message |
457 | * @fmt: format string | 473 | * @fmt: format string |
@@ -502,7 +518,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
502 | zap_locks(); | 518 | zap_locks(); |
503 | 519 | ||
504 | /* This stops the holder of console_sem just where we want him */ | 520 | /* This stops the holder of console_sem just where we want him */ |
505 | spin_lock_irqsave(&logbuf_lock, flags); | 521 | local_irq_save(flags); |
522 | lockdep_off(); | ||
523 | spin_lock(&logbuf_lock); | ||
506 | printk_cpu = smp_processor_id(); | 524 | printk_cpu = smp_processor_id(); |
507 | 525 | ||
508 | /* Emit the output into the temporary buffer */ | 526 | /* Emit the output into the temporary buffer */ |
@@ -565,27 +583,31 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
565 | log_level_unknown = 1; | 583 | log_level_unknown = 1; |
566 | } | 584 | } |
567 | 585 | ||
568 | if (!cpu_online(smp_processor_id())) { | 586 | if (!down_trylock(&console_sem)) { |
569 | /* | 587 | /* |
570 | * Some console drivers may assume that per-cpu resources have | 588 | * We own the drivers. We can drop the spinlock and |
571 | * been allocated. So don't allow them to be called by this | 589 | * let release_console_sem() print the text, maybe ... |
572 | * CPU until it is officially up. We shouldn't be calling into | ||
573 | * random console drivers on a CPU which doesn't exist yet.. | ||
574 | */ | 590 | */ |
575 | printk_cpu = UINT_MAX; | ||
576 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
577 | goto out; | ||
578 | } | ||
579 | if (!down_trylock(&console_sem)) { | ||
580 | console_locked = 1; | 591 | console_locked = 1; |
592 | printk_cpu = UINT_MAX; | ||
593 | spin_unlock(&logbuf_lock); | ||
594 | |||
581 | /* | 595 | /* |
582 | * We own the drivers. We can drop the spinlock and let | 596 | * Console drivers may assume that per-cpu resources have |
583 | * release_console_sem() print the text | 597 | * been allocated. So unless they're explicitly marked as |
598 | * being able to cope (CON_ANYTIME) don't call them until | ||
599 | * this CPU is officially up. | ||
584 | */ | 600 | */ |
585 | printk_cpu = UINT_MAX; | 601 | if (cpu_online(smp_processor_id()) || have_callable_console()) { |
586 | spin_unlock_irqrestore(&logbuf_lock, flags); | 602 | console_may_schedule = 0; |
587 | console_may_schedule = 0; | 603 | release_console_sem(); |
588 | release_console_sem(); | 604 | } else { |
605 | /* Release by hand to avoid flushing the buffer. */ | ||
606 | console_locked = 0; | ||
607 | up(&console_sem); | ||
608 | } | ||
609 | lockdep_on(); | ||
610 | local_irq_restore(flags); | ||
589 | } else { | 611 | } else { |
590 | /* | 612 | /* |
591 | * Someone else owns the drivers. We drop the spinlock, which | 613 | * Someone else owns the drivers. We drop the spinlock, which |
@@ -593,9 +615,11 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
593 | * console drivers with the output which we just produced. | 615 | * console drivers with the output which we just produced. |
594 | */ | 616 | */ |
595 | printk_cpu = UINT_MAX; | 617 | printk_cpu = UINT_MAX; |
596 | spin_unlock_irqrestore(&logbuf_lock, flags); | 618 | spin_unlock(&logbuf_lock); |
619 | lockdep_on(); | ||
620 | local_irq_restore(flags); | ||
597 | } | 621 | } |
598 | out: | 622 | |
599 | preempt_enable(); | 623 | preempt_enable(); |
600 | return printed_len; | 624 | return printed_len; |
601 | } | 625 | } |
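With the vprintk() hunk above, output generated on a CPU that is not yet online is only pushed to consoles that set CON_ANYTIME (checked via have_callable_console()); otherwise the text just stays in the log buffer and console_sem is released by hand. A hedged sketch of how a driver would opt in follows; the console name and write routine are invented for illustration, and it assumes the CON_ANYTIME flag definition lands in <linux/console.h> as part of this series.

#include <linux/console.h>

/* Hypothetical early-boot-safe output routine: it must not rely on
 * per-cpu state that is only set up once the CPU is fully online. */
static void esafe_write(struct console *con, const char *s, unsigned n)
{
	/* e.g. poke a fixed MMIO UART, byte by byte */
}

static struct console esafe_console = {
	.name	= "esafe",
	.write	= esafe_write,
	/* CON_ANYTIME tells vprintk() this console may be called even
	 * from a CPU that is not (yet) marked online. */
	.flags	= CON_PRINTBUFFER | CON_ANYTIME,
	.index	= -1,
};

/* register_console(&esafe_console) would hook it up as usual. */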
@@ -698,6 +722,23 @@ int __init add_preferred_console(char *name, int idx, char *options) | |||
698 | } | 722 | } |
699 | 723 | ||
700 | /** | 724 | /** |
725 | * suspend_console - suspend the console subsystem | ||
726 | * | ||
727 | * This disables printk() while we go into suspend states | ||
728 | */ | ||
729 | void suspend_console(void) | ||
730 | { | ||
731 | acquire_console_sem(); | ||
732 | console_suspended = 1; | ||
733 | } | ||
734 | |||
735 | void resume_console(void) | ||
736 | { | ||
737 | console_suspended = 0; | ||
738 | release_console_sem(); | ||
739 | } | ||
740 | |||
741 | /** | ||
701 | * acquire_console_sem - lock the console system for exclusive use. | 742 | * acquire_console_sem - lock the console system for exclusive use. |
702 | * | 743 | * |
703 | * Acquires a semaphore which guarantees that the caller has | 744 | * Acquires a semaphore which guarantees that the caller has |
@@ -708,6 +749,10 @@ int __init add_preferred_console(char *name, int idx, char *options) | |||
708 | void acquire_console_sem(void) | 749 | void acquire_console_sem(void) |
709 | { | 750 | { |
710 | BUG_ON(in_interrupt()); | 751 | BUG_ON(in_interrupt()); |
752 | if (console_suspended) { | ||
753 | down(&secondary_console_sem); | ||
754 | return; | ||
755 | } | ||
711 | down(&console_sem); | 756 | down(&console_sem); |
712 | console_locked = 1; | 757 | console_locked = 1; |
713 | console_may_schedule = 1; | 758 | console_may_schedule = 1; |
@@ -728,7 +773,7 @@ int is_console_locked(void) | |||
728 | { | 773 | { |
729 | return console_locked; | 774 | return console_locked; |
730 | } | 775 | } |
731 | EXPORT_SYMBOL(is_console_locked); | 776 | EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */ |
732 | 777 | ||
733 | /** | 778 | /** |
734 | * release_console_sem - unlock the console system | 779 | * release_console_sem - unlock the console system |
@@ -750,6 +795,10 @@ void release_console_sem(void) | |||
750 | unsigned long _con_start, _log_end; | 795 | unsigned long _con_start, _log_end; |
751 | unsigned long wake_klogd = 0; | 796 | unsigned long wake_klogd = 0; |
752 | 797 | ||
798 | if (console_suspended) { | ||
799 | up(&secondary_console_sem); | ||
800 | return; | ||
801 | } | ||
753 | for ( ; ; ) { | 802 | for ( ; ; ) { |
754 | spin_lock_irqsave(&logbuf_lock, flags); | 803 | spin_lock_irqsave(&logbuf_lock, flags); |
755 | wake_klogd |= log_start - log_end; | 804 | wake_klogd |= log_start - log_end; |
@@ -766,8 +815,15 @@ void release_console_sem(void) | |||
766 | console_may_schedule = 0; | 815 | console_may_schedule = 0; |
767 | up(&console_sem); | 816 | up(&console_sem); |
768 | spin_unlock_irqrestore(&logbuf_lock, flags); | 817 | spin_unlock_irqrestore(&logbuf_lock, flags); |
769 | if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) | 818 | if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) { |
770 | wake_up_interruptible(&log_wait); | 819 | /* |
820 | * If we printk from within the lock dependency code, | ||
821 | * from within the scheduler code, then do not lock | ||
822 | * up due to self-recursion: | ||
823 | */ | ||
824 | if (!lockdep_internal()) | ||
825 | wake_up_interruptible(&log_wait); | ||
826 | } | ||
771 | } | 827 | } |
772 | EXPORT_SYMBOL(release_console_sem); | 828 | EXPORT_SYMBOL(release_console_sem); |
773 | 829 | ||
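The new suspend_console()/resume_console() pair above works by taking console_sem and setting console_suspended, so any later acquire_console_sem() caller queues on secondary_console_sem instead and no console output is attempted while devices are quiesced; resume_console() reverses that and flushes the buffered text. A minimal sketch of the intended bracketing in a suspend path is below; enter_suspend_state() is a placeholder, not a real kernel call, and the sketch assumes the declarations are exported via <linux/console.h>.

#include <linux/console.h>

/* Placeholder for the actual platform suspend sequence. */
static int enter_suspend_state(void)
{
	return 0;
}

static int do_suspend(void)
{
	int error;

	suspend_console();		/* stop printk from touching consoles */
	error = enter_suspend_state();	/* devices are down in here */
	resume_console();		/* replay whatever was buffered */
	return error;
}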
diff --git a/kernel/profile.c b/kernel/profile.c index 68afe121e507..d5bd75e7501c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -13,7 +13,6 @@ | |||
13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 | 13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/config.h> | ||
17 | #include <linux/module.h> | 16 | #include <linux/module.h> |
18 | #include <linux/profile.h> | 17 | #include <linux/profile.h> |
19 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
@@ -299,7 +298,7 @@ out: | |||
299 | } | 298 | } |
300 | 299 | ||
301 | #ifdef CONFIG_HOTPLUG_CPU | 300 | #ifdef CONFIG_HOTPLUG_CPU |
302 | static int profile_cpu_callback(struct notifier_block *info, | 301 | static int __devinit profile_cpu_callback(struct notifier_block *info, |
303 | unsigned long action, void *__cpu) | 302 | unsigned long action, void *__cpu) |
304 | { | 303 | { |
305 | int node, cpu = (unsigned long)__cpu; | 304 | int node, cpu = (unsigned long)__cpu; |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 921c22ad16e4..9a111f70145c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -28,7 +28,7 @@ | |||
28 | * | 28 | * |
29 | * Must be called with the tasklist lock write-held. | 29 | * Must be called with the tasklist lock write-held. |
30 | */ | 30 | */ |
31 | void __ptrace_link(task_t *child, task_t *new_parent) | 31 | void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) |
32 | { | 32 | { |
33 | BUG_ON(!list_empty(&child->ptrace_list)); | 33 | BUG_ON(!list_empty(&child->ptrace_list)); |
34 | if (child->parent == new_parent) | 34 | if (child->parent == new_parent) |
@@ -46,7 +46,7 @@ void __ptrace_link(task_t *child, task_t *new_parent) | |||
46 | * TASK_TRACED, resume it now. | 46 | * TASK_TRACED, resume it now. |
47 | * Requires that irqs be disabled. | 47 | * Requires that irqs be disabled. |
48 | */ | 48 | */ |
49 | void ptrace_untrace(task_t *child) | 49 | void ptrace_untrace(struct task_struct *child) |
50 | { | 50 | { |
51 | spin_lock(&child->sighand->siglock); | 51 | spin_lock(&child->sighand->siglock); |
52 | if (child->state == TASK_TRACED) { | 52 | if (child->state == TASK_TRACED) { |
@@ -65,7 +65,7 @@ void ptrace_untrace(task_t *child) | |||
65 | * | 65 | * |
66 | * Must be called with the tasklist lock write-held. | 66 | * Must be called with the tasklist lock write-held. |
67 | */ | 67 | */ |
68 | void __ptrace_unlink(task_t *child) | 68 | void __ptrace_unlink(struct task_struct *child) |
69 | { | 69 | { |
70 | BUG_ON(!child->ptrace); | 70 | BUG_ON(!child->ptrace); |
71 | 71 | ||
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
120 | 120 | ||
121 | static int may_attach(struct task_struct *task) | 121 | static int may_attach(struct task_struct *task) |
122 | { | 122 | { |
123 | if (!task->mm) | 123 | /* May we inspect the given task? |
124 | return -EPERM; | 124 | * This check is used both for attaching with ptrace |
125 | * and for allowing access to sensitive information in /proc. | ||
126 | * | ||
127 | * ptrace_attach denies several cases that /proc allows | ||
128 | * because setting up the necessary parent/child relationship | ||
129 | * or halting the specified task is impossible. | ||
130 | */ | ||
131 | int dumpable = 0; | ||
132 | /* Don't let security modules deny introspection */ | ||
133 | if (task == current) | ||
134 | return 0; | ||
125 | if (((current->uid != task->euid) || | 135 | if (((current->uid != task->euid) || |
126 | (current->uid != task->suid) || | 136 | (current->uid != task->suid) || |
127 | (current->uid != task->uid) || | 137 | (current->uid != task->uid) || |
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task) | |||
130 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) | 140 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) |
131 | return -EPERM; | 141 | return -EPERM; |
132 | smp_rmb(); | 142 | smp_rmb(); |
133 | if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) | 143 | if (task->mm) |
144 | dumpable = task->mm->dumpable; | ||
145 | if (!dumpable && !capable(CAP_SYS_PTRACE)) | ||
134 | return -EPERM; | 146 | return -EPERM; |
135 | 147 | ||
136 | return security_ptrace(current, task); | 148 | return security_ptrace(current, task); |
@@ -176,6 +188,8 @@ repeat: | |||
176 | goto repeat; | 188 | goto repeat; |
177 | } | 189 | } |
178 | 190 | ||
191 | if (!task->mm) | ||
192 | goto bad; | ||
179 | /* the same process cannot be attached many times */ | 193 | /* the same process cannot be attached many times */ |
180 | if (task->ptrace & PT_PTRACED) | 194 | if (task->ptrace & PT_PTRACED) |
181 | goto bad; | 195 | goto bad; |
@@ -200,7 +214,7 @@ out: | |||
200 | return retval; | 214 | return retval; |
201 | } | 215 | } |
202 | 216 | ||
203 | void __ptrace_detach(struct task_struct *child, unsigned int data) | 217 | static inline void __ptrace_detach(struct task_struct *child, unsigned int data) |
204 | { | 218 | { |
205 | child->exit_code = data; | 219 | child->exit_code = data; |
206 | /* .. re-parent .. */ | 220 | /* .. re-parent .. */ |
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
219 | ptrace_disable(child); | 233 | ptrace_disable(child); |
220 | 234 | ||
221 | write_lock_irq(&tasklist_lock); | 235 | write_lock_irq(&tasklist_lock); |
236 | /* protect against de_thread()->release_task() */ | ||
222 | if (child->ptrace) | 237 | if (child->ptrace) |
223 | __ptrace_detach(child, data); | 238 | __ptrace_detach(child, data); |
224 | write_unlock_irq(&tasklist_lock); | 239 | write_unlock_irq(&tasklist_lock); |
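In the may_attach() hunk above, a task without an mm (a kernel thread or a zombie) is now treated as simply non-dumpable rather than rejected outright, which lets the same check back both ptrace attach and /proc access; ptrace_attach() itself still refuses mm-less tasks later. The default-then-override shape of that test is sketched stand-alone below, with is_privileged(), task_has_mm() and task_dumpable() as illustrative stand-ins.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for capable(CAP_SYS_PTRACE), task->mm and
 * task->mm->dumpable. */
static bool is_privileged(void) { return false; }
static bool task_has_mm(void)   { return false; }
static bool task_dumpable(void) { return true;  }

static int may_inspect(void)
{
	int dumpable = 0;

	/* Only consult ->dumpable when the task still has an mm;
	 * otherwise it defaults to "not dumpable". */
	if (task_has_mm())
		dumpable = task_dumpable();
	if (!dumpable && !is_privileged())
		return -1;	/* -EPERM in the kernel */
	return 0;
}

int main(void)
{
	printf("may_inspect() = %d\n", may_inspect());
	return 0;
}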
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2058f88c7bbb..759805c9859a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -53,13 +53,13 @@ | |||
53 | static struct rcu_ctrlblk rcu_ctrlblk = { | 53 | static struct rcu_ctrlblk rcu_ctrlblk = { |
54 | .cur = -300, | 54 | .cur = -300, |
55 | .completed = -300, | 55 | .completed = -300, |
56 | .lock = SPIN_LOCK_UNLOCKED, | 56 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), |
57 | .cpumask = CPU_MASK_NONE, | 57 | .cpumask = CPU_MASK_NONE, |
58 | }; | 58 | }; |
59 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | 59 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { |
60 | .cur = -300, | 60 | .cur = -300, |
61 | .completed = -300, | 61 | .completed = -300, |
62 | .lock = SPIN_LOCK_UNLOCKED, | 62 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), |
63 | .cpumask = CPU_MASK_NONE, | 63 | .cpumask = CPU_MASK_NONE, |
64 | }; | 64 | }; |
65 | 65 | ||
@@ -182,6 +182,15 @@ long rcu_batches_completed(void) | |||
182 | return rcu_ctrlblk.completed; | 182 | return rcu_ctrlblk.completed; |
183 | } | 183 | } |
184 | 184 | ||
185 | /* | ||
186 | * Return the number of RCU batches processed thus far. Useful | ||
187 | * for debug and statistics. | ||
188 | */ | ||
189 | long rcu_batches_completed_bh(void) | ||
190 | { | ||
191 | return rcu_bh_ctrlblk.completed; | ||
192 | } | ||
193 | |||
185 | static void rcu_barrier_callback(struct rcu_head *notused) | 194 | static void rcu_barrier_callback(struct rcu_head *notused) |
186 | { | 195 | { |
187 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 196 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) | |||
539 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | 548 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); |
540 | } | 549 | } |
541 | 550 | ||
542 | static int rcu_cpu_notify(struct notifier_block *self, | 551 | static int __devinit rcu_cpu_notify(struct notifier_block *self, |
543 | unsigned long action, void *hcpu) | 552 | unsigned long action, void *hcpu) |
544 | { | 553 | { |
545 | long cpu = (long)hcpu; | 554 | long cpu = (long)hcpu; |
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
556 | return NOTIFY_OK; | 565 | return NOTIFY_OK; |
557 | } | 566 | } |
558 | 567 | ||
559 | static struct notifier_block rcu_nb = { | 568 | static struct notifier_block __devinitdata rcu_nb = { |
560 | .notifier_call = rcu_cpu_notify, | 569 | .notifier_call = rcu_cpu_notify, |
561 | }; | 570 | }; |
562 | 571 | ||
@@ -612,14 +621,6 @@ void synchronize_rcu(void) | |||
612 | wait_for_completion(&rcu.completion); | 621 | wait_for_completion(&rcu.completion); |
613 | } | 622 | } |
614 | 623 | ||
615 | /* | ||
616 | * Deprecated, use synchronize_rcu() or synchronize_sched() instead. | ||
617 | */ | ||
618 | void synchronize_kernel(void) | ||
619 | { | ||
620 | synchronize_rcu(); | ||
621 | } | ||
622 | |||
623 | module_param(blimit, int, 0); | 624 | module_param(blimit, int, 0); |
624 | module_param(qhimark, int, 0); | 625 | module_param(qhimark, int, 0); |
625 | module_param(qlowmark, int, 0); | 626 | module_param(qlowmark, int, 0); |
@@ -627,7 +628,7 @@ module_param(qlowmark, int, 0); | |||
627 | module_param(rsinterval, int, 0); | 628 | module_param(rsinterval, int, 0); |
628 | #endif | 629 | #endif |
629 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 630 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
630 | EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ | 631 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); |
631 | EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ | 632 | EXPORT_SYMBOL_GPL(call_rcu); |
633 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
632 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 634 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
633 | EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ | ||
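In the rcupdate.c hunk above, the static control blocks switch from the anonymous SPIN_LOCK_UNLOCKED initializer to __SPIN_LOCK_UNLOCKED(&name), which gives each statically allocated lock its own identity for the lock validator, and a _bh flavour of the completed-batch counter is exported for the reworked rcutorture. The same static-initializer pattern applied to an invented structure (my_ctrl is purely an example) would look roughly like this:

#include <linux/spinlock.h>

/* Example of giving a statically initialized, embedded spinlock its own
 * lockdep name instead of the anonymous SPIN_LOCK_UNLOCKED. */
static struct {
	spinlock_t lock;
	long counter;
} my_ctrl = {
	.lock    = __SPIN_LOCK_UNLOCKED(&my_ctrl.lock),
	.counter = 0,
};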
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 8154e7589d12..4d1c3d247127 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Read-Copy Update /proc-based torture test facility | 2 | * Read-Copy Update module-based torture test facility |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ | |||
53 | static int verbose; /* Print more debug info. */ | 53 | static int verbose; /* Print more debug info. */ |
54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | 54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ |
55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ | 55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ |
56 | static char *torture_type = "rcu"; /* What to torture. */ | ||
56 | 57 | ||
57 | module_param(nreaders, int, 0); | 58 | module_param(nreaders, int, 0); |
58 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 59 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0); | |||
64 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | 65 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); |
65 | module_param(shuffle_interval, int, 0); | 66 | module_param(shuffle_interval, int, 0); |
66 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | 67 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); |
67 | #define TORTURE_FLAG "rcutorture: " | 68 | module_param(torture_type, charp, 0); |
69 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); | ||
70 | |||
71 | #define TORTURE_FLAG "-torture:" | ||
68 | #define PRINTK_STRING(s) \ | 72 | #define PRINTK_STRING(s) \ |
69 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 73 | do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
70 | #define VERBOSE_PRINTK_STRING(s) \ | 74 | #define VERBOSE_PRINTK_STRING(s) \ |
71 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 75 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
72 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 76 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
73 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | 77 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) |
74 | 78 | ||
75 | static char printk_buf[4096]; | 79 | static char printk_buf[4096]; |
76 | 80 | ||
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p) | |||
139 | spin_unlock_bh(&rcu_torture_lock); | 143 | spin_unlock_bh(&rcu_torture_lock); |
140 | } | 144 | } |
141 | 145 | ||
142 | static void | ||
143 | rcu_torture_cb(struct rcu_head *p) | ||
144 | { | ||
145 | int i; | ||
146 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
147 | |||
148 | if (fullstop) { | ||
149 | /* Test is ending, just drop callbacks on the floor. */ | ||
150 | /* The next initialization will pick up the pieces. */ | ||
151 | return; | ||
152 | } | ||
153 | i = rp->rtort_pipe_count; | ||
154 | if (i > RCU_TORTURE_PIPE_LEN) | ||
155 | i = RCU_TORTURE_PIPE_LEN; | ||
156 | atomic_inc(&rcu_torture_wcount[i]); | ||
157 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
158 | rp->rtort_mbtest = 0; | ||
159 | rcu_torture_free(rp); | ||
160 | } else | ||
161 | call_rcu(p, rcu_torture_cb); | ||
162 | } | ||
163 | |||
164 | struct rcu_random_state { | 146 | struct rcu_random_state { |
165 | unsigned long rrs_state; | 147 | unsigned long rrs_state; |
166 | unsigned long rrs_count; | 148 | unsigned long rrs_count; |
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp) | |||
191 | } | 173 | } |
192 | 174 | ||
193 | /* | 175 | /* |
176 | * Operations vector for selecting different types of tests. | ||
177 | */ | ||
178 | |||
179 | struct rcu_torture_ops { | ||
180 | void (*init)(void); | ||
181 | void (*cleanup)(void); | ||
182 | int (*readlock)(void); | ||
183 | void (*readunlock)(int idx); | ||
184 | int (*completed)(void); | ||
185 | void (*deferredfree)(struct rcu_torture *p); | ||
186 | int (*stats)(char *page); | ||
187 | char *name; | ||
188 | }; | ||
189 | static struct rcu_torture_ops *cur_ops = NULL; | ||
190 | |||
191 | /* | ||
192 | * Definitions for rcu torture testing. | ||
193 | */ | ||
194 | |||
195 | static int rcu_torture_read_lock(void) | ||
196 | { | ||
197 | rcu_read_lock(); | ||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void rcu_torture_read_unlock(int idx) | ||
202 | { | ||
203 | rcu_read_unlock(); | ||
204 | } | ||
205 | |||
206 | static int rcu_torture_completed(void) | ||
207 | { | ||
208 | return rcu_batches_completed(); | ||
209 | } | ||
210 | |||
211 | static void | ||
212 | rcu_torture_cb(struct rcu_head *p) | ||
213 | { | ||
214 | int i; | ||
215 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
216 | |||
217 | if (fullstop) { | ||
218 | /* Test is ending, just drop callbacks on the floor. */ | ||
219 | /* The next initialization will pick up the pieces. */ | ||
220 | return; | ||
221 | } | ||
222 | i = rp->rtort_pipe_count; | ||
223 | if (i > RCU_TORTURE_PIPE_LEN) | ||
224 | i = RCU_TORTURE_PIPE_LEN; | ||
225 | atomic_inc(&rcu_torture_wcount[i]); | ||
226 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
227 | rp->rtort_mbtest = 0; | ||
228 | rcu_torture_free(rp); | ||
229 | } else | ||
230 | cur_ops->deferredfree(rp); | ||
231 | } | ||
232 | |||
233 | static void rcu_torture_deferred_free(struct rcu_torture *p) | ||
234 | { | ||
235 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | ||
236 | } | ||
237 | |||
238 | static struct rcu_torture_ops rcu_ops = { | ||
239 | .init = NULL, | ||
240 | .cleanup = NULL, | ||
241 | .readlock = rcu_torture_read_lock, | ||
242 | .readunlock = rcu_torture_read_unlock, | ||
243 | .completed = rcu_torture_completed, | ||
244 | .deferredfree = rcu_torture_deferred_free, | ||
245 | .stats = NULL, | ||
246 | .name = "rcu" | ||
247 | }; | ||
248 | |||
249 | /* | ||
250 | * Definitions for rcu_bh torture testing. | ||
251 | */ | ||
252 | |||
253 | static int rcu_bh_torture_read_lock(void) | ||
254 | { | ||
255 | rcu_read_lock_bh(); | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static void rcu_bh_torture_read_unlock(int idx) | ||
260 | { | ||
261 | rcu_read_unlock_bh(); | ||
262 | } | ||
263 | |||
264 | static int rcu_bh_torture_completed(void) | ||
265 | { | ||
266 | return rcu_batches_completed_bh(); | ||
267 | } | ||
268 | |||
269 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | ||
270 | { | ||
271 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | ||
272 | } | ||
273 | |||
274 | static struct rcu_torture_ops rcu_bh_ops = { | ||
275 | .init = NULL, | ||
276 | .cleanup = NULL, | ||
277 | .readlock = rcu_bh_torture_read_lock, | ||
278 | .readunlock = rcu_bh_torture_read_unlock, | ||
279 | .completed = rcu_bh_torture_completed, | ||
280 | .deferredfree = rcu_bh_torture_deferred_free, | ||
281 | .stats = NULL, | ||
282 | .name = "rcu_bh" | ||
283 | }; | ||
284 | |||
285 | static struct rcu_torture_ops *torture_ops[] = | ||
286 | { &rcu_ops, &rcu_bh_ops, NULL }; | ||
287 | |||
288 | /* | ||
194 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 289 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
195 | * for that pointed to by rcu_torture_current, freeing the old structure | 290 | * for that pointed to by rcu_torture_current, freeing the old structure |
196 | * after a series of grace periods (the "pipeline"). | 291 | * after a series of grace periods (the "pipeline"). |
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg) | |||
209 | 304 | ||
210 | do { | 305 | do { |
211 | schedule_timeout_uninterruptible(1); | 306 | schedule_timeout_uninterruptible(1); |
212 | if (rcu_batches_completed() == oldbatch) | ||
213 | continue; | ||
214 | if ((rp = rcu_torture_alloc()) == NULL) | 307 | if ((rp = rcu_torture_alloc()) == NULL) |
215 | continue; | 308 | continue; |
216 | rp->rtort_pipe_count = 0; | 309 | rp->rtort_pipe_count = 0; |
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg) | |||
225 | i = RCU_TORTURE_PIPE_LEN; | 318 | i = RCU_TORTURE_PIPE_LEN; |
226 | atomic_inc(&rcu_torture_wcount[i]); | 319 | atomic_inc(&rcu_torture_wcount[i]); |
227 | old_rp->rtort_pipe_count++; | 320 | old_rp->rtort_pipe_count++; |
228 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | 321 | cur_ops->deferredfree(old_rp); |
229 | } | 322 | } |
230 | rcu_torture_current_version++; | 323 | rcu_torture_current_version++; |
231 | oldbatch = rcu_batches_completed(); | 324 | oldbatch = cur_ops->completed(); |
232 | } while (!kthread_should_stop() && !fullstop); | 325 | } while (!kthread_should_stop() && !fullstop); |
233 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 326 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
234 | while (!kthread_should_stop()) | 327 | while (!kthread_should_stop()) |
@@ -246,6 +339,7 @@ static int | |||
246 | rcu_torture_reader(void *arg) | 339 | rcu_torture_reader(void *arg) |
247 | { | 340 | { |
248 | int completed; | 341 | int completed; |
342 | int idx; | ||
249 | DEFINE_RCU_RANDOM(rand); | 343 | DEFINE_RCU_RANDOM(rand); |
250 | struct rcu_torture *p; | 344 | struct rcu_torture *p; |
251 | int pipe_count; | 345 | int pipe_count; |
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg) | |||
254 | set_user_nice(current, 19); | 348 | set_user_nice(current, 19); |
255 | 349 | ||
256 | do { | 350 | do { |
257 | rcu_read_lock(); | 351 | idx = cur_ops->readlock(); |
258 | completed = rcu_batches_completed(); | 352 | completed = cur_ops->completed(); |
259 | p = rcu_dereference(rcu_torture_current); | 353 | p = rcu_dereference(rcu_torture_current); |
260 | if (p == NULL) { | 354 | if (p == NULL) { |
261 | /* Wait for rcu_torture_writer to get underway */ | 355 | /* Wait for rcu_torture_writer to get underway */ |
262 | rcu_read_unlock(); | 356 | cur_ops->readunlock(idx); |
263 | schedule_timeout_interruptible(HZ); | 357 | schedule_timeout_interruptible(HZ); |
264 | continue; | 358 | continue; |
265 | } | 359 | } |
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg) | |||
273 | pipe_count = RCU_TORTURE_PIPE_LEN; | 367 | pipe_count = RCU_TORTURE_PIPE_LEN; |
274 | } | 368 | } |
275 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | 369 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; |
276 | completed = rcu_batches_completed() - completed; | 370 | completed = cur_ops->completed() - completed; |
277 | if (completed > RCU_TORTURE_PIPE_LEN) { | 371 | if (completed > RCU_TORTURE_PIPE_LEN) { |
278 | /* Should not happen, but... */ | 372 | /* Should not happen, but... */ |
279 | completed = RCU_TORTURE_PIPE_LEN; | 373 | completed = RCU_TORTURE_PIPE_LEN; |
280 | } | 374 | } |
281 | ++__get_cpu_var(rcu_torture_batch)[completed]; | 375 | ++__get_cpu_var(rcu_torture_batch)[completed]; |
282 | preempt_enable(); | 376 | preempt_enable(); |
283 | rcu_read_unlock(); | 377 | cur_ops->readunlock(idx); |
284 | schedule(); | 378 | schedule(); |
285 | } while (!kthread_should_stop() && !fullstop); | 379 | } while (!kthread_should_stop() && !fullstop); |
286 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | 380 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); |
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page) | |||
311 | if (pipesummary[i] != 0) | 405 | if (pipesummary[i] != 0) |
312 | break; | 406 | break; |
313 | } | 407 | } |
314 | cnt += sprintf(&page[cnt], "rcutorture: "); | 408 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
315 | cnt += sprintf(&page[cnt], | 409 | cnt += sprintf(&page[cnt], |
316 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 410 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
317 | "rtmbe: %d", | 411 | "rtmbe: %d", |
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page) | |||
324 | atomic_read(&n_rcu_torture_mberror)); | 418 | atomic_read(&n_rcu_torture_mberror)); |
325 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 419 | if (atomic_read(&n_rcu_torture_mberror) != 0) |
326 | cnt += sprintf(&page[cnt], " !!!"); | 420 | cnt += sprintf(&page[cnt], " !!!"); |
327 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 421 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
328 | if (i > 1) { | 422 | if (i > 1) { |
329 | cnt += sprintf(&page[cnt], "!!! "); | 423 | cnt += sprintf(&page[cnt], "!!! "); |
330 | atomic_inc(&n_rcu_torture_error); | 424 | atomic_inc(&n_rcu_torture_error); |
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page) | |||
332 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | 426 | cnt += sprintf(&page[cnt], "Reader Pipe: "); |
333 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 427 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
334 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | 428 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); |
335 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 429 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
336 | cnt += sprintf(&page[cnt], "Reader Batch: "); | 430 | cnt += sprintf(&page[cnt], "Reader Batch: "); |
337 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | 431 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
338 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | 432 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); |
339 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 433 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
340 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | 434 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); |
341 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 435 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
342 | cnt += sprintf(&page[cnt], " %d", | 436 | cnt += sprintf(&page[cnt], " %d", |
343 | atomic_read(&rcu_torture_wcount[i])); | 437 | atomic_read(&rcu_torture_wcount[i])); |
344 | } | 438 | } |
345 | cnt += sprintf(&page[cnt], "\n"); | 439 | cnt += sprintf(&page[cnt], "\n"); |
440 | if (cur_ops->stats != NULL) | ||
441 | cnt += cur_ops->stats(&page[cnt]); | ||
346 | return cnt; | 442 | return cnt; |
347 | } | 443 | } |
348 | 444 | ||
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg) | |||
444 | static inline void | 540 | static inline void |
445 | rcu_torture_print_module_parms(char *tag) | 541 | rcu_torture_print_module_parms(char *tag) |
446 | { | 542 | { |
447 | printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " | 543 | printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " |
448 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 544 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
449 | "shuffle_interval = %d\n", | 545 | "shuffle_interval = %d\n", |
450 | tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, | 546 | torture_type, tag, nrealreaders, stat_interval, verbose, |
451 | shuffle_interval); | 547 | test_no_idle_hz, shuffle_interval); |
452 | } | 548 | } |
453 | 549 | ||
454 | static void | 550 | static void |
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void) | |||
493 | rcu_barrier(); | 589 | rcu_barrier(); |
494 | 590 | ||
495 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 591 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
592 | |||
593 | if (cur_ops->cleanup != NULL) | ||
594 | cur_ops->cleanup(); | ||
496 | if (atomic_read(&n_rcu_torture_error)) | 595 | if (atomic_read(&n_rcu_torture_error)) |
497 | rcu_torture_print_module_parms("End of test: FAILURE"); | 596 | rcu_torture_print_module_parms("End of test: FAILURE"); |
498 | else | 597 | else |
@@ -508,6 +607,20 @@ rcu_torture_init(void) | |||
508 | 607 | ||
509 | /* Process args and tell the world that the torturer is on the job. */ | 608 | /* Process args and tell the world that the torturer is on the job. */ |
510 | 609 | ||
610 | for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { | ||
611 | cur_ops = torture_ops[i]; | ||
612 | if (strcmp(torture_type, cur_ops->name) == 0) { | ||
613 | break; | ||
614 | } | ||
615 | } | ||
616 | if (cur_ops == NULL) { | ||
617 | printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", | ||
618 | torture_type); | ||
619 | return (-EINVAL); | ||
620 | } | ||
621 | if (cur_ops->init != NULL) | ||
622 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | ||
623 | |||
511 | if (nreaders >= 0) | 624 | if (nreaders >= 0) |
512 | nrealreaders = nreaders; | 625 | nrealreaders = nreaders; |
513 | else | 626 | else |
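The rcutorture rework above moves every flavour-specific operation (read-side lock/unlock, completed-batch counter, deferred free) behind a struct rcu_torture_ops, and rcu_torture_init() picks the active flavour by matching the torture_type module parameter against each entry's name. The selection idiom is ordinary C; a stand-alone sketch follows, where the flavour names and do_read() bodies are illustrative only.

#include <stdio.h>
#include <string.h>

/* Cut-down operations vector: one function pointer per flavour-specific op. */
struct flavor_ops {
	void (*do_read)(void);
	const char *name;
};

static void plain_read(void) { printf("plain read-side critical section\n"); }
static void bh_read(void)    { printf("bh read-side critical section\n"); }

static struct flavor_ops plain_ops = { .do_read = plain_read, .name = "rcu" };
static struct flavor_ops bh_ops    = { .do_read = bh_read,    .name = "rcu_bh" };

static struct flavor_ops *all_ops[] = { &plain_ops, &bh_ops, NULL };

/* Pick an ops vector by name, as rcu_torture_init() does with torture_type. */
static struct flavor_ops *pick_ops(const char *name)
{
	int i;

	for (i = 0; all_ops[i]; i++)
		if (strcmp(name, all_ops[i]->name) == 0)
			return all_ops[i];
	return NULL;
}

int main(void)
{
	struct flavor_ops *ops = pick_ops("rcu_bh");

	if (!ops) {
		fprintf(stderr, "invalid flavor\n");
		return 1;
	}
	ops->do_read();
	return 0;
}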
diff --git a/kernel/resource.c b/kernel/resource.c index e3080fcc66a3..0dd3a857579e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -7,7 +7,6 @@ | |||
7 | * Arbitrary resource management. | 7 | * Arbitrary resource management. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/config.h> | ||
11 | #include <linux/module.h> | 10 | #include <linux/module.h> |
12 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
13 | #include <linux/errno.h> | 12 | #include <linux/errno.h> |
@@ -23,20 +22,18 @@ | |||
23 | 22 | ||
24 | struct resource ioport_resource = { | 23 | struct resource ioport_resource = { |
25 | .name = "PCI IO", | 24 | .name = "PCI IO", |
26 | .start = 0x0000, | 25 | .start = 0, |
27 | .end = IO_SPACE_LIMIT, | 26 | .end = IO_SPACE_LIMIT, |
28 | .flags = IORESOURCE_IO, | 27 | .flags = IORESOURCE_IO, |
29 | }; | 28 | }; |
30 | |||
31 | EXPORT_SYMBOL(ioport_resource); | 29 | EXPORT_SYMBOL(ioport_resource); |
32 | 30 | ||
33 | struct resource iomem_resource = { | 31 | struct resource iomem_resource = { |
34 | .name = "PCI mem", | 32 | .name = "PCI mem", |
35 | .start = 0UL, | 33 | .start = 0, |
36 | .end = ~0UL, | 34 | .end = -1, |
37 | .flags = IORESOURCE_MEM, | 35 | .flags = IORESOURCE_MEM, |
38 | }; | 36 | }; |
39 | |||
40 | EXPORT_SYMBOL(iomem_resource); | 37 | EXPORT_SYMBOL(iomem_resource); |
41 | 38 | ||
42 | static DEFINE_RWLOCK(resource_lock); | 39 | static DEFINE_RWLOCK(resource_lock); |
@@ -83,10 +80,10 @@ static int r_show(struct seq_file *m, void *v) | |||
83 | for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) | 80 | for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) |
84 | if (p->parent == root) | 81 | if (p->parent == root) |
85 | break; | 82 | break; |
86 | seq_printf(m, "%*s%0*lx-%0*lx : %s\n", | 83 | seq_printf(m, "%*s%0*llx-%0*llx : %s\n", |
87 | depth * 2, "", | 84 | depth * 2, "", |
88 | width, r->start, | 85 | width, (unsigned long long) r->start, |
89 | width, r->end, | 86 | width, (unsigned long long) r->end, |
90 | r->name ? r->name : "<BAD>"); | 87 | r->name ? r->name : "<BAD>"); |
91 | return 0; | 88 | return 0; |
92 | } | 89 | } |
@@ -151,8 +148,8 @@ __initcall(ioresources_init); | |||
151 | /* Return the conflict entry if you can't request it */ | 148 | /* Return the conflict entry if you can't request it */ |
152 | static struct resource * __request_resource(struct resource *root, struct resource *new) | 149 | static struct resource * __request_resource(struct resource *root, struct resource *new) |
153 | { | 150 | { |
154 | unsigned long start = new->start; | 151 | resource_size_t start = new->start; |
155 | unsigned long end = new->end; | 152 | resource_size_t end = new->end; |
156 | struct resource *tmp, **p; | 153 | struct resource *tmp, **p; |
157 | 154 | ||
158 | if (end < start) | 155 | if (end < start) |
@@ -232,15 +229,52 @@ int release_resource(struct resource *old) | |||
232 | 229 | ||
233 | EXPORT_SYMBOL(release_resource); | 230 | EXPORT_SYMBOL(release_resource); |
234 | 231 | ||
232 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
233 | /* | ||
234 | * Finds the lowest memory resource that exists within [res->start, res->end) | ||
235 | * the caller must specify res->start, res->end, res->flags. | ||
236 | * If found, returns 0, res is overwritten, if not found, returns -1. | ||
237 | */ | ||
238 | int find_next_system_ram(struct resource *res) | ||
239 | { | ||
240 | resource_size_t start, end; | ||
241 | struct resource *p; | ||
242 | |||
243 | BUG_ON(!res); | ||
244 | |||
245 | start = res->start; | ||
246 | end = res->end; | ||
247 | |||
248 | read_lock(&resource_lock); | ||
249 | for (p = iomem_resource.child; p ; p = p->sibling) { | ||
250 | /* system ram is just marked as IORESOURCE_MEM */ | ||
251 | if (p->flags != res->flags) | ||
252 | continue; | ||
253 | if (p->start > end) { | ||
254 | p = NULL; | ||
255 | break; | ||
256 | } | ||
257 | if (p->start >= start) | ||
258 | break; | ||
259 | } | ||
260 | read_unlock(&resource_lock); | ||
261 | if (!p) | ||
262 | return -1; | ||
263 | /* copy data */ | ||
264 | res->start = p->start; | ||
265 | res->end = p->end; | ||
266 | return 0; | ||
267 | } | ||
268 | #endif | ||
269 | |||
235 | /* | 270 | /* |
236 | * Find empty slot in the resource tree given range and alignment. | 271 | * Find empty slot in the resource tree given range and alignment. |
237 | */ | 272 | */ |
238 | static int find_resource(struct resource *root, struct resource *new, | 273 | static int find_resource(struct resource *root, struct resource *new, |
239 | unsigned long size, | 274 | resource_size_t size, resource_size_t min, |
240 | unsigned long min, unsigned long max, | 275 | resource_size_t max, resource_size_t align, |
241 | unsigned long align, | ||
242 | void (*alignf)(void *, struct resource *, | 276 | void (*alignf)(void *, struct resource *, |
243 | unsigned long, unsigned long), | 277 | resource_size_t, resource_size_t), |
244 | void *alignf_data) | 278 | void *alignf_data) |
245 | { | 279 | { |
246 | struct resource *this = root->child; | 280 | struct resource *this = root->child; |
@@ -282,11 +316,10 @@ static int find_resource(struct resource *root, struct resource *new, | |||
282 | * Allocate empty slot in the resource tree given range and alignment. | 316 | * Allocate empty slot in the resource tree given range and alignment. |
283 | */ | 317 | */ |
284 | int allocate_resource(struct resource *root, struct resource *new, | 318 | int allocate_resource(struct resource *root, struct resource *new, |
285 | unsigned long size, | 319 | resource_size_t size, resource_size_t min, |
286 | unsigned long min, unsigned long max, | 320 | resource_size_t max, resource_size_t align, |
287 | unsigned long align, | ||
288 | void (*alignf)(void *, struct resource *, | 321 | void (*alignf)(void *, struct resource *, |
289 | unsigned long, unsigned long), | 322 | resource_size_t, resource_size_t), |
290 | void *alignf_data) | 323 | void *alignf_data) |
291 | { | 324 | { |
292 | int err; | 325 | int err; |
@@ -371,17 +404,15 @@ int insert_resource(struct resource *parent, struct resource *new) | |||
371 | return result; | 404 | return result; |
372 | } | 405 | } |
373 | 406 | ||
374 | EXPORT_SYMBOL(insert_resource); | ||
375 | |||
376 | /* | 407 | /* |
377 | * Given an existing resource, change its start and size to match the | 408 | * Given an existing resource, change its start and size to match the |
378 | * arguments. Returns -EBUSY if it can't fit. Existing children of | 409 | * arguments. Returns -EBUSY if it can't fit. Existing children of |
379 | * the resource are assumed to be immutable. | 410 | * the resource are assumed to be immutable. |
380 | */ | 411 | */ |
381 | int adjust_resource(struct resource *res, unsigned long start, unsigned long size) | 412 | int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) |
382 | { | 413 | { |
383 | struct resource *tmp, *parent = res->parent; | 414 | struct resource *tmp, *parent = res->parent; |
384 | unsigned long end = start + size - 1; | 415 | resource_size_t end = start + size - 1; |
385 | int result = -EBUSY; | 416 | int result = -EBUSY; |
386 | 417 | ||
387 | write_lock(&resource_lock); | 418 | write_lock(&resource_lock); |
@@ -428,7 +459,9 @@ EXPORT_SYMBOL(adjust_resource); | |||
428 | * | 459 | * |
429 | * Release-region releases a matching busy region. | 460 | * Release-region releases a matching busy region. |
430 | */ | 461 | */ |
431 | struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) | 462 | struct resource * __request_region(struct resource *parent, |
463 | resource_size_t start, resource_size_t n, | ||
464 | const char *name) | ||
432 | { | 465 | { |
433 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); | 466 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); |
434 | 467 | ||
@@ -464,7 +497,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start, | |||
464 | 497 | ||
465 | EXPORT_SYMBOL(__request_region); | 498 | EXPORT_SYMBOL(__request_region); |
466 | 499 | ||
467 | int __check_region(struct resource *parent, unsigned long start, unsigned long n) | 500 | int __check_region(struct resource *parent, resource_size_t start, |
501 | resource_size_t n) | ||
468 | { | 502 | { |
469 | struct resource * res; | 503 | struct resource * res; |
470 | 504 | ||
@@ -479,10 +513,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n | |||
479 | 513 | ||
480 | EXPORT_SYMBOL(__check_region); | 514 | EXPORT_SYMBOL(__check_region); |
481 | 515 | ||
482 | void __release_region(struct resource *parent, unsigned long start, unsigned long n) | 516 | void __release_region(struct resource *parent, resource_size_t start, |
517 | resource_size_t n) | ||
483 | { | 518 | { |
484 | struct resource **p; | 519 | struct resource **p; |
485 | unsigned long end; | 520 | resource_size_t end; |
486 | 521 | ||
487 | p = &parent->child; | 522 | p = &parent->child; |
488 | end = start + n - 1; | 523 | end = start + n - 1; |
@@ -511,7 +546,9 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon | |||
511 | 546 | ||
512 | write_unlock(&resource_lock); | 547 | write_unlock(&resource_lock); |
513 | 548 | ||
514 | printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); | 549 | printk(KERN_WARNING "Trying to free nonexistent resource " |
550 | "<%016llx-%016llx>\n", (unsigned long long)start, | ||
551 | (unsigned long long)end); | ||
515 | } | 552 | } |
516 | 553 | ||
517 | EXPORT_SYMBOL(__release_region); | 554 | EXPORT_SYMBOL(__release_region); |
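The resource.c changes above move the allocation and region helpers to resource_size_t, print resources with explicit unsigned long long casts, and (under CONFIG_MEMORY_HOTPLUG) add find_next_system_ram() for locating the first plain IORESOURCE_MEM resource inside a candidate range. A hedged sketch of a caller walking RAM chunks with it is below; the loop, its bounds and report_system_ram() itself are illustrative, and it assumes the companion declaration is visible to memory-hotplug code.

#include <linux/ioport.h>
#include <linux/kernel.h>

/* Walk [base, limit] and report each chunk that find_next_system_ram()
 * recognizes (system RAM is registered as plain IORESOURCE_MEM). */
static void report_system_ram(resource_size_t base, resource_size_t limit)
{
	struct resource res;

	while (base < limit) {
		res.start = base;
		res.end   = limit;
		res.flags = IORESOURCE_MEM;
		if (find_next_system_ram(&res) < 0)
			break;			/* nothing left in the range */
		printk(KERN_INFO "RAM: %016llx-%016llx\n",
		       (unsigned long long)res.start,
		       (unsigned long long)res.end);
		if (res.end >= limit)
			break;
		base = res.end + 1;		/* continue past this chunk */
	}
}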
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c new file mode 100644 index 000000000000..0c1faa950af7 --- /dev/null +++ b/kernel/rtmutex-debug.c | |||
@@ -0,0 +1,242 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This code is based on the rt.c implementation in the preempt-rt tree. | ||
10 | * Portions of said code are | ||
11 | * | ||
12 | * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey | ||
13 | * Copyright (C) 2006 Esben Nielsen | ||
14 | * Copyright (C) 2006 Kihon Technologies Inc., | ||
15 | * Steven Rostedt <rostedt@goodmis.org> | ||
16 | * | ||
17 | * See rt.c in preempt-rt for proper credits and further information | ||
18 | */ | ||
19 | #include <linux/config.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/delay.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/spinlock.h> | ||
24 | #include <linux/kallsyms.h> | ||
25 | #include <linux/syscalls.h> | ||
26 | #include <linux/interrupt.h> | ||
27 | #include <linux/plist.h> | ||
28 | #include <linux/fs.h> | ||
29 | #include <linux/debug_locks.h> | ||
30 | |||
31 | #include "rtmutex_common.h" | ||
32 | |||
33 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
34 | # include "rtmutex-debug.h" | ||
35 | #else | ||
36 | # include "rtmutex.h" | ||
37 | #endif | ||
38 | |||
39 | # define TRACE_WARN_ON(x) WARN_ON(x) | ||
40 | # define TRACE_BUG_ON(x) BUG_ON(x) | ||
41 | |||
42 | # define TRACE_OFF() \ | ||
43 | do { \ | ||
44 | if (rt_trace_on) { \ | ||
45 | rt_trace_on = 0; \ | ||
46 | console_verbose(); \ | ||
47 | if (spin_is_locked(¤t->pi_lock)) \ | ||
48 | spin_unlock(¤t->pi_lock); \ | ||
49 | } \ | ||
50 | } while (0) | ||
51 | |||
52 | # define TRACE_OFF_NOLOCK() \ | ||
53 | do { \ | ||
54 | if (rt_trace_on) { \ | ||
55 | rt_trace_on = 0; \ | ||
56 | console_verbose(); \ | ||
57 | } \ | ||
58 | } while (0) | ||
59 | |||
60 | # define TRACE_BUG_LOCKED() \ | ||
61 | do { \ | ||
62 | TRACE_OFF(); \ | ||
63 | BUG(); \ | ||
64 | } while (0) | ||
65 | |||
66 | # define TRACE_WARN_ON_LOCKED(c) \ | ||
67 | do { \ | ||
68 | if (unlikely(c)) { \ | ||
69 | TRACE_OFF(); \ | ||
70 | WARN_ON(1); \ | ||
71 | } \ | ||
72 | } while (0) | ||
73 | |||
74 | # define TRACE_BUG_ON_LOCKED(c) \ | ||
75 | do { \ | ||
76 | if (unlikely(c)) \ | ||
77 | TRACE_BUG_LOCKED(); \ | ||
78 | } while (0) | ||
79 | |||
80 | #ifdef CONFIG_SMP | ||
81 | # define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) | ||
82 | #else | ||
83 | # define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) | ||
84 | #endif | ||
85 | |||
86 | /* | ||
87 | * deadlock detection flag. We turn it off when we detect | ||
88 | * the first problem because we dont want to recurse back | ||
89 | * into the tracing code when doing error printk or | ||
90 | * executing a BUG(): | ||
91 | */ | ||
92 | int rt_trace_on = 1; | ||
93 | |||
94 | void deadlock_trace_off(void) | ||
95 | { | ||
96 | rt_trace_on = 0; | ||
97 | } | ||
98 | |||
99 | static void printk_task(struct task_struct *p) | ||
100 | { | ||
101 | if (p) | ||
102 | printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
103 | else | ||
104 | printk("<none>"); | ||
105 | } | ||
106 | |||
107 | static void printk_lock(struct rt_mutex *lock, int print_owner) | ||
108 | { | ||
109 | if (lock->name) | ||
110 | printk(" [%p] {%s}\n", | ||
111 | lock, lock->name); | ||
112 | else | ||
113 | printk(" [%p] {%s:%d}\n", | ||
114 | lock, lock->file, lock->line); | ||
115 | |||
116 | if (print_owner && rt_mutex_owner(lock)) { | ||
117 | printk(".. ->owner: %p\n", lock->owner); | ||
118 | printk(".. held by: "); | ||
119 | printk_task(rt_mutex_owner(lock)); | ||
120 | printk("\n"); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | void rt_mutex_debug_task_free(struct task_struct *task) | ||
125 | { | ||
126 | WARN_ON(!plist_head_empty(&task->pi_waiters)); | ||
127 | WARN_ON(task->pi_blocked_on); | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * We fill out the fields in the waiter to store the information about | ||
132 | * the deadlock. We print when we return. act_waiter can be NULL in | ||
133 | * case of a remove waiter operation. | ||
134 | */ | ||
135 | void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, | ||
136 | struct rt_mutex *lock) | ||
137 | { | ||
138 | struct task_struct *task; | ||
139 | |||
140 | if (!rt_trace_on || detect || !act_waiter) | ||
141 | return; | ||
142 | |||
143 | task = rt_mutex_owner(act_waiter->lock); | ||
144 | if (task && task != current) { | ||
145 | act_waiter->deadlock_task_pid = task->pid; | ||
146 | act_waiter->deadlock_lock = lock; | ||
147 | } | ||
148 | } | ||
149 | |||
150 | void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | ||
151 | { | ||
152 | struct task_struct *task; | ||
153 | |||
154 | if (!waiter->deadlock_lock || !rt_trace_on) | ||
155 | return; | ||
156 | |||
157 | task = find_task_by_pid(waiter->deadlock_task_pid); | ||
158 | if (!task) | ||
159 | return; | ||
160 | |||
161 | TRACE_OFF_NOLOCK(); | ||
162 | |||
163 | printk("\n============================================\n"); | ||
164 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | ||
165 | printk( "--------------------------------------------\n"); | ||
166 | printk("%s/%d is deadlocking current task %s/%d\n\n", | ||
167 | task->comm, task->pid, current->comm, current->pid); | ||
168 | |||
169 | printk("\n1) %s/%d is trying to acquire this lock:\n", | ||
170 | current->comm, current->pid); | ||
171 | printk_lock(waiter->lock, 1); | ||
172 | |||
173 | printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); | ||
174 | printk_lock(waiter->deadlock_lock, 1); | ||
175 | |||
176 | debug_show_held_locks(current); | ||
177 | debug_show_held_locks(task); | ||
178 | |||
179 | printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); | ||
180 | show_stack(task, NULL); | ||
181 | printk("\n%s/%d's [current] stackdump:\n\n", | ||
182 | current->comm, current->pid); | ||
183 | dump_stack(); | ||
184 | debug_show_all_locks(); | ||
185 | |||
186 | printk("[ turning off deadlock detection." | ||
187 | "Please report this trace. ]\n\n"); | ||
188 | local_irq_disable(); | ||
189 | } | ||
190 | |||
191 | void debug_rt_mutex_lock(struct rt_mutex *lock) | ||
192 | { | ||
193 | } | ||
194 | |||
195 | void debug_rt_mutex_unlock(struct rt_mutex *lock) | ||
196 | { | ||
197 | TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); | ||
198 | } | ||
199 | |||
200 | void | ||
201 | debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) | ||
202 | { | ||
203 | } | ||
204 | |||
205 | void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) | ||
206 | { | ||
207 | TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); | ||
208 | } | ||
209 | |||
210 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | ||
211 | { | ||
212 | memset(waiter, 0x11, sizeof(*waiter)); | ||
213 | plist_node_init(&waiter->list_entry, MAX_PRIO); | ||
214 | plist_node_init(&waiter->pi_list_entry, MAX_PRIO); | ||
215 | } | ||
216 | |||
217 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | ||
218 | { | ||
219 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); | ||
220 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
221 | TRACE_WARN_ON(waiter->task); | ||
222 | memset(waiter, 0x22, sizeof(*waiter)); | ||
223 | } | ||
224 | |||
225 | void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) | ||
226 | { | ||
227 | /* | ||
228 | * Make sure we are not reinitializing a held lock: | ||
229 | */ | ||
230 | debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | ||
231 | lock->name = name; | ||
232 | } | ||
233 | |||
234 | void | ||
235 | rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) | ||
236 | { | ||
237 | } | ||
238 | |||
239 | void rt_mutex_deadlock_account_unlock(struct task_struct *task) | ||
240 | { | ||
241 | } | ||
242 | |||
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h new file mode 100644 index 000000000000..14193d596d78 --- /dev/null +++ b/kernel/rtmutex-debug.h | |||
@@ -0,0 +1,33 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains macros used solely by rtmutex.c. Debug version. | ||
10 | */ | ||
11 | |||
12 | extern void | ||
13 | rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); | ||
14 | extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); | ||
15 | extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); | ||
16 | extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); | ||
17 | extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); | ||
18 | extern void debug_rt_mutex_lock(struct rt_mutex *lock); | ||
19 | extern void debug_rt_mutex_unlock(struct rt_mutex *lock); | ||
20 | extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, | ||
21 | struct task_struct *powner); | ||
22 | extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); | ||
23 | extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, | ||
24 | struct rt_mutex *lock); | ||
25 | extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); | ||
26 | # define debug_rt_mutex_reset_waiter(w) \ | ||
27 | do { (w)->deadlock_lock = NULL; } while (0) | ||
28 | |||
29 | static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, | ||
30 | int detect) | ||
31 | { | ||
32 | return (waiter != NULL); | ||
33 | } | ||
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c new file mode 100644 index 000000000000..948bd8f643e2 --- /dev/null +++ b/kernel/rtmutex-tester.c | |||
@@ -0,0 +1,441 @@ | |||
1 | /* | ||
2 | * RT-Mutex-tester: scriptable tester for rt mutexes | ||
3 | * | ||
4 | * started by Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
7 | * | ||
8 | */ | ||
9 | #include <linux/config.h> | ||
10 | #include <linux/kthread.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/smp_lock.h> | ||
14 | #include <linux/spinlock.h> | ||
15 | #include <linux/sysdev.h> | ||
16 | #include <linux/timer.h> | ||
17 | |||
18 | #include "rtmutex.h" | ||
19 | |||
20 | #define MAX_RT_TEST_THREADS 8 | ||
21 | #define MAX_RT_TEST_MUTEXES 8 | ||
22 | |||
23 | static spinlock_t rttest_lock; | ||
24 | static atomic_t rttest_event; | ||
25 | |||
26 | struct test_thread_data { | ||
27 | int opcode; | ||
28 | int opdata; | ||
29 | int mutexes[MAX_RT_TEST_MUTEXES]; | ||
30 | int bkl; | ||
31 | int event; | ||
32 | struct sys_device sysdev; | ||
33 | }; | ||
34 | |||
35 | static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; | ||
36 | static struct task_struct *threads[MAX_RT_TEST_THREADS]; | ||
37 | static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; | ||
38 | |||
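For orientation, a summary of the per-mutex state codes the tester keeps in test_thread_data.mutexes[]; this is derived from handle_op() and schedule_rt_mutex_test() below and is added here purely as a reading aid:

/*
 * Per-mutex state codes in test_thread_data.mutexes[] (derived from the
 * code below):
 *  0 - free, not part of a pending operation
 *  1 - lock requested, acquisition in progress
 *  2 - blocked, parked in schedule_rt_mutex_test()
 *  3 - woken up, waiting for RTTEST_LOCKCONT
 *  4 - lock held
 */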
39 | enum test_opcodes { | ||
40 | RTTEST_NOP = 0, | ||
41 | RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ | ||
42 | RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ | ||
43 | RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ | ||
44 | RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ | ||
45 | RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ | ||
46 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ | ||
47 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ | ||
48 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ | ||
49 | RTTEST_LOCKBKL, /* 9 Lock BKL */ | ||
50 | RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ | ||
51 | RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ | ||
52 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ | ||
53 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ | ||
54 | }; | ||
55 | |||
56 | static int handle_op(struct test_thread_data *td, int lockwakeup) | ||
57 | { | ||
58 | int i, id, ret = -EINVAL; | ||
59 | |||
60 | switch(td->opcode) { | ||
61 | |||
62 | case RTTEST_NOP: | ||
63 | return 0; | ||
64 | |||
65 | case RTTEST_LOCKCONT: | ||
66 | td->mutexes[td->opdata] = 1; | ||
67 | td->event = atomic_add_return(1, &rttest_event); | ||
68 | return 0; | ||
69 | |||
70 | case RTTEST_RESET: | ||
71 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { | ||
72 | if (td->mutexes[i] == 4) { | ||
73 | rt_mutex_unlock(&mutexes[i]); | ||
74 | td->mutexes[i] = 0; | ||
75 | } | ||
76 | } | ||
77 | |||
78 | if (!lockwakeup && td->bkl == 4) { | ||
79 | unlock_kernel(); | ||
80 | td->bkl = 0; | ||
81 | } | ||
82 | return 0; | ||
83 | |||
84 | case RTTEST_RESETEVENT: | ||
85 | atomic_set(&rttest_event, 0); | ||
86 | return 0; | ||
87 | |||
88 | default: | ||
89 | if (lockwakeup) | ||
90 | return ret; | ||
91 | } | ||
92 | |||
93 | switch(td->opcode) { | ||
94 | |||
95 | case RTTEST_LOCK: | ||
96 | case RTTEST_LOCKNOWAIT: | ||
97 | id = td->opdata; | ||
98 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
99 | return ret; | ||
100 | |||
101 | td->mutexes[id] = 1; | ||
102 | td->event = atomic_add_return(1, &rttest_event); | ||
103 | rt_mutex_lock(&mutexes[id]); | ||
104 | td->event = atomic_add_return(1, &rttest_event); | ||
105 | td->mutexes[id] = 4; | ||
106 | return 0; | ||
107 | |||
108 | case RTTEST_LOCKINT: | ||
109 | case RTTEST_LOCKINTNOWAIT: | ||
110 | id = td->opdata; | ||
111 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
112 | return ret; | ||
113 | |||
114 | td->mutexes[id] = 1; | ||
115 | td->event = atomic_add_return(1, &rttest_event); | ||
116 | ret = rt_mutex_lock_interruptible(&mutexes[id], 0); | ||
117 | td->event = atomic_add_return(1, &rttest_event); | ||
118 | td->mutexes[id] = ret ? 0 : 4; | ||
119 | return ret ? -EINTR : 0; | ||
120 | |||
121 | case RTTEST_UNLOCK: | ||
122 | id = td->opdata; | ||
123 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) | ||
124 | return ret; | ||
125 | |||
126 | td->event = atomic_add_return(1, &rttest_event); | ||
127 | rt_mutex_unlock(&mutexes[id]); | ||
128 | td->event = atomic_add_return(1, &rttest_event); | ||
129 | td->mutexes[id] = 0; | ||
130 | return 0; | ||
131 | |||
132 | case RTTEST_LOCKBKL: | ||
133 | if (td->bkl) | ||
134 | return 0; | ||
135 | td->bkl = 1; | ||
136 | lock_kernel(); | ||
137 | td->bkl = 4; | ||
138 | return 0; | ||
139 | |||
140 | case RTTEST_UNLOCKBKL: | ||
141 | if (td->bkl != 4) | ||
142 | break; | ||
143 | unlock_kernel(); | ||
144 | td->bkl = 0; | ||
145 | return 0; | ||
146 | |||
147 | default: | ||
148 | break; | ||
149 | } | ||
150 | return ret; | ||
151 | } | ||
152 | |||
153 | /* | ||
154 | * Schedule() replacement, called via schedule_rt_mutex(). Only called | ||
155 | * for threads with PF_MUTEX_TESTER set. | ||
156 | * | ||
157 | * This allows us to have fine-grained control over the event flow. | ||
158 | * | ||
159 | */ | ||
160 | void schedule_rt_mutex_test(struct rt_mutex *mutex) | ||
161 | { | ||
162 | int tid, op, dat; | ||
163 | struct test_thread_data *td; | ||
164 | |||
165 | /* We have to lookup the task */ | ||
166 | for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { | ||
167 | if (threads[tid] == current) | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | BUG_ON(tid == MAX_RT_TEST_THREADS); | ||
172 | |||
173 | td = &thread_data[tid]; | ||
174 | |||
175 | op = td->opcode; | ||
176 | dat = td->opdata; | ||
177 | |||
178 | switch (op) { | ||
179 | case RTTEST_LOCK: | ||
180 | case RTTEST_LOCKINT: | ||
181 | case RTTEST_LOCKNOWAIT: | ||
182 | case RTTEST_LOCKINTNOWAIT: | ||
183 | if (mutex != &mutexes[dat]) | ||
184 | break; | ||
185 | |||
186 | if (td->mutexes[dat] != 1) | ||
187 | break; | ||
188 | |||
189 | td->mutexes[dat] = 2; | ||
190 | td->event = atomic_add_return(1, &rttest_event); | ||
191 | break; | ||
192 | |||
193 | case RTTEST_LOCKBKL: | ||
194 | default: | ||
195 | break; | ||
196 | } | ||
197 | |||
198 | schedule(); | ||
199 | |||
200 | |||
201 | switch (op) { | ||
202 | case RTTEST_LOCK: | ||
203 | case RTTEST_LOCKINT: | ||
204 | if (mutex != &mutexes[dat]) | ||
205 | return; | ||
206 | |||
207 | if (td->mutexes[dat] != 2) | ||
208 | return; | ||
209 | |||
210 | td->mutexes[dat] = 3; | ||
211 | td->event = atomic_add_return(1, &rttest_event); | ||
212 | break; | ||
213 | |||
214 | case RTTEST_LOCKNOWAIT: | ||
215 | case RTTEST_LOCKINTNOWAIT: | ||
216 | if (mutex != &mutexes[dat]) | ||
217 | return; | ||
218 | |||
219 | if (td->mutexes[dat] != 2) | ||
220 | return; | ||
221 | |||
222 | td->mutexes[dat] = 1; | ||
223 | td->event = atomic_add_return(1, &rttest_event); | ||
224 | return; | ||
225 | |||
226 | case RTTEST_LOCKBKL: | ||
227 | return; | ||
228 | default: | ||
229 | return; | ||
230 | } | ||
231 | |||
232 | td->opcode = 0; | ||
233 | |||
234 | for (;;) { | ||
235 | set_current_state(TASK_INTERRUPTIBLE); | ||
236 | |||
237 | if (td->opcode > 0) { | ||
238 | int ret; | ||
239 | |||
240 | set_current_state(TASK_RUNNING); | ||
241 | ret = handle_op(td, 1); | ||
242 | set_current_state(TASK_INTERRUPTIBLE); | ||
243 | if (td->opcode == RTTEST_LOCKCONT) | ||
244 | break; | ||
245 | td->opcode = ret; | ||
246 | } | ||
247 | |||
248 | /* Wait for the next command to be executed */ | ||
249 | schedule(); | ||
250 | } | ||
251 | |||
252 | /* Restore previous command and data */ | ||
253 | td->opcode = op; | ||
254 | td->opdata = dat; | ||
255 | } | ||
256 | |||
257 | static int test_func(void *data) | ||
258 | { | ||
259 | struct test_thread_data *td = data; | ||
260 | int ret; | ||
261 | |||
262 | current->flags |= PF_MUTEX_TESTER; | ||
263 | allow_signal(SIGHUP); | ||
264 | |||
265 | for(;;) { | ||
266 | |||
267 | set_current_state(TASK_INTERRUPTIBLE); | ||
268 | |||
269 | if (td->opcode > 0) { | ||
270 | set_current_state(TASK_RUNNING); | ||
271 | ret = handle_op(td, 0); | ||
272 | set_current_state(TASK_INTERRUPTIBLE); | ||
273 | td->opcode = ret; | ||
274 | } | ||
275 | |||
276 | /* Wait for the next command to be executed */ | ||
277 | schedule(); | ||
278 | try_to_freeze(); | ||
279 | |||
280 | if (signal_pending(current)) | ||
281 | flush_signals(current); | ||
282 | |||
283 | if(kthread_should_stop()) | ||
284 | break; | ||
285 | } | ||
286 | return 0; | ||
287 | } | ||
288 | |||
289 | /** | ||
290 | * sysfs_test_command - interface for test commands | ||
291 | * @dev: thread reference | ||
292 | * @buf: command for actual step | ||
293 | * @count: length of buffer | ||
294 | * | ||
295 | * command syntax: | ||
296 | * | ||
297 | * opcode:data | ||
298 | */ | ||
299 | static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, | ||
300 | size_t count) | ||
301 | { | ||
302 | struct sched_param schedpar; | ||
303 | struct test_thread_data *td; | ||
304 | char cmdbuf[32]; | ||
305 | int op, dat, tid, ret; | ||
306 | |||
307 | td = container_of(dev, struct test_thread_data, sysdev); | ||
308 | tid = td->sysdev.id; | ||
309 | |||
310 | /* strings from sysfs write are not 0 terminated! */ | ||
311 | if (count >= sizeof(cmdbuf)) | ||
312 | return -EINVAL; | ||
313 | |||
314 | /* strip off the \n: */ | ||
315 | if (buf[count-1] == '\n') | ||
316 | count--; | ||
317 | if (count < 1) | ||
318 | return -EINVAL; | ||
319 | |||
320 | memcpy(cmdbuf, buf, count); | ||
321 | cmdbuf[count] = 0; | ||
322 | |||
323 | if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) | ||
324 | return -EINVAL; | ||
325 | |||
326 | switch (op) { | ||
327 | case RTTEST_SCHEDOT: | ||
328 | schedpar.sched_priority = 0; | ||
329 | ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); | ||
330 | if (ret) | ||
331 | return ret; | ||
332 | set_user_nice(current, 0); | ||
333 | break; | ||
334 | |||
335 | case RTTEST_SCHEDRT: | ||
336 | schedpar.sched_priority = dat; | ||
337 | ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); | ||
338 | if (ret) | ||
339 | return ret; | ||
340 | break; | ||
341 | |||
342 | case RTTEST_SIGNAL: | ||
343 | send_sig(SIGHUP, threads[tid], 0); | ||
344 | break; | ||
345 | |||
346 | default: | ||
347 | if (td->opcode > 0) | ||
348 | return -EBUSY; | ||
349 | td->opdata = dat; | ||
350 | td->opcode = op; | ||
351 | wake_up_process(threads[tid]); | ||
352 | } | ||
353 | |||
354 | return count; | ||
355 | } | ||
356 | |||
357 | /** | ||
358 | * sysfs_test_status - sysfs interface for rt tester | ||
359 | * @dev: thread to query | ||
360 | * @buf: char buffer to be filled with thread status info | ||
361 | */ | ||
362 | static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) | ||
363 | { | ||
364 | struct test_thread_data *td; | ||
365 | struct task_struct *tsk; | ||
366 | char *curr = buf; | ||
367 | int i; | ||
368 | |||
369 | td = container_of(dev, struct test_thread_data, sysdev); | ||
370 | tsk = threads[td->sysdev.id]; | ||
371 | |||
372 | spin_lock(&rttest_lock); | ||
373 | |||
374 | curr += sprintf(curr, | ||
375 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", | ||
376 | td->opcode, td->event, tsk->state, | ||
377 | (MAX_RT_PRIO - 1) - tsk->prio, | ||
378 | (MAX_RT_PRIO - 1) - tsk->normal_prio, | ||
379 | tsk->pi_blocked_on, td->bkl); | ||
380 | |||
381 | for (i = MAX_RT_TEST_MUTEXES - 1; i >= 0; i--) | ||
382 | curr += sprintf(curr, "%d", td->mutexes[i]); | ||
383 | |||
384 | spin_unlock(&rttest_lock); | ||
385 | |||
386 | curr += sprintf(curr, ", T: %p, R: %p\n", tsk, | ||
387 | mutexes[td->sysdev.id].owner); | ||
388 | |||
389 | return curr - buf; | ||
390 | } | ||
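As a usage sketch: the "rttest" sysdev class registered below typically makes these attributes visible as /sys/devices/system/rttest/rttest<N>/command and .../status, though the exact path depends on the sysfs layout of the tree. A minimal userspace driver, using the opcode numbers from enum test_opcodes above, might look like this (illustrative only; rttest_cmd() and the path are assumptions, not part of the patch):

/* Illustrative userspace helper, not part of the patch. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int rttest_cmd(int thread, int opcode, int data)
{
        char path[128], cmd[32];
        int fd, len, ret;

        /* assumed sysdev path; adjust to your sysfs layout */
        snprintf(path, sizeof(path),
                 "/sys/devices/system/rttest/rttest%d/command", thread);
        len = snprintf(cmd, sizeof(cmd), "%d:%d", opcode, data);

        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        ret = write(fd, cmd, len) == len ? 0 : -1;
        close(fd);
        return ret;
}

int main(void)
{
        rttest_cmd(0, 2, 80);   /* RTTEST_SCHEDRT: thread 0 becomes SCHED_FIFO prio 80 */
        rttest_cmd(0, 3, 0);    /* RTTEST_LOCK: thread 0 takes mutex 0 */
        rttest_cmd(0, 8, 0);    /* RTTEST_UNLOCK: thread 0 releases mutex 0 */
        return 0;
}

Reading back the status attribute of the same directory returns the per-thread line produced by sysfs_test_status() above.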
391 | |||
392 | static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); | ||
393 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); | ||
394 | |||
395 | static struct sysdev_class rttest_sysclass = { | ||
396 | set_kset_name("rttest"), | ||
397 | }; | ||
398 | |||
399 | static int init_test_thread(int id) | ||
400 | { | ||
401 | thread_data[id].sysdev.cls = &rttest_sysclass; | ||
402 | thread_data[id].sysdev.id = id; | ||
403 | |||
404 | threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); | ||
405 | if (IS_ERR(threads[id])) | ||
406 | return PTR_ERR(threads[id]); | ||
407 | |||
408 | return sysdev_register(&thread_data[id].sysdev); | ||
409 | } | ||
410 | |||
411 | static int init_rttest(void) | ||
412 | { | ||
413 | int ret, i; | ||
414 | |||
415 | spin_lock_init(&rttest_lock); | ||
416 | |||
417 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) | ||
418 | rt_mutex_init(&mutexes[i]); | ||
419 | |||
420 | ret = sysdev_class_register(&rttest_sysclass); | ||
421 | if (ret) | ||
422 | return ret; | ||
423 | |||
424 | for (i = 0; i < MAX_RT_TEST_THREADS; i++) { | ||
425 | ret = init_test_thread(i); | ||
426 | if (ret) | ||
427 | break; | ||
428 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); | ||
429 | if (ret) | ||
430 | break; | ||
431 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); | ||
432 | if (ret) | ||
433 | break; | ||
434 | } | ||
435 | |||
436 | printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); | ||
437 | |||
438 | return ret; | ||
439 | } | ||
440 | |||
441 | device_initcall(init_rttest); | ||
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 000000000000..d2ef13b485e7 --- /dev/null +++ b/kernel/rtmutex.c | |||
@@ -0,0 +1,989 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: simple blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner. | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt | ||
9 | * Copyright (C) 2006 Esben Nielsen | ||
10 | */ | ||
11 | #include <linux/spinlock.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/timer.h> | ||
15 | |||
16 | #include "rtmutex_common.h" | ||
17 | |||
18 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
19 | # include "rtmutex-debug.h" | ||
20 | #else | ||
21 | # include "rtmutex.h" | ||
22 | #endif | ||
23 | |||
24 | /* | ||
25 | * lock->owner state tracking: | ||
26 | * | ||
27 | * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 | ||
28 | * are used to keep track of the "owner is pending" and "lock has | ||
29 | * waiters" state. | ||
30 | * | ||
31 | * owner bit1 bit0 | ||
32 | * NULL 0 0 lock is free (fast acquire possible) | ||
33 | * NULL 0 1 invalid state | ||
34 | * NULL 1 0 Transitional State* | ||
35 | * NULL 1 1 invalid state | ||
36 | * taskpointer 0 0 lock is held (fast release possible) | ||
37 | * taskpointer 0 1 task is pending owner | ||
38 | * taskpointer 1 0 lock is held and has waiters | ||
39 | * taskpointer 1 1 task is pending owner and lock has more waiters | ||
40 | * | ||
41 | * Pending ownership is assigned to the top (highest priority) | ||
42 | * waiter of the lock, when the lock is released. The thread is woken | ||
43 | * up and can now take the lock. Until the lock is taken (bit 0 | ||
44 | * cleared) a competing higher priority thread can steal the lock | ||
45 | * which puts the woken up thread back on the waiters list. | ||
46 | * | ||
47 | * The fast atomic compare exchange based acquire and release is only | ||
48 | * possible when bit 0 and 1 of lock->owner are 0. | ||
49 | * | ||
50 | * (*) There's a small window in which the owner can be NULL and the | ||
51 | * "lock has waiters" bit is set. This can happen when grabbing the lock. | ||
52 | * To prevent a cmpxchg of the owner releasing the lock, we need to set this | ||
53 | * bit before looking at the lock, hence the reason this is a transitional | ||
54 | * state. | ||
55 | */ | ||
56 | |||
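To make the table above concrete, here is a minimal sketch of unpacking the owner word. The RT_MUTEX_OWNER_PENDING, RT_MUTEX_HAS_WAITERS and RT_MUTEX_OWNER_MASKALL masks are the ones defined in rtmutex_common.h further down; the helper itself is hypothetical and not part of the patch:

/* Hypothetical debugging aid, not part of the patch. */
static inline void rt_mutex_show_owner_word(struct rt_mutex *lock)
{
        unsigned long val = (unsigned long)lock->owner;
        struct task_struct *owner =
                (struct task_struct *)(val & ~RT_MUTEX_OWNER_MASKALL);

        printk("owner: %p, pending (bit 0): %lu, has waiters (bit 1): %lu\n",
               owner,
               val & RT_MUTEX_OWNER_PENDING,
               (val & RT_MUTEX_HAS_WAITERS) >> 1);
}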
57 | static void | ||
58 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, | ||
59 | unsigned long mask) | ||
60 | { | ||
61 | unsigned long val = (unsigned long)owner | mask; | ||
62 | |||
63 | if (rt_mutex_has_waiters(lock)) | ||
64 | val |= RT_MUTEX_HAS_WAITERS; | ||
65 | |||
66 | lock->owner = (struct task_struct *)val; | ||
67 | } | ||
68 | |||
69 | static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) | ||
70 | { | ||
71 | lock->owner = (struct task_struct *) | ||
72 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
73 | } | ||
74 | |||
75 | static void fixup_rt_mutex_waiters(struct rt_mutex *lock) | ||
76 | { | ||
77 | if (!rt_mutex_has_waiters(lock)) | ||
78 | clear_rt_mutex_waiters(lock); | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * We can speed up the acquire/release, if the architecture | ||
83 | * supports cmpxchg and if there's no debugging state to be set up | ||
84 | */ | ||
85 | #if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) | ||
86 | # define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) | ||
87 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | ||
88 | { | ||
89 | unsigned long owner, *p = (unsigned long *) &lock->owner; | ||
90 | |||
91 | do { | ||
92 | owner = *p; | ||
93 | } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); | ||
94 | } | ||
95 | #else | ||
96 | # define rt_mutex_cmpxchg(l,c,n) (0) | ||
97 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | ||
98 | { | ||
99 | lock->owner = (struct task_struct *) | ||
100 | ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); | ||
101 | } | ||
102 | #endif | ||
103 | |||
104 | /* | ||
105 | * Calculate task priority from the waiter list priority | ||
106 | * | ||
107 | * Return task->normal_prio when the waiter list is empty or when | ||
108 | * the waiter is not allowed to do priority boosting | ||
109 | */ | ||
110 | int rt_mutex_getprio(struct task_struct *task) | ||
111 | { | ||
112 | if (likely(!task_has_pi_waiters(task))) | ||
113 | return task->normal_prio; | ||
114 | |||
115 | return min(task_top_pi_waiter(task)->pi_list_entry.prio, | ||
116 | task->normal_prio); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Adjust the priority of a task, after its pi_waiters got modified. | ||
121 | * | ||
122 | * This can be both boosting and unboosting. task->pi_lock must be held. | ||
123 | */ | ||
124 | static void __rt_mutex_adjust_prio(struct task_struct *task) | ||
125 | { | ||
126 | int prio = rt_mutex_getprio(task); | ||
127 | |||
128 | if (task->prio != prio) | ||
129 | rt_mutex_setprio(task, prio); | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Adjust task priority (undo boosting). Called from the exit path of | ||
134 | * rt_mutex_slowunlock() and rt_mutex_slowlock(). | ||
135 | * | ||
136 | * (Note: We do this outside of the protection of lock->wait_lock to | ||
137 | * allow the lock to be taken while or before we readjust the priority | ||
138 | * of task. We do not use the spin_xx_mutex() variants here as we are | ||
139 | * outside of the debug path.) | ||
140 | */ | ||
141 | static void rt_mutex_adjust_prio(struct task_struct *task) | ||
142 | { | ||
143 | unsigned long flags; | ||
144 | |||
145 | spin_lock_irqsave(&task->pi_lock, flags); | ||
146 | __rt_mutex_adjust_prio(task); | ||
147 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
148 | } | ||
149 | |||
150 | /* | ||
151 | * Max number of times we'll walk the boosting chain: | ||
152 | */ | ||
153 | int max_lock_depth = 1024; | ||
154 | |||
155 | /* | ||
156 | * Adjust the priority chain. Also used for deadlock detection. | ||
157 | * Decreases task's usage by one - may thus free the task. | ||
158 | * Returns 0 or -EDEADLK. | ||
159 | */ | ||
160 | static int rt_mutex_adjust_prio_chain(struct task_struct *task, | ||
161 | int deadlock_detect, | ||
162 | struct rt_mutex *orig_lock, | ||
163 | struct rt_mutex_waiter *orig_waiter, | ||
164 | struct task_struct *top_task) | ||
165 | { | ||
166 | struct rt_mutex *lock; | ||
167 | struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; | ||
168 | int detect_deadlock, ret = 0, depth = 0; | ||
169 | unsigned long flags; | ||
170 | |||
171 | detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, | ||
172 | deadlock_detect); | ||
173 | |||
174 | /* | ||
175 | * The (de)boosting is a step by step approach with a lot of | ||
176 | * pitfalls. We want this to be preemptible and we want to hold a | ||
177 | * maximum of two locks per step. So we have to check | ||
178 | * carefully whether things change under us. | ||
179 | */ | ||
180 | again: | ||
181 | if (++depth > max_lock_depth) { | ||
182 | static int prev_max; | ||
183 | |||
184 | /* | ||
185 | * Print this only once. If the admin changes the limit, | ||
186 | * print a new message when reaching the limit again. | ||
187 | */ | ||
188 | if (prev_max != max_lock_depth) { | ||
189 | prev_max = max_lock_depth; | ||
190 | printk(KERN_WARNING "Maximum lock depth %d reached " | ||
191 | "task: %s (%d)\n", max_lock_depth, | ||
192 | top_task->comm, top_task->pid); | ||
193 | } | ||
194 | put_task_struct(task); | ||
195 | |||
196 | return deadlock_detect ? -EDEADLK : 0; | ||
197 | } | ||
198 | retry: | ||
199 | /* | ||
200 | * The task cannot go away as we did a get_task_struct() before! | ||
201 | */ | ||
202 | spin_lock_irqsave(&task->pi_lock, flags); | ||
203 | |||
204 | waiter = task->pi_blocked_on; | ||
205 | /* | ||
206 | * Check whether the end of the boosting chain has been | ||
207 | * reached or the state of the chain has changed while we | ||
208 | * dropped the locks. | ||
209 | */ | ||
210 | if (!waiter || !waiter->task) | ||
211 | goto out_unlock_pi; | ||
212 | |||
213 | if (top_waiter && (!task_has_pi_waiters(task) || | ||
214 | top_waiter != task_top_pi_waiter(task))) | ||
215 | goto out_unlock_pi; | ||
216 | |||
217 | /* | ||
218 | * When deadlock detection is off, we check whether further | ||
219 | * priority adjustment is necessary. | ||
220 | */ | ||
221 | if (!detect_deadlock && waiter->list_entry.prio == task->prio) | ||
222 | goto out_unlock_pi; | ||
223 | |||
224 | lock = waiter->lock; | ||
225 | if (!spin_trylock(&lock->wait_lock)) { | ||
226 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
227 | cpu_relax(); | ||
228 | goto retry; | ||
229 | } | ||
230 | |||
231 | /* Deadlock detection */ | ||
232 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { | ||
233 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); | ||
234 | spin_unlock(&lock->wait_lock); | ||
235 | ret = deadlock_detect ? -EDEADLK : 0; | ||
236 | goto out_unlock_pi; | ||
237 | } | ||
238 | |||
239 | top_waiter = rt_mutex_top_waiter(lock); | ||
240 | |||
241 | /* Requeue the waiter */ | ||
242 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
243 | waiter->list_entry.prio = task->prio; | ||
244 | plist_add(&waiter->list_entry, &lock->wait_list); | ||
245 | |||
246 | /* Release the task */ | ||
247 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
248 | put_task_struct(task); | ||
249 | |||
250 | /* Grab the next task */ | ||
251 | task = rt_mutex_owner(lock); | ||
252 | spin_lock_irqsave(&task->pi_lock, flags); | ||
253 | |||
254 | if (waiter == rt_mutex_top_waiter(lock)) { | ||
255 | /* Boost the owner */ | ||
256 | plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); | ||
257 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | ||
258 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
259 | __rt_mutex_adjust_prio(task); | ||
260 | |||
261 | } else if (top_waiter == waiter) { | ||
262 | /* Deboost the owner */ | ||
263 | plist_del(&waiter->pi_list_entry, &task->pi_waiters); | ||
264 | waiter = rt_mutex_top_waiter(lock); | ||
265 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | ||
266 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
267 | __rt_mutex_adjust_prio(task); | ||
268 | } | ||
269 | |||
270 | get_task_struct(task); | ||
271 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
272 | |||
273 | top_waiter = rt_mutex_top_waiter(lock); | ||
274 | spin_unlock(&lock->wait_lock); | ||
275 | |||
276 | if (!detect_deadlock && waiter != top_waiter) | ||
277 | goto out_put_task; | ||
278 | |||
279 | goto again; | ||
280 | |||
281 | out_unlock_pi: | ||
282 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
283 | out_put_task: | ||
284 | put_task_struct(task); | ||
285 | |||
286 | return ret; | ||
287 | } | ||
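A worked example may help; the scenario below is illustrative only and uses the kernel convention that a lower prio value means higher priority:

/*
 * Illustrative scenario: T1 (prio 10) blocks on L1, owned by T2 (prio 30),
 * and T2 is itself blocked on L2, owned by T3 (prio 40).
 * task_blocks_on_rt_mutex() boosts T2 to prio 10 and then starts the walk
 * above with task = T2.  The walk requeues T2's waiter on L2 with the new
 * priority, boosts T3 to prio 10 as well, and keeps going until it finds a
 * task that is not blocked, until no further adjustment is needed, or,
 * with deadlock detection enabled, until it runs into orig_lock again and
 * returns -EDEADLK.
 */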
288 | |||
289 | /* | ||
290 | * Optimization: check if we can steal the lock from the | ||
291 | * assigned pending owner [which might not have taken the | ||
292 | * lock yet]: | ||
293 | */ | ||
294 | static inline int try_to_steal_lock(struct rt_mutex *lock) | ||
295 | { | ||
296 | struct task_struct *pendowner = rt_mutex_owner(lock); | ||
297 | struct rt_mutex_waiter *next; | ||
298 | unsigned long flags; | ||
299 | |||
300 | if (!rt_mutex_owner_pending(lock)) | ||
301 | return 0; | ||
302 | |||
303 | if (pendowner == current) | ||
304 | return 1; | ||
305 | |||
306 | spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
307 | if (current->prio >= pendowner->prio) { | ||
308 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * Check if a waiter is enqueued on the pending owner's | ||
314 | * pi_waiters list. Remove it and readjust the pending owner's | ||
315 | * priority. | ||
316 | */ | ||
317 | if (likely(!rt_mutex_has_waiters(lock))) { | ||
318 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
319 | return 1; | ||
320 | } | ||
321 | |||
322 | /* No chain handling, pending owner is not blocked on anything: */ | ||
323 | next = rt_mutex_top_waiter(lock); | ||
324 | plist_del(&next->pi_list_entry, &pendowner->pi_waiters); | ||
325 | __rt_mutex_adjust_prio(pendowner); | ||
326 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
327 | |||
328 | /* | ||
329 | * We are going to steal the lock and a waiter was | ||
330 | * enqueued on the pending owner's pi_waiters queue. So | ||
331 | * we have to enqueue this waiter into the | ||
332 | * current->pi_waiters list. This covers the case | ||
333 | * where current is boosted because it holds another | ||
334 | * lock and gets unboosted because the booster is | ||
335 | * interrupted; otherwise we would delay a waiter with a | ||
336 | * higher priority than current->normal_prio. | ||
337 | * | ||
338 | * Note: in the rare case of a SCHED_OTHER task changing | ||
339 | * its priority and thus stealing the lock, next->task | ||
340 | * might be current: | ||
341 | */ | ||
342 | if (likely(next->task != current)) { | ||
343 | spin_lock_irqsave(&current->pi_lock, flags); | ||
344 | plist_add(&next->pi_list_entry, &current->pi_waiters); | ||
345 | __rt_mutex_adjust_prio(current); | ||
346 | spin_unlock_irqrestore(&current->pi_lock, flags); | ||
347 | } | ||
348 | return 1; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * Try to take an rt-mutex | ||
353 | * | ||
354 | * This fails | ||
355 | * - when the lock has a real owner | ||
356 | * - when a different pending owner exists and has higher priority than current | ||
357 | * | ||
358 | * Must be called with lock->wait_lock held. | ||
359 | */ | ||
360 | static int try_to_take_rt_mutex(struct rt_mutex *lock) | ||
361 | { | ||
362 | /* | ||
363 | * We have to be careful here if the atomic speedups are | ||
364 | * enabled: when | ||
365 | * - no other waiter is on the lock, and | ||
366 | * - the lock has been released since we did the cmpxchg, | ||
367 | * the lock can be released or taken while we are doing the | ||
368 | * checks and marking the lock with RT_MUTEX_HAS_WAITERS. | ||
369 | * | ||
370 | * The atomic acquire/release aware variant of | ||
371 | * mark_rt_mutex_waiters uses a cmpxchg loop. After setting | ||
372 | * the WAITERS bit, the atomic release / acquire can not | ||
373 | * happen anymore and lock->wait_lock protects us from the | ||
374 | * non-atomic case. | ||
375 | * | ||
376 | * Note, that this might set lock->owner = | ||
377 | * RT_MUTEX_HAS_WAITERS in the case the lock is not contended | ||
378 | * any more. This is fixed up when we take the ownership. | ||
379 | * This is the transitional state explained at the top of this file. | ||
380 | */ | ||
381 | mark_rt_mutex_waiters(lock); | ||
382 | |||
383 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) | ||
384 | return 0; | ||
385 | |||
386 | /* We got the lock. */ | ||
387 | debug_rt_mutex_lock(lock); | ||
388 | |||
389 | rt_mutex_set_owner(lock, current, 0); | ||
390 | |||
391 | rt_mutex_deadlock_account_lock(lock, current); | ||
392 | |||
393 | return 1; | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * Task blocks on lock. | ||
398 | * | ||
399 | * Prepare waiter and propagate pi chain | ||
400 | * | ||
401 | * This must be called with lock->wait_lock held. | ||
402 | */ | ||
403 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | ||
404 | struct rt_mutex_waiter *waiter, | ||
405 | int detect_deadlock) | ||
406 | { | ||
407 | struct task_struct *owner = rt_mutex_owner(lock); | ||
408 | struct rt_mutex_waiter *top_waiter = waiter; | ||
409 | unsigned long flags; | ||
410 | int boost = 0, res; | ||
411 | |||
412 | spin_lock_irqsave(&current->pi_lock, flags); | ||
413 | __rt_mutex_adjust_prio(current); | ||
414 | waiter->task = current; | ||
415 | waiter->lock = lock; | ||
416 | plist_node_init(&waiter->list_entry, current->prio); | ||
417 | plist_node_init(&waiter->pi_list_entry, current->prio); | ||
418 | |||
419 | /* Get the top priority waiter on the lock */ | ||
420 | if (rt_mutex_has_waiters(lock)) | ||
421 | top_waiter = rt_mutex_top_waiter(lock); | ||
422 | plist_add(&waiter->list_entry, &lock->wait_list); | ||
423 | |||
424 | current->pi_blocked_on = waiter; | ||
425 | |||
426 | spin_unlock_irqrestore(&current->pi_lock, flags); | ||
427 | |||
428 | if (waiter == rt_mutex_top_waiter(lock)) { | ||
429 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
430 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | ||
431 | plist_add(&waiter->pi_list_entry, &owner->pi_waiters); | ||
432 | |||
433 | __rt_mutex_adjust_prio(owner); | ||
434 | if (owner->pi_blocked_on) { | ||
435 | boost = 1; | ||
436 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
437 | get_task_struct(owner); | ||
438 | } | ||
439 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
440 | } | ||
441 | else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { | ||
442 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
443 | if (owner->pi_blocked_on) { | ||
444 | boost = 1; | ||
445 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
446 | get_task_struct(owner); | ||
447 | } | ||
448 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
449 | } | ||
450 | if (!boost) | ||
451 | return 0; | ||
452 | |||
453 | spin_unlock(&lock->wait_lock); | ||
454 | |||
455 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, | ||
456 | current); | ||
457 | |||
458 | spin_lock(&lock->wait_lock); | ||
459 | |||
460 | return res; | ||
461 | } | ||
462 | |||
463 | /* | ||
464 | * Wake up the next waiter on the lock. | ||
465 | * | ||
466 | * Remove the top waiter from the current task's waiter list and from | ||
467 | * the lock waiter list. Set it as pending owner. Then wake it up. | ||
468 | * | ||
469 | * Called with lock->wait_lock held. | ||
470 | */ | ||
471 | static void wakeup_next_waiter(struct rt_mutex *lock) | ||
472 | { | ||
473 | struct rt_mutex_waiter *waiter; | ||
474 | struct task_struct *pendowner; | ||
475 | unsigned long flags; | ||
476 | |||
477 | spin_lock_irqsave(&current->pi_lock, flags); | ||
478 | |||
479 | waiter = rt_mutex_top_waiter(lock); | ||
480 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
481 | |||
482 | /* | ||
483 | * Remove it from current->pi_waiters. We do not adjust a | ||
484 | * possible priority boost right now. We execute wakeup in the | ||
485 | * boosted mode and go back to normal after releasing | ||
486 | * lock->wait_lock. | ||
487 | */ | ||
488 | plist_del(&waiter->pi_list_entry, &current->pi_waiters); | ||
489 | pendowner = waiter->task; | ||
490 | waiter->task = NULL; | ||
491 | |||
492 | rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); | ||
493 | |||
494 | spin_unlock_irqrestore(&current->pi_lock, flags); | ||
495 | |||
496 | /* | ||
497 | * Clear the pi_blocked_on variable and enqueue a possible | ||
498 | * waiter into the pi_waiters list of the pending owner. This | ||
499 | * prevents that, in case the pending owner gets unboosted, a | ||
500 | * waiter with a higher priority than pending-owner->normal_prio | ||
501 | * is left blocked on the unboosted (pending) owner. | ||
502 | */ | ||
503 | spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
504 | |||
505 | WARN_ON(!pendowner->pi_blocked_on); | ||
506 | WARN_ON(pendowner->pi_blocked_on != waiter); | ||
507 | WARN_ON(pendowner->pi_blocked_on->lock != lock); | ||
508 | |||
509 | pendowner->pi_blocked_on = NULL; | ||
510 | |||
511 | if (rt_mutex_has_waiters(lock)) { | ||
512 | struct rt_mutex_waiter *next; | ||
513 | |||
514 | next = rt_mutex_top_waiter(lock); | ||
515 | plist_add(&next->pi_list_entry, &pendowner->pi_waiters); | ||
516 | } | ||
517 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
518 | |||
519 | wake_up_process(pendowner); | ||
520 | } | ||
521 | |||
522 | /* | ||
523 | * Remove a waiter from a lock | ||
524 | * | ||
525 | * Must be called with lock->wait_lock held | ||
526 | */ | ||
527 | static void remove_waiter(struct rt_mutex *lock, | ||
528 | struct rt_mutex_waiter *waiter) | ||
529 | { | ||
530 | int first = (waiter == rt_mutex_top_waiter(lock)); | ||
531 | struct task_struct *owner = rt_mutex_owner(lock); | ||
532 | unsigned long flags; | ||
533 | int boost = 0; | ||
534 | |||
535 | spin_lock_irqsave(&current->pi_lock, flags); | ||
536 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
537 | waiter->task = NULL; | ||
538 | current->pi_blocked_on = NULL; | ||
539 | spin_unlock_irqrestore(&current->pi_lock, flags); | ||
540 | |||
541 | if (first && owner != current) { | ||
542 | |||
543 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
544 | |||
545 | plist_del(&waiter->pi_list_entry, &owner->pi_waiters); | ||
546 | |||
547 | if (rt_mutex_has_waiters(lock)) { | ||
548 | struct rt_mutex_waiter *next; | ||
549 | |||
550 | next = rt_mutex_top_waiter(lock); | ||
551 | plist_add(&next->pi_list_entry, &owner->pi_waiters); | ||
552 | } | ||
553 | __rt_mutex_adjust_prio(owner); | ||
554 | |||
555 | if (owner->pi_blocked_on) { | ||
556 | boost = 1; | ||
557 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
558 | get_task_struct(owner); | ||
559 | } | ||
560 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
561 | } | ||
562 | |||
563 | WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
564 | |||
565 | if (!boost) | ||
566 | return; | ||
567 | |||
568 | spin_unlock(&lock->wait_lock); | ||
569 | |||
570 | rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); | ||
571 | |||
572 | spin_lock(&lock->wait_lock); | ||
573 | } | ||
574 | |||
575 | /* | ||
576 | * Recheck the pi chain, in case we got a priority setting | ||
577 | * | ||
578 | * Called from sched_setscheduler | ||
579 | */ | ||
580 | void rt_mutex_adjust_pi(struct task_struct *task) | ||
581 | { | ||
582 | struct rt_mutex_waiter *waiter; | ||
583 | unsigned long flags; | ||
584 | |||
585 | spin_lock_irqsave(&task->pi_lock, flags); | ||
586 | |||
587 | waiter = task->pi_blocked_on; | ||
588 | if (!waiter || waiter->list_entry.prio == task->prio) { | ||
589 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
590 | return; | ||
591 | } | ||
592 | |||
593 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
594 | get_task_struct(task); | ||
595 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
596 | |||
597 | rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); | ||
598 | } | ||
599 | |||
600 | /* | ||
601 | * Slow path lock function: | ||
602 | */ | ||
603 | static int __sched | ||
604 | rt_mutex_slowlock(struct rt_mutex *lock, int state, | ||
605 | struct hrtimer_sleeper *timeout, | ||
606 | int detect_deadlock) | ||
607 | { | ||
608 | struct rt_mutex_waiter waiter; | ||
609 | int ret = 0; | ||
610 | |||
611 | debug_rt_mutex_init_waiter(&waiter); | ||
612 | waiter.task = NULL; | ||
613 | |||
614 | spin_lock(&lock->wait_lock); | ||
615 | |||
616 | /* Try to acquire the lock again: */ | ||
617 | if (try_to_take_rt_mutex(lock)) { | ||
618 | spin_unlock(&lock->wait_lock); | ||
619 | return 0; | ||
620 | } | ||
621 | |||
622 | set_current_state(state); | ||
623 | |||
624 | /* Setup the timer, when timeout != NULL */ | ||
625 | if (unlikely(timeout)) | ||
626 | hrtimer_start(&timeout->timer, timeout->timer.expires, | ||
627 | HRTIMER_ABS); | ||
628 | |||
629 | for (;;) { | ||
630 | /* Try to acquire the lock: */ | ||
631 | if (try_to_take_rt_mutex(lock)) | ||
632 | break; | ||
633 | |||
634 | /* | ||
635 | * TASK_INTERRUPTIBLE checks for signals and | ||
636 | * timeout. Ignored otherwise. | ||
637 | */ | ||
638 | if (unlikely(state == TASK_INTERRUPTIBLE)) { | ||
639 | /* Signal pending? */ | ||
640 | if (signal_pending(current)) | ||
641 | ret = -EINTR; | ||
642 | if (timeout && !timeout->task) | ||
643 | ret = -ETIMEDOUT; | ||
644 | if (ret) | ||
645 | break; | ||
646 | } | ||
647 | |||
648 | /* | ||
649 | * waiter.task is NULL the first time we come here and | ||
650 | * when we have been woken up by the previous owner | ||
651 | * but the lock got stolen by a higher prio task. | ||
652 | */ | ||
653 | if (!waiter.task) { | ||
654 | ret = task_blocks_on_rt_mutex(lock, &waiter, | ||
655 | detect_deadlock); | ||
656 | /* | ||
657 | * If we got woken up by the owner then start loop | ||
658 | * all over without going into schedule to try | ||
659 | * to get the lock now: | ||
660 | */ | ||
661 | if (unlikely(!waiter.task)) | ||
662 | continue; | ||
663 | |||
664 | if (unlikely(ret)) | ||
665 | break; | ||
666 | } | ||
667 | |||
668 | spin_unlock(&lock->wait_lock); | ||
669 | |||
670 | debug_rt_mutex_print_deadlock(&waiter); | ||
671 | |||
672 | if (waiter.task) | ||
673 | schedule_rt_mutex(lock); | ||
674 | |||
675 | spin_lock(&lock->wait_lock); | ||
676 | set_current_state(state); | ||
677 | } | ||
678 | |||
679 | set_current_state(TASK_RUNNING); | ||
680 | |||
681 | if (unlikely(waiter.task)) | ||
682 | remove_waiter(lock, &waiter); | ||
683 | |||
684 | /* | ||
685 | * try_to_take_rt_mutex() sets the waiter bit | ||
686 | * unconditionally. We might have to fix that up. | ||
687 | */ | ||
688 | fixup_rt_mutex_waiters(lock); | ||
689 | |||
690 | spin_unlock(&lock->wait_lock); | ||
691 | |||
692 | /* Remove pending timer: */ | ||
693 | if (unlikely(timeout)) | ||
694 | hrtimer_cancel(&timeout->timer); | ||
695 | |||
696 | /* | ||
697 | * Readjust priority, when we did not get the lock. We might | ||
698 | * have been the pending owner and boosted. Since we did not | ||
699 | * take the lock, the PI boost has to go. | ||
700 | */ | ||
701 | if (unlikely(ret)) | ||
702 | rt_mutex_adjust_prio(current); | ||
703 | |||
704 | debug_rt_mutex_free_waiter(&waiter); | ||
705 | |||
706 | return ret; | ||
707 | } | ||
708 | |||
709 | /* | ||
710 | * Slow path try-lock function: | ||
711 | */ | ||
712 | static inline int | ||
713 | rt_mutex_slowtrylock(struct rt_mutex *lock) | ||
714 | { | ||
715 | int ret = 0; | ||
716 | |||
717 | spin_lock(&lock->wait_lock); | ||
718 | |||
719 | if (likely(rt_mutex_owner(lock) != current)) { | ||
720 | |||
721 | ret = try_to_take_rt_mutex(lock); | ||
722 | /* | ||
723 | * try_to_take_rt_mutex() sets the lock waiters | ||
724 | * bit unconditionally. Clean this up. | ||
725 | */ | ||
726 | fixup_rt_mutex_waiters(lock); | ||
727 | } | ||
728 | |||
729 | spin_unlock(&lock->wait_lock); | ||
730 | |||
731 | return ret; | ||
732 | } | ||
733 | |||
734 | /* | ||
735 | * Slow path to release a rt-mutex: | ||
736 | */ | ||
737 | static void __sched | ||
738 | rt_mutex_slowunlock(struct rt_mutex *lock) | ||
739 | { | ||
740 | spin_lock(&lock->wait_lock); | ||
741 | |||
742 | debug_rt_mutex_unlock(lock); | ||
743 | |||
744 | rt_mutex_deadlock_account_unlock(current); | ||
745 | |||
746 | if (!rt_mutex_has_waiters(lock)) { | ||
747 | lock->owner = NULL; | ||
748 | spin_unlock(&lock->wait_lock); | ||
749 | return; | ||
750 | } | ||
751 | |||
752 | wakeup_next_waiter(lock); | ||
753 | |||
754 | spin_unlock(&lock->wait_lock); | ||
755 | |||
756 | /* Undo pi boosting if necessary: */ | ||
757 | rt_mutex_adjust_prio(current); | ||
758 | } | ||
759 | |||
760 | /* | ||
761 | * debug aware fast / slowpath lock, trylock and unlock functions | ||
762 | * | ||
763 | * The atomic acquire/release ops are compiled away when either the | ||
764 | * architecture does not support cmpxchg or when debugging is enabled. | ||
765 | */ | ||
766 | static inline int | ||
767 | rt_mutex_fastlock(struct rt_mutex *lock, int state, | ||
768 | int detect_deadlock, | ||
769 | int (*slowfn)(struct rt_mutex *lock, int state, | ||
770 | struct hrtimer_sleeper *timeout, | ||
771 | int detect_deadlock)) | ||
772 | { | ||
773 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
774 | rt_mutex_deadlock_account_lock(lock, current); | ||
775 | return 0; | ||
776 | } else | ||
777 | return slowfn(lock, state, NULL, detect_deadlock); | ||
778 | } | ||
779 | |||
780 | static inline int | ||
781 | rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, | ||
782 | struct hrtimer_sleeper *timeout, int detect_deadlock, | ||
783 | int (*slowfn)(struct rt_mutex *lock, int state, | ||
784 | struct hrtimer_sleeper *timeout, | ||
785 | int detect_deadlock)) | ||
786 | { | ||
787 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
788 | rt_mutex_deadlock_account_lock(lock, current); | ||
789 | return 0; | ||
790 | } else | ||
791 | return slowfn(lock, state, timeout, detect_deadlock); | ||
792 | } | ||
793 | |||
794 | static inline int | ||
795 | rt_mutex_fasttrylock(struct rt_mutex *lock, | ||
796 | int (*slowfn)(struct rt_mutex *lock)) | ||
797 | { | ||
798 | if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
799 | rt_mutex_deadlock_account_lock(lock, current); | ||
800 | return 1; | ||
801 | } | ||
802 | return slowfn(lock); | ||
803 | } | ||
804 | |||
805 | static inline void | ||
806 | rt_mutex_fastunlock(struct rt_mutex *lock, | ||
807 | void (*slowfn)(struct rt_mutex *lock)) | ||
808 | { | ||
809 | if (likely(rt_mutex_cmpxchg(lock, current, NULL))) | ||
810 | rt_mutex_deadlock_account_unlock(current); | ||
811 | else | ||
812 | slowfn(lock); | ||
813 | } | ||
814 | |||
815 | /** | ||
816 | * rt_mutex_lock - lock a rt_mutex | ||
817 | * | ||
818 | * @lock: the rt_mutex to be locked | ||
819 | */ | ||
820 | void __sched rt_mutex_lock(struct rt_mutex *lock) | ||
821 | { | ||
822 | might_sleep(); | ||
823 | |||
824 | rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); | ||
825 | } | ||
826 | EXPORT_SYMBOL_GPL(rt_mutex_lock); | ||
827 | |||
828 | /** | ||
829 | * rt_mutex_lock_interruptible - lock a rt_mutex interruptible | ||
830 | * | ||
831 | * @lock: the rt_mutex to be locked | ||
832 | * @detect_deadlock: deadlock detection on/off | ||
833 | * | ||
834 | * Returns: | ||
835 | * 0 on success | ||
836 | * -EINTR when interrupted by a signal | ||
837 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
838 | */ | ||
839 | int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, | ||
840 | int detect_deadlock) | ||
841 | { | ||
842 | might_sleep(); | ||
843 | |||
844 | return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, | ||
845 | detect_deadlock, rt_mutex_slowlock); | ||
846 | } | ||
847 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); | ||
848 | |||
849 | /** | ||
850 | * rt_mutex_timed_lock - lock a rt_mutex interruptible | ||
851 | * the timeout structure is provided | ||
852 | * by the caller | ||
853 | * | ||
854 | * @lock: the rt_mutex to be locked | ||
855 | * @timeout: timeout structure or NULL (no timeout) | ||
856 | * @detect_deadlock: deadlock detection on/off | ||
857 | * | ||
858 | * Returns: | ||
859 | * 0 on success | ||
860 | * -EINTR when interrupted by a signal | ||
861 | * -ETIMEDOUT when the timeout expired | ||
862 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
863 | */ | ||
864 | int | ||
865 | rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, | ||
866 | int detect_deadlock) | ||
867 | { | ||
868 | might_sleep(); | ||
869 | |||
870 | return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | ||
871 | detect_deadlock, rt_mutex_slowlock); | ||
872 | } | ||
873 | EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | ||
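A hedged usage sketch: the caller owns the hrtimer_sleeper and programs an absolute expiry before calling rt_mutex_timed_lock(). The helper below is hypothetical and the exact ktime/hrtimer setup helpers may differ slightly between trees:

/* Hypothetical example, not part of the patch: take 'lock', but give up
 * after roughly 100ms or on a signal. */
static int demo_timed_lock(struct rt_mutex *lock)
{
        struct hrtimer_sleeper timeout;

        hrtimer_init(&timeout.timer, CLOCK_MONOTONIC, HRTIMER_ABS);
        hrtimer_init_sleeper(&timeout, current);
        /* absolute expiry: now + 100ms (assumed ktime helpers) */
        timeout.timer.expires = ktime_add_ns(ktime_get(), 100 * NSEC_PER_MSEC);

        /* last argument 0: no deadlock detection;
         * returns 0, -EINTR or -ETIMEDOUT */
        return rt_mutex_timed_lock(lock, &timeout, 0);
}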
874 | |||
875 | /** | ||
876 | * rt_mutex_trylock - try to lock a rt_mutex | ||
877 | * | ||
878 | * @lock: the rt_mutex to be locked | ||
879 | * | ||
880 | * Returns 1 on success and 0 on contention | ||
881 | */ | ||
882 | int __sched rt_mutex_trylock(struct rt_mutex *lock) | ||
883 | { | ||
884 | return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); | ||
885 | } | ||
886 | EXPORT_SYMBOL_GPL(rt_mutex_trylock); | ||
887 | |||
888 | /** | ||
889 | * rt_mutex_unlock - unlock a rt_mutex | ||
890 | * | ||
891 | * @lock: the rt_mutex to be unlocked | ||
892 | */ | ||
893 | void __sched rt_mutex_unlock(struct rt_mutex *lock) | ||
894 | { | ||
895 | rt_mutex_fastunlock(lock, rt_mutex_slowunlock); | ||
896 | } | ||
897 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); | ||
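For context, a minimal usage sketch of the API exported above. demo_lock and demo() are hypothetical; rt_mutex_init() is the initializer already used by the tester earlier in this patch, and rt_mutex_destroy() is defined just below:

/* Hypothetical illustration, not part of the patch. */
static struct rt_mutex demo_lock;

static int demo(void)
{
        int ret;

        rt_mutex_init(&demo_lock);              /* must not be locked yet */

        rt_mutex_lock(&demo_lock);              /* may sleep, PI-boosts the owner */
        /* ... critical section ... */
        rt_mutex_unlock(&demo_lock);

        ret = rt_mutex_lock_interruptible(&demo_lock, 0);
        if (ret)
                return ret;                     /* -EINTR when interrupted */
        rt_mutex_unlock(&demo_lock);

        if (rt_mutex_trylock(&demo_lock)) {     /* 1 on success, 0 on contention */
                rt_mutex_unlock(&demo_lock);
        }

        rt_mutex_destroy(&demo_lock);
        return 0;
}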
898 | |||
899 | /** | ||
900 | * rt_mutex_destroy - mark a mutex unusable | ||
901 | * @lock: the mutex to be destroyed | ||
902 | * | ||
903 | * This function marks the mutex uninitialized, and any subsequent | ||
904 | * use of the mutex is forbidden. The mutex must not be locked when | ||
905 | * this function is called. | ||
906 | */ | ||
907 | void rt_mutex_destroy(struct rt_mutex *lock) | ||
908 | { | ||
909 | WARN_ON(rt_mutex_is_locked(lock)); | ||
910 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
911 | lock->magic = NULL; | ||
912 | #endif | ||
913 | } | ||
914 | |||
915 | EXPORT_SYMBOL_GPL(rt_mutex_destroy); | ||
916 | |||
917 | /** | ||
918 | * __rt_mutex_init - initialize the rt lock | ||
919 | * | ||
920 | * @lock: the rt lock to be initialized | ||
921 | * | ||
922 | * Initialize the rt lock to unlocked state. | ||
923 | * | ||
924 | * Initializing a locked rt lock is not allowed. | ||
925 | */ | ||
926 | void __rt_mutex_init(struct rt_mutex *lock, const char *name) | ||
927 | { | ||
928 | lock->owner = NULL; | ||
929 | spin_lock_init(&lock->wait_lock); | ||
930 | plist_head_init(&lock->wait_list, &lock->wait_lock); | ||
931 | |||
932 | debug_rt_mutex_init(lock, name); | ||
933 | } | ||
934 | EXPORT_SYMBOL_GPL(__rt_mutex_init); | ||
935 | |||
936 | /** | ||
937 | * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a | ||
938 | * proxy owner | ||
939 | * | ||
940 | * @lock: the rt_mutex to be locked | ||
941 | * @proxy_owner: the task to set as owner | ||
942 | * | ||
943 | * No locking. Caller has to do the serializing itself. | ||
944 | * Special API call for PI-futex support | ||
945 | */ | ||
946 | void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | ||
947 | struct task_struct *proxy_owner) | ||
948 | { | ||
949 | __rt_mutex_init(lock, NULL); | ||
950 | debug_rt_mutex_proxy_lock(lock, proxy_owner); | ||
951 | rt_mutex_set_owner(lock, proxy_owner, 0); | ||
952 | rt_mutex_deadlock_account_lock(lock, proxy_owner); | ||
953 | } | ||
954 | |||
955 | /** | ||
956 | * rt_mutex_proxy_unlock - release a lock on behalf of owner | ||
957 | * | ||
958 | * @lock: the rt_mutex to be unlocked | ||
959 | * | ||
960 | * No locking. Caller has to do the serializing itself. | ||
961 | * Special API call for PI-futex support | ||
962 | */ | ||
963 | void rt_mutex_proxy_unlock(struct rt_mutex *lock, | ||
964 | struct task_struct *proxy_owner) | ||
965 | { | ||
966 | debug_rt_mutex_proxy_unlock(lock); | ||
967 | rt_mutex_set_owner(lock, NULL, 0); | ||
968 | rt_mutex_deadlock_account_unlock(proxy_owner); | ||
969 | } | ||
970 | |||
971 | /** | ||
972 | * rt_mutex_next_owner - return the next owner of the lock | ||
973 | * | ||
974 | * @lock: the rt lock to query | ||
975 | * | ||
976 | * Returns the next owner of the lock or NULL | ||
977 | * | ||
978 | * Caller has to serialize against other accessors to the lock | ||
979 | * itself. | ||
980 | * | ||
981 | * Special API call for PI-futex support | ||
982 | */ | ||
983 | struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) | ||
984 | { | ||
985 | if (!rt_mutex_has_waiters(lock)) | ||
986 | return NULL; | ||
987 | |||
988 | return rt_mutex_top_waiter(lock)->task; | ||
989 | } | ||
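A sketch of how the proxy calls fit together; this is modelled loosely on what a PI-futex style caller would do and is not taken from the actual futex code:

/* Hypothetical illustration, not part of the patch. */
static void demo_proxy(struct rt_mutex *lock, struct task_struct *new_owner)
{
        /* Hand the rtmutex to new_owner without new_owner running: */
        rt_mutex_init_proxy_locked(lock, new_owner);

        /*
         * From here on, tasks blocking on 'lock' boost new_owner via the
         * normal PI machinery; rt_mutex_next_owner(lock) reports the top
         * waiter, or NULL when nobody is queued.
         */

        /* Give the lock back on behalf of new_owner (error/cleanup path): */
        rt_mutex_proxy_unlock(lock, new_owner);
}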
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h new file mode 100644 index 000000000000..a1a1dd06421d --- /dev/null +++ b/kernel/rtmutex.h | |||
@@ -0,0 +1,26 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains macros used solely by rtmutex.c. | ||
10 | * Non-debug version. | ||
11 | */ | ||
12 | |||
13 | #define rt_mutex_deadlock_check(l) (0) | ||
14 | #define rt_mutex_deadlock_account_lock(m, t) do { } while (0) | ||
15 | #define rt_mutex_deadlock_account_unlock(l) do { } while (0) | ||
16 | #define debug_rt_mutex_init_waiter(w) do { } while (0) | ||
17 | #define debug_rt_mutex_free_waiter(w) do { } while (0) | ||
18 | #define debug_rt_mutex_lock(l) do { } while (0) | ||
19 | #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) | ||
20 | #define debug_rt_mutex_proxy_unlock(l) do { } while (0) | ||
21 | #define debug_rt_mutex_unlock(l) do { } while (0) | ||
22 | #define debug_rt_mutex_init(m, n) do { } while (0) | ||
23 | #define debug_rt_mutex_deadlock(d, a, l) do { } while (0) | ||
24 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) | ||
25 | #define debug_rt_mutex_detect_deadlock(w,d) (d) | ||
26 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) | ||
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h new file mode 100644 index 000000000000..9c75856e791e --- /dev/null +++ b/kernel/rtmutex_common.h | |||
@@ -0,0 +1,123 @@ | |||
1 | /* | ||
2 | * RT Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains the private data structure and API definitions. | ||
10 | */ | ||
11 | |||
12 | #ifndef __KERNEL_RTMUTEX_COMMON_H | ||
13 | #define __KERNEL_RTMUTEX_COMMON_H | ||
14 | |||
15 | #include <linux/rtmutex.h> | ||
16 | |||
17 | /* | ||
18 | * The rtmutex in-kernel tester is independent of rtmutex debugging. We | ||
19 | * call schedule_rt_mutex_test() instead of schedule() for the tasks which | ||
20 | * belong to the tester. That way we can delay the wakeup path of those | ||
21 | * threads to provoke lock stealing and testing of complex boosting scenarios. | ||
22 | */ | ||
23 | #ifdef CONFIG_RT_MUTEX_TESTER | ||
24 | |||
25 | extern void schedule_rt_mutex_test(struct rt_mutex *lock); | ||
26 | |||
27 | #define schedule_rt_mutex(_lock) \ | ||
28 | do { \ | ||
29 | if (!(current->flags & PF_MUTEX_TESTER)) \ | ||
30 | schedule(); \ | ||
31 | else \ | ||
32 | schedule_rt_mutex_test(_lock); \ | ||
33 | } while (0) | ||
34 | |||
35 | #else | ||
36 | # define schedule_rt_mutex(_lock) schedule() | ||
37 | #endif | ||
38 | |||
39 | /* | ||
40 | * This is the control structure for tasks blocked on an rt_mutex, | ||
41 | * which is allocated on the kernel stack of the blocked task. | ||
42 | * | ||
43 | * @list_entry: pi node to enqueue into the mutex waiters list | ||
44 | * @pi_list_entry: pi node to enqueue into the mutex owner waiters list | ||
45 | * @task: task reference to the blocked task | ||
46 | */ | ||
47 | struct rt_mutex_waiter { | ||
48 | struct plist_node list_entry; | ||
49 | struct plist_node pi_list_entry; | ||
50 | struct task_struct *task; | ||
51 | struct rt_mutex *lock; | ||
52 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
53 | unsigned long ip; | ||
54 | pid_t deadlock_task_pid; | ||
55 | struct rt_mutex *deadlock_lock; | ||
56 | #endif | ||
57 | }; | ||
58 | |||
59 | /* | ||
60 | * Various helpers to access the waiters-plist: | ||
61 | */ | ||
62 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) | ||
63 | { | ||
64 | return !plist_head_empty(&lock->wait_list); | ||
65 | } | ||
66 | |||
67 | static inline struct rt_mutex_waiter * | ||
68 | rt_mutex_top_waiter(struct rt_mutex *lock) | ||
69 | { | ||
70 | struct rt_mutex_waiter *w; | ||
71 | |||
72 | w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, | ||
73 | list_entry); | ||
74 | BUG_ON(w->lock != lock); | ||
75 | |||
76 | return w; | ||
77 | } | ||
78 | |||
79 | static inline int task_has_pi_waiters(struct task_struct *p) | ||
80 | { | ||
81 | return !plist_head_empty(&p->pi_waiters); | ||
82 | } | ||
83 | |||
84 | static inline struct rt_mutex_waiter * | ||
85 | task_top_pi_waiter(struct task_struct *p) | ||
86 | { | ||
87 | return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, | ||
88 | pi_list_entry); | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * lock->owner state tracking: | ||
93 | */ | ||
94 | #define RT_MUTEX_OWNER_PENDING 1UL | ||
95 | #define RT_MUTEX_HAS_WAITERS 2UL | ||
96 | #define RT_MUTEX_OWNER_MASKALL 3UL | ||
97 | |||
98 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | ||
99 | { | ||
100 | return (struct task_struct *) | ||
101 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); | ||
102 | } | ||
103 | |||
104 | static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) | ||
105 | { | ||
106 | return (struct task_struct *) | ||
107 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
108 | } | ||
109 | |||
110 | static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) | ||
111 | { | ||
112 | return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * PI-futex support (proxy locking functions, etc.): | ||
117 | */ | ||
118 | extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); | ||
119 | extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | ||
120 | struct task_struct *proxy_owner); | ||
121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | ||
122 | struct task_struct *proxy_owner); | ||
123 | #endif | ||
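
The owner accessors above work because task_struct pointers are word-aligned, leaving the two low bits of lock->owner free to carry RT_MUTEX_OWNER_PENDING and RT_MUTEX_HAS_WAITERS. A standalone, compilable model of that encoding — plain C with stand-in names, not kernel code:

    #include <assert.h>
    #include <stdio.h>

    #define OWNER_PENDING 1UL       /* mirrors RT_MUTEX_OWNER_PENDING */
    #define HAS_WAITERS   2UL       /* mirrors RT_MUTEX_HAS_WAITERS   */
    #define MASKALL       3UL       /* mirrors RT_MUTEX_OWNER_MASKALL */

    struct task { int pid; };       /* stand-in for struct task_struct */

    int main(void)
    {
            struct task t = { .pid = 42 };
            /* on common ABIs the struct is at least 4-byte aligned,
             * so the two low bits of its address are zero */
            unsigned long owner = (unsigned long)&t | OWNER_PENDING | HAS_WAITERS;

            struct task *p = (struct task *)(owner & ~MASKALL);  /* rt_mutex_owner() */
            assert(p == &t);

            printf("owner pid=%d pending=%d has_waiters=%d\n", p->pid,
                   !!(owner & OWNER_PENDING), !!(owner & HAS_WAITERS));
            return 0;
    }
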
diff --git a/kernel/rwsem.c b/kernel/rwsem.c new file mode 100644 index 000000000000..291ded556aa0 --- /dev/null +++ b/kernel/rwsem.c | |||
@@ -0,0 +1,147 @@ | |||
1 | /* kernel/rwsem.c: R/W semaphores, public implementation | ||
2 | * | ||
3 | * Written by David Howells (dhowells@redhat.com). | ||
4 | * Derived from asm-i386/semaphore.h | ||
5 | */ | ||
6 | |||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/rwsem.h> | ||
11 | |||
12 | #include <asm/system.h> | ||
13 | #include <asm/atomic.h> | ||
14 | |||
15 | /* | ||
16 | * lock for reading | ||
17 | */ | ||
18 | void down_read(struct rw_semaphore *sem) | ||
19 | { | ||
20 | might_sleep(); | ||
21 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | ||
22 | |||
23 | __down_read(sem); | ||
24 | } | ||
25 | |||
26 | EXPORT_SYMBOL(down_read); | ||
27 | |||
28 | /* | ||
29 | * trylock for reading -- returns 1 if successful, 0 if contention | ||
30 | */ | ||
31 | int down_read_trylock(struct rw_semaphore *sem) | ||
32 | { | ||
33 | int ret = __down_read_trylock(sem); | ||
34 | |||
35 | if (ret == 1) | ||
36 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); | ||
37 | return ret; | ||
38 | } | ||
39 | |||
40 | EXPORT_SYMBOL(down_read_trylock); | ||
41 | |||
42 | /* | ||
43 | * lock for writing | ||
44 | */ | ||
45 | void down_write(struct rw_semaphore *sem) | ||
46 | { | ||
47 | might_sleep(); | ||
48 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | ||
49 | |||
50 | __down_write(sem); | ||
51 | } | ||
52 | |||
53 | EXPORT_SYMBOL(down_write); | ||
54 | |||
55 | /* | ||
56 | * trylock for writing -- returns 1 if successful, 0 if contention | ||
57 | */ | ||
58 | int down_write_trylock(struct rw_semaphore *sem) | ||
59 | { | ||
60 | int ret = __down_write_trylock(sem); | ||
61 | |||
62 | if (ret == 1) | ||
63 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | ||
64 | return ret; | ||
65 | } | ||
66 | |||
67 | EXPORT_SYMBOL(down_write_trylock); | ||
68 | |||
69 | /* | ||
70 | * release a read lock | ||
71 | */ | ||
72 | void up_read(struct rw_semaphore *sem) | ||
73 | { | ||
74 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | ||
75 | |||
76 | __up_read(sem); | ||
77 | } | ||
78 | |||
79 | EXPORT_SYMBOL(up_read); | ||
80 | |||
81 | /* | ||
82 | * release a write lock | ||
83 | */ | ||
84 | void up_write(struct rw_semaphore *sem) | ||
85 | { | ||
86 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | ||
87 | |||
88 | __up_write(sem); | ||
89 | } | ||
90 | |||
91 | EXPORT_SYMBOL(up_write); | ||
92 | |||
93 | /* | ||
94 | * downgrade write lock to read lock | ||
95 | */ | ||
96 | void downgrade_write(struct rw_semaphore *sem) | ||
97 | { | ||
98 | /* | ||
99 | * lockdep: a downgraded write will live on as a write | ||
100 | * dependency. | ||
101 | */ | ||
102 | __downgrade_write(sem); | ||
103 | } | ||
104 | |||
105 | EXPORT_SYMBOL(downgrade_write); | ||
106 | |||
107 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
108 | |||
109 | void down_read_nested(struct rw_semaphore *sem, int subclass) | ||
110 | { | ||
111 | might_sleep(); | ||
112 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); | ||
113 | |||
114 | __down_read(sem); | ||
115 | } | ||
116 | |||
117 | EXPORT_SYMBOL(down_read_nested); | ||
118 | |||
119 | void down_read_non_owner(struct rw_semaphore *sem) | ||
120 | { | ||
121 | might_sleep(); | ||
122 | |||
123 | __down_read(sem); | ||
124 | } | ||
125 | |||
126 | EXPORT_SYMBOL(down_read_non_owner); | ||
127 | |||
128 | void down_write_nested(struct rw_semaphore *sem, int subclass) | ||
129 | { | ||
130 | might_sleep(); | ||
131 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | ||
132 | |||
133 | __down_write_nested(sem, subclass); | ||
134 | } | ||
135 | |||
136 | EXPORT_SYMBOL(down_write_nested); | ||
137 | |||
138 | void up_read_non_owner(struct rw_semaphore *sem) | ||
139 | { | ||
140 | __up_read(sem); | ||
141 | } | ||
142 | |||
143 | EXPORT_SYMBOL(up_read_non_owner); | ||
144 | |||
145 | #endif | ||
146 | |||
147 | |||
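
The wrappers above do nothing beyond adding the lockdep annotations (rwsem_acquire*/rwsem_release) around the architecture primitives. The _nested variants take an explicit subclass for the rare caller that legitimately holds two rwsems of the same lock class at once; a hedged usage sketch with a hypothetical object type (not an API from this patch, rwsems assumed already initialized):

    #include <linux/rwsem.h>

    struct my_obj {                         /* hypothetical container */
            struct rw_semaphore sem;
    };

    /* Lock two objects of the same class in a fixed order; the second
     * acquisition passes subclass 1 so lockdep does not report a false
     * self-deadlock on the class. */
    static void lock_pair_for_read(struct my_obj *a, struct my_obj *b)
    {
            down_read(&a->sem);
            down_read_nested(&b->sem, 1);
    }

The _non_owner variants go the other way: they skip the lockdep bookkeeping entirely, which matters when the task releasing the rwsem is not the one that acquired it.
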
diff --git a/kernel/sched.c b/kernel/sched.c index c13f1bd2df7d..b44b9a43b0fc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/capability.h> | 30 | #include <linux/capability.h> |
31 | #include <linux/completion.h> | 31 | #include <linux/completion.h> |
32 | #include <linux/kernel_stat.h> | 32 | #include <linux/kernel_stat.h> |
33 | #include <linux/debug_locks.h> | ||
33 | #include <linux/security.h> | 34 | #include <linux/security.h> |
34 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
35 | #include <linux/profile.h> | 36 | #include <linux/profile.h> |
@@ -50,6 +51,7 @@ | |||
50 | #include <linux/times.h> | 51 | #include <linux/times.h> |
51 | #include <linux/acct.h> | 52 | #include <linux/acct.h> |
52 | #include <linux/kprobes.h> | 53 | #include <linux/kprobes.h> |
54 | #include <linux/delayacct.h> | ||
53 | #include <asm/tlb.h> | 55 | #include <asm/tlb.h> |
54 | 56 | ||
55 | #include <asm/unistd.h> | 57 | #include <asm/unistd.h> |
@@ -168,29 +170,28 @@ | |||
168 | */ | 170 | */ |
169 | 171 | ||
170 | #define SCALE_PRIO(x, prio) \ | 172 | #define SCALE_PRIO(x, prio) \ |
171 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 173 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
172 | 174 | ||
173 | static unsigned int task_timeslice(task_t *p) | 175 | static unsigned int static_prio_timeslice(int static_prio) |
174 | { | 176 | { |
175 | if (p->static_prio < NICE_TO_PRIO(0)) | 177 | if (static_prio < NICE_TO_PRIO(0)) |
176 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 178 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
177 | else | 179 | else |
178 | return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); | 180 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
181 | } | ||
182 | |||
183 | static inline unsigned int task_timeslice(struct task_struct *p) | ||
184 | { | ||
185 | return static_prio_timeslice(p->static_prio); | ||
179 | } | 186 | } |
180 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | ||
181 | < (long long) (sd)->cache_hot_time) | ||
182 | 187 | ||
183 | /* | 188 | /* |
184 | * These are the runqueue data structures: | 189 | * These are the runqueue data structures: |
185 | */ | 190 | */ |
186 | 191 | ||
187 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | ||
188 | |||
189 | typedef struct runqueue runqueue_t; | ||
190 | |||
191 | struct prio_array { | 192 | struct prio_array { |
192 | unsigned int nr_active; | 193 | unsigned int nr_active; |
193 | unsigned long bitmap[BITMAP_SIZE]; | 194 | DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ |
194 | struct list_head queue[MAX_PRIO]; | 195 | struct list_head queue[MAX_PRIO]; |
195 | }; | 196 | }; |
196 | 197 | ||
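
The switch to DECLARE_BITMAP keeps the O(1) design intact: one bit per priority level records which of the queues is non-empty, and the extra delimiter bit (set once when the arrays are initialized) guarantees that a first-set-bit scan always terminates. A standalone model of that pick, assuming MAX_PRIO of 140 and using a plain loop where the kernel uses an optimized bit-search helper — not kernel code:

    #include <stdio.h>
    #include <string.h>

    #define MAX_PRIO 140

    static unsigned char bitmap[MAX_PRIO + 1];      /* one byte per bit keeps the model simple */

    static int first_set_bit(void)
    {
            int prio;

            for (prio = 0; prio <= MAX_PRIO; prio++)
                    if (bitmap[prio])
                            return prio;
            return MAX_PRIO;        /* unreachable: the delimiter is always set */
    }

    int main(void)
    {
            memset(bitmap, 0, sizeof(bitmap));
            bitmap[MAX_PRIO] = 1;                   /* the delimiter */

            printf("empty array -> %d (== MAX_PRIO, nothing runnable)\n", first_set_bit());

            bitmap[120] = 1;                        /* a nice-0 task becomes runnable */
            bitmap[99]  = 1;                        /* an RT task at priority 99      */
            printf("next to run -> priority %d\n", first_set_bit());
            return 0;
    }
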
@@ -201,7 +202,7 @@ struct prio_array { | |||
201 | * (such as the load balancing or the thread migration code), lock | 202 | * (such as the load balancing or the thread migration code), lock |
202 | * acquire operations must be ordered by ascending &runqueue. | 203 | * acquire operations must be ordered by ascending &runqueue. |
203 | */ | 204 | */ |
204 | struct runqueue { | 205 | struct rq { |
205 | spinlock_t lock; | 206 | spinlock_t lock; |
206 | 207 | ||
207 | /* | 208 | /* |
@@ -209,6 +210,7 @@ struct runqueue { | |||
209 | * remote CPUs use both these fields when doing load calculation. | 210 | * remote CPUs use both these fields when doing load calculation. |
210 | */ | 211 | */ |
211 | unsigned long nr_running; | 212 | unsigned long nr_running; |
213 | unsigned long raw_weighted_load; | ||
212 | #ifdef CONFIG_SMP | 214 | #ifdef CONFIG_SMP |
213 | unsigned long cpu_load[3]; | 215 | unsigned long cpu_load[3]; |
214 | #endif | 216 | #endif |
@@ -224,9 +226,9 @@ struct runqueue { | |||
224 | 226 | ||
225 | unsigned long expired_timestamp; | 227 | unsigned long expired_timestamp; |
226 | unsigned long long timestamp_last_tick; | 228 | unsigned long long timestamp_last_tick; |
227 | task_t *curr, *idle; | 229 | struct task_struct *curr, *idle; |
228 | struct mm_struct *prev_mm; | 230 | struct mm_struct *prev_mm; |
229 | prio_array_t *active, *expired, arrays[2]; | 231 | struct prio_array *active, *expired, arrays[2]; |
230 | int best_expired_prio; | 232 | int best_expired_prio; |
231 | atomic_t nr_iowait; | 233 | atomic_t nr_iowait; |
232 | 234 | ||
@@ -237,9 +239,8 @@ struct runqueue { | |||
237 | int active_balance; | 239 | int active_balance; |
238 | int push_cpu; | 240 | int push_cpu; |
239 | 241 | ||
240 | task_t *migration_thread; | 242 | struct task_struct *migration_thread; |
241 | struct list_head migration_queue; | 243 | struct list_head migration_queue; |
242 | int cpu; | ||
243 | #endif | 244 | #endif |
244 | 245 | ||
245 | #ifdef CONFIG_SCHEDSTATS | 246 | #ifdef CONFIG_SCHEDSTATS |
@@ -261,9 +262,10 @@ struct runqueue { | |||
261 | unsigned long ttwu_cnt; | 262 | unsigned long ttwu_cnt; |
262 | unsigned long ttwu_local; | 263 | unsigned long ttwu_local; |
263 | #endif | 264 | #endif |
265 | struct lock_class_key rq_lock_key; | ||
264 | }; | 266 | }; |
265 | 267 | ||
266 | static DEFINE_PER_CPU(struct runqueue, runqueues); | 268 | static DEFINE_PER_CPU(struct rq, runqueues); |
267 | 269 | ||
268 | /* | 270 | /* |
269 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 271 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
@@ -272,8 +274,8 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); | |||
272 | * The domain tree of any CPU may only be accessed from within | 274 | * The domain tree of any CPU may only be accessed from within |
273 | * preempt-disabled sections. | 275 | * preempt-disabled sections. |
274 | */ | 276 | */ |
275 | #define for_each_domain(cpu, domain) \ | 277 | #define for_each_domain(cpu, __sd) \ |
276 | for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) | 278 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
277 | 279 | ||
278 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 280 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
279 | #define this_rq() (&__get_cpu_var(runqueues)) | 281 | #define this_rq() (&__get_cpu_var(runqueues)) |
@@ -288,26 +290,33 @@ for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) | |||
288 | #endif | 290 | #endif |
289 | 291 | ||
290 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 292 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
291 | static inline int task_running(runqueue_t *rq, task_t *p) | 293 | static inline int task_running(struct rq *rq, struct task_struct *p) |
292 | { | 294 | { |
293 | return rq->curr == p; | 295 | return rq->curr == p; |
294 | } | 296 | } |
295 | 297 | ||
296 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | 298 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
297 | { | 299 | { |
298 | } | 300 | } |
299 | 301 | ||
300 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 302 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
301 | { | 303 | { |
302 | #ifdef CONFIG_DEBUG_SPINLOCK | 304 | #ifdef CONFIG_DEBUG_SPINLOCK |
303 | /* this is a valid case when another task releases the spinlock */ | 305 | /* this is a valid case when another task releases the spinlock */ |
304 | rq->lock.owner = current; | 306 | rq->lock.owner = current; |
305 | #endif | 307 | #endif |
308 | /* | ||
309 | * If we are tracking spinlock dependencies then we have to | ||
310 | * fix up the runqueue lock - which gets 'carried over' from | ||
311 | * prev into current: | ||
312 | */ | ||
313 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | ||
314 | |||
306 | spin_unlock_irq(&rq->lock); | 315 | spin_unlock_irq(&rq->lock); |
307 | } | 316 | } |
308 | 317 | ||
309 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 318 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
310 | static inline int task_running(runqueue_t *rq, task_t *p) | 319 | static inline int task_running(struct rq *rq, struct task_struct *p) |
311 | { | 320 | { |
312 | #ifdef CONFIG_SMP | 321 | #ifdef CONFIG_SMP |
313 | return p->oncpu; | 322 | return p->oncpu; |
@@ -316,7 +325,7 @@ static inline int task_running(runqueue_t *rq, task_t *p) | |||
316 | #endif | 325 | #endif |
317 | } | 326 | } |
318 | 327 | ||
319 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | 328 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
320 | { | 329 | { |
321 | #ifdef CONFIG_SMP | 330 | #ifdef CONFIG_SMP |
322 | /* | 331 | /* |
@@ -333,7 +342,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | |||
333 | #endif | 342 | #endif |
334 | } | 343 | } |
335 | 344 | ||
336 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | 345 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
337 | { | 346 | { |
338 | #ifdef CONFIG_SMP | 347 | #ifdef CONFIG_SMP |
339 | /* | 348 | /* |
@@ -351,14 +360,33 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | |||
351 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 360 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
352 | 361 | ||
353 | /* | 362 | /* |
363 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
364 | * Must be called with interrupts disabled. | ||
365 | */ | ||
366 | static inline struct rq *__task_rq_lock(struct task_struct *p) | ||
367 | __acquires(rq->lock) | ||
368 | { | ||
369 | struct rq *rq; | ||
370 | |||
371 | repeat_lock_task: | ||
372 | rq = task_rq(p); | ||
373 | spin_lock(&rq->lock); | ||
374 | if (unlikely(rq != task_rq(p))) { | ||
375 | spin_unlock(&rq->lock); | ||
376 | goto repeat_lock_task; | ||
377 | } | ||
378 | return rq; | ||
379 | } | ||
380 | |||
381 | /* | ||
354 | * task_rq_lock - lock the runqueue a given task resides on and disable | 382 | * task_rq_lock - lock the runqueue a given task resides on and disable |
355 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 383 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
356 | * explicitly disabling preemption. | 384 | * explicitly disabling preemption. |
357 | */ | 385 | */ |
358 | static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) | 386 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
359 | __acquires(rq->lock) | 387 | __acquires(rq->lock) |
360 | { | 388 | { |
361 | struct runqueue *rq; | 389 | struct rq *rq; |
362 | 390 | ||
363 | repeat_lock_task: | 391 | repeat_lock_task: |
364 | local_irq_save(*flags); | 392 | local_irq_save(*flags); |
@@ -371,7 +399,13 @@ repeat_lock_task: | |||
371 | return rq; | 399 | return rq; |
372 | } | 400 | } |
373 | 401 | ||
374 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | 402 | static inline void __task_rq_unlock(struct rq *rq) |
403 | __releases(rq->lock) | ||
404 | { | ||
405 | spin_unlock(&rq->lock); | ||
406 | } | ||
407 | |||
408 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | ||
375 | __releases(rq->lock) | 409 | __releases(rq->lock) |
376 | { | 410 | { |
377 | spin_unlock_irqrestore(&rq->lock, *flags); | 411 | spin_unlock_irqrestore(&rq->lock, *flags); |
@@ -391,7 +425,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
391 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 425 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
392 | seq_printf(seq, "timestamp %lu\n", jiffies); | 426 | seq_printf(seq, "timestamp %lu\n", jiffies); |
393 | for_each_online_cpu(cpu) { | 427 | for_each_online_cpu(cpu) { |
394 | runqueue_t *rq = cpu_rq(cpu); | 428 | struct rq *rq = cpu_rq(cpu); |
395 | #ifdef CONFIG_SMP | 429 | #ifdef CONFIG_SMP |
396 | struct sched_domain *sd; | 430 | struct sched_domain *sd; |
397 | int dcnt = 0; | 431 | int dcnt = 0; |
@@ -468,9 +502,36 @@ struct file_operations proc_schedstat_operations = { | |||
468 | .release = single_release, | 502 | .release = single_release, |
469 | }; | 503 | }; |
470 | 504 | ||
505 | /* | ||
506 | * Expects runqueue lock to be held for atomicity of update | ||
507 | */ | ||
508 | static inline void | ||
509 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | ||
510 | { | ||
511 | if (rq) { | ||
512 | rq->rq_sched_info.run_delay += delta_jiffies; | ||
513 | rq->rq_sched_info.pcnt++; | ||
514 | } | ||
515 | } | ||
516 | |||
517 | /* | ||
518 | * Expects runqueue lock to be held for atomicity of update | ||
519 | */ | ||
520 | static inline void | ||
521 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | ||
522 | { | ||
523 | if (rq) | ||
524 | rq->rq_sched_info.cpu_time += delta_jiffies; | ||
525 | } | ||
471 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 526 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) |
472 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 527 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) |
473 | #else /* !CONFIG_SCHEDSTATS */ | 528 | #else /* !CONFIG_SCHEDSTATS */ |
529 | static inline void | ||
530 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | ||
531 | {} | ||
532 | static inline void | ||
533 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | ||
534 | {} | ||
474 | # define schedstat_inc(rq, field) do { } while (0) | 535 | # define schedstat_inc(rq, field) do { } while (0) |
475 | # define schedstat_add(rq, field, amt) do { } while (0) | 536 | # define schedstat_add(rq, field, amt) do { } while (0) |
476 | #endif | 537 | #endif |
@@ -478,10 +539,10 @@ struct file_operations proc_schedstat_operations = { | |||
478 | /* | 539 | /* |
479 | * rq_lock - lock a given runqueue and disable interrupts. | 540 | * rq_lock - lock a given runqueue and disable interrupts. |
480 | */ | 541 | */ |
481 | static inline runqueue_t *this_rq_lock(void) | 542 | static inline struct rq *this_rq_lock(void) |
482 | __acquires(rq->lock) | 543 | __acquires(rq->lock) |
483 | { | 544 | { |
484 | runqueue_t *rq; | 545 | struct rq *rq; |
485 | 546 | ||
486 | local_irq_disable(); | 547 | local_irq_disable(); |
487 | rq = this_rq(); | 548 | rq = this_rq(); |
@@ -490,7 +551,7 @@ static inline runqueue_t *this_rq_lock(void) | |||
490 | return rq; | 551 | return rq; |
491 | } | 552 | } |
492 | 553 | ||
493 | #ifdef CONFIG_SCHEDSTATS | 554 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
494 | /* | 555 | /* |
495 | * Called when a process is dequeued from the active array and given | 556 | * Called when a process is dequeued from the active array and given |
496 | * the cpu. We should note that with the exception of interactive | 557 | * the cpu. We should note that with the exception of interactive |
@@ -506,7 +567,7 @@ static inline runqueue_t *this_rq_lock(void) | |||
506 | * long it was from the *first* time it was queued to the time that it | 567 | * long it was from the *first* time it was queued to the time that it |
507 | * finally hit a cpu. | 568 | * finally hit a cpu. |
508 | */ | 569 | */ |
509 | static inline void sched_info_dequeued(task_t *t) | 570 | static inline void sched_info_dequeued(struct task_struct *t) |
510 | { | 571 | { |
511 | t->sched_info.last_queued = 0; | 572 | t->sched_info.last_queued = 0; |
512 | } | 573 | } |
@@ -516,23 +577,18 @@ static inline void sched_info_dequeued(task_t *t) | |||
516 | * long it was waiting to run. We also note when it began so that we | 577 | * long it was waiting to run. We also note when it began so that we |
517 | * can keep stats on how long its timeslice is. | 578 | * can keep stats on how long its timeslice is. |
518 | */ | 579 | */ |
519 | static void sched_info_arrive(task_t *t) | 580 | static void sched_info_arrive(struct task_struct *t) |
520 | { | 581 | { |
521 | unsigned long now = jiffies, diff = 0; | 582 | unsigned long now = jiffies, delta_jiffies = 0; |
522 | struct runqueue *rq = task_rq(t); | ||
523 | 583 | ||
524 | if (t->sched_info.last_queued) | 584 | if (t->sched_info.last_queued) |
525 | diff = now - t->sched_info.last_queued; | 585 | delta_jiffies = now - t->sched_info.last_queued; |
526 | sched_info_dequeued(t); | 586 | sched_info_dequeued(t); |
527 | t->sched_info.run_delay += diff; | 587 | t->sched_info.run_delay += delta_jiffies; |
528 | t->sched_info.last_arrival = now; | 588 | t->sched_info.last_arrival = now; |
529 | t->sched_info.pcnt++; | 589 | t->sched_info.pcnt++; |
530 | 590 | ||
531 | if (!rq) | 591 | rq_sched_info_arrive(task_rq(t), delta_jiffies); |
532 | return; | ||
533 | |||
534 | rq->rq_sched_info.run_delay += diff; | ||
535 | rq->rq_sched_info.pcnt++; | ||
536 | } | 592 | } |
537 | 593 | ||
538 | /* | 594 | /* |
@@ -550,25 +606,23 @@ static void sched_info_arrive(task_t *t) | |||
550 | * the timestamp if it is already not set. It's assumed that | 606 | * the timestamp if it is already not set. It's assumed that |
551 | * sched_info_dequeued() will clear that stamp when appropriate. | 607 | * sched_info_dequeued() will clear that stamp when appropriate. |
552 | */ | 608 | */ |
553 | static inline void sched_info_queued(task_t *t) | 609 | static inline void sched_info_queued(struct task_struct *t) |
554 | { | 610 | { |
555 | if (!t->sched_info.last_queued) | 611 | if (unlikely(sched_info_on())) |
556 | t->sched_info.last_queued = jiffies; | 612 | if (!t->sched_info.last_queued) |
613 | t->sched_info.last_queued = jiffies; | ||
557 | } | 614 | } |
558 | 615 | ||
559 | /* | 616 | /* |
560 | * Called when a process ceases being the active-running process, either | 617 | * Called when a process ceases being the active-running process, either |
561 | * voluntarily or involuntarily. Now we can calculate how long we ran. | 618 | * voluntarily or involuntarily. Now we can calculate how long we ran. |
562 | */ | 619 | */ |
563 | static inline void sched_info_depart(task_t *t) | 620 | static inline void sched_info_depart(struct task_struct *t) |
564 | { | 621 | { |
565 | struct runqueue *rq = task_rq(t); | 622 | unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; |
566 | unsigned long diff = jiffies - t->sched_info.last_arrival; | ||
567 | |||
568 | t->sched_info.cpu_time += diff; | ||
569 | 623 | ||
570 | if (rq) | 624 | t->sched_info.cpu_time += delta_jiffies; |
571 | rq->rq_sched_info.cpu_time += diff; | 625 | rq_sched_info_depart(task_rq(t), delta_jiffies); |
572 | } | 626 | } |
573 | 627 | ||
574 | /* | 628 | /* |
@@ -576,9 +630,10 @@ static inline void sched_info_depart(task_t *t) | |||
576 | * their time slice. (This may also be called when switching to or from | 630 | * their time slice. (This may also be called when switching to or from |
577 | * the idle task.) We are only called when prev != next. | 631 | * the idle task.) We are only called when prev != next. |
578 | */ | 632 | */ |
579 | static inline void sched_info_switch(task_t *prev, task_t *next) | 633 | static inline void |
634 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
580 | { | 635 | { |
581 | struct runqueue *rq = task_rq(prev); | 636 | struct rq *rq = task_rq(prev); |
582 | 637 | ||
583 | /* | 638 | /* |
584 | * prev now departs the cpu. It's not interesting to record | 639 | * prev now departs the cpu. It's not interesting to record |
@@ -591,15 +646,21 @@ static inline void sched_info_switch(task_t *prev, task_t *next) | |||
591 | if (next != rq->idle) | 646 | if (next != rq->idle) |
592 | sched_info_arrive(next); | 647 | sched_info_arrive(next); |
593 | } | 648 | } |
649 | static inline void | ||
650 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | ||
651 | { | ||
652 | if (unlikely(sched_info_on())) | ||
653 | __sched_info_switch(prev, next); | ||
654 | } | ||
594 | #else | 655 | #else |
595 | #define sched_info_queued(t) do { } while (0) | 656 | #define sched_info_queued(t) do { } while (0) |
596 | #define sched_info_switch(t, next) do { } while (0) | 657 | #define sched_info_switch(t, next) do { } while (0) |
597 | #endif /* CONFIG_SCHEDSTATS */ | 658 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
598 | 659 | ||
599 | /* | 660 | /* |
600 | * Adding/removing a task to/from a priority array: | 661 | * Adding/removing a task to/from a priority array: |
601 | */ | 662 | */ |
602 | static void dequeue_task(struct task_struct *p, prio_array_t *array) | 663 | static void dequeue_task(struct task_struct *p, struct prio_array *array) |
603 | { | 664 | { |
604 | array->nr_active--; | 665 | array->nr_active--; |
605 | list_del(&p->run_list); | 666 | list_del(&p->run_list); |
@@ -607,7 +668,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array) | |||
607 | __clear_bit(p->prio, array->bitmap); | 668 | __clear_bit(p->prio, array->bitmap); |
608 | } | 669 | } |
609 | 670 | ||
610 | static void enqueue_task(struct task_struct *p, prio_array_t *array) | 671 | static void enqueue_task(struct task_struct *p, struct prio_array *array) |
611 | { | 672 | { |
612 | sched_info_queued(p); | 673 | sched_info_queued(p); |
613 | list_add_tail(&p->run_list, array->queue + p->prio); | 674 | list_add_tail(&p->run_list, array->queue + p->prio); |
@@ -620,12 +681,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array) | |||
620 | * Put task to the end of the run list without the overhead of dequeue | 681 | * Put task to the end of the run list without the overhead of dequeue |
621 | * followed by enqueue. | 682 | * followed by enqueue. |
622 | */ | 683 | */ |
623 | static void requeue_task(struct task_struct *p, prio_array_t *array) | 684 | static void requeue_task(struct task_struct *p, struct prio_array *array) |
624 | { | 685 | { |
625 | list_move_tail(&p->run_list, array->queue + p->prio); | 686 | list_move_tail(&p->run_list, array->queue + p->prio); |
626 | } | 687 | } |
627 | 688 | ||
628 | static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | 689 | static inline void |
690 | enqueue_task_head(struct task_struct *p, struct prio_array *array) | ||
629 | { | 691 | { |
630 | list_add(&p->run_list, array->queue + p->prio); | 692 | list_add(&p->run_list, array->queue + p->prio); |
631 | __set_bit(p->prio, array->bitmap); | 693 | __set_bit(p->prio, array->bitmap); |
@@ -634,7 +696,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
634 | } | 696 | } |
635 | 697 | ||
636 | /* | 698 | /* |
637 | * effective_prio - return the priority that is based on the static | 699 | * __normal_prio - return the priority that is based on the static |
638 | * priority but is modified by bonuses/penalties. | 700 | * priority but is modified by bonuses/penalties. |
639 | * | 701 | * |
640 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | 702 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
@@ -647,13 +709,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
647 | * | 709 | * |
648 | * Both properties are important to certain workloads. | 710 | * Both properties are important to certain workloads. |
649 | */ | 711 | */ |
650 | static int effective_prio(task_t *p) | 712 | |
713 | static inline int __normal_prio(struct task_struct *p) | ||
651 | { | 714 | { |
652 | int bonus, prio; | 715 | int bonus, prio; |
653 | 716 | ||
654 | if (rt_task(p)) | ||
655 | return p->prio; | ||
656 | |||
657 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | 717 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
658 | 718 | ||
659 | prio = p->static_prio - bonus; | 719 | prio = p->static_prio - bonus; |
@@ -665,57 +725,165 @@ static int effective_prio(task_t *p) | |||
665 | } | 725 | } |
666 | 726 | ||
667 | /* | 727 | /* |
728 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
729 | * of tasks with abnormal "nice" values across CPUs, the contribution that | ||
730 | * each task makes to its run queue's load is weighted according to its | ||
731 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
732 | * scaled version of the new time slice allocation that they receive on time | ||
733 | * slice expiry etc. | ||
734 | */ | ||
735 | |||
736 | /* | ||
737 | * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE | ||
738 | * If static_prio_timeslice() is ever changed to break this assumption then | ||
739 | * this code will need modification. | ||
740 | */ | ||
741 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
742 | #define LOAD_WEIGHT(lp) \ | ||
743 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
744 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
745 | LOAD_WEIGHT(static_prio_timeslice(prio)) | ||
746 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
747 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | ||
748 | |||
749 | static void set_load_weight(struct task_struct *p) | ||
750 | { | ||
751 | if (has_rt_policy(p)) { | ||
752 | #ifdef CONFIG_SMP | ||
753 | if (p == task_rq(p)->migration_thread) | ||
754 | /* | ||
755 | * The migration thread does the actual balancing. | ||
756 | * Giving its load any weight will skew balancing | ||
757 | * adversely. | ||
758 | */ | ||
759 | p->load_weight = 0; | ||
760 | else | ||
761 | #endif | ||
762 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
763 | } else | ||
764 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
765 | } | ||
766 | |||
767 | static inline void | ||
768 | inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) | ||
769 | { | ||
770 | rq->raw_weighted_load += p->load_weight; | ||
771 | } | ||
772 | |||
773 | static inline void | ||
774 | dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) | ||
775 | { | ||
776 | rq->raw_weighted_load -= p->load_weight; | ||
777 | } | ||
778 | |||
779 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
780 | { | ||
781 | rq->nr_running++; | ||
782 | inc_raw_weighted_load(rq, p); | ||
783 | } | ||
784 | |||
785 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq) | ||
786 | { | ||
787 | rq->nr_running--; | ||
788 | dec_raw_weighted_load(rq, p); | ||
789 | } | ||
790 | |||
791 | /* | ||
792 | * Calculate the expected normal priority: i.e. priority | ||
793 | * without taking RT-inheritance into account. Might be | ||
794 | * boosted by interactivity modifiers. Changes upon fork, | ||
795 | * setprio syscalls, and whenever the interactivity | ||
796 | * estimator recalculates. | ||
797 | */ | ||
798 | static inline int normal_prio(struct task_struct *p) | ||
799 | { | ||
800 | int prio; | ||
801 | |||
802 | if (has_rt_policy(p)) | ||
803 | prio = MAX_RT_PRIO-1 - p->rt_priority; | ||
804 | else | ||
805 | prio = __normal_prio(p); | ||
806 | return prio; | ||
807 | } | ||
808 | |||
809 | /* | ||
810 | * Calculate the current priority, i.e. the priority | ||
811 | * taken into account by the scheduler. This value might | ||
812 | * be boosted by RT tasks, or might be boosted by | ||
813 | * interactivity modifiers. Will be RT if the task got | ||
814 | * RT-boosted. If not then it returns p->normal_prio. | ||
815 | */ | ||
816 | static int effective_prio(struct task_struct *p) | ||
817 | { | ||
818 | p->normal_prio = normal_prio(p); | ||
819 | /* | ||
820 | * If we are RT tasks or we were boosted to RT priority, | ||
821 | * keep the priority unchanged. Otherwise, update priority | ||
822 | * to the normal priority: | ||
823 | */ | ||
824 | if (!rt_prio(p->prio)) | ||
825 | return p->normal_prio; | ||
826 | return p->prio; | ||
827 | } | ||
828 | |||
829 | /* | ||
668 | * __activate_task - move a task to the runqueue. | 830 | * __activate_task - move a task to the runqueue. |
669 | */ | 831 | */ |
670 | static void __activate_task(task_t *p, runqueue_t *rq) | 832 | static void __activate_task(struct task_struct *p, struct rq *rq) |
671 | { | 833 | { |
672 | prio_array_t *target = rq->active; | 834 | struct prio_array *target = rq->active; |
673 | 835 | ||
674 | if (batch_task(p)) | 836 | if (batch_task(p)) |
675 | target = rq->expired; | 837 | target = rq->expired; |
676 | enqueue_task(p, target); | 838 | enqueue_task(p, target); |
677 | rq->nr_running++; | 839 | inc_nr_running(p, rq); |
678 | } | 840 | } |
679 | 841 | ||
680 | /* | 842 | /* |
681 | * __activate_idle_task - move idle task to the _front_ of runqueue. | 843 | * __activate_idle_task - move idle task to the _front_ of runqueue. |
682 | */ | 844 | */ |
683 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 845 | static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) |
684 | { | 846 | { |
685 | enqueue_task_head(p, rq->active); | 847 | enqueue_task_head(p, rq->active); |
686 | rq->nr_running++; | 848 | inc_nr_running(p, rq); |
687 | } | 849 | } |
688 | 850 | ||
689 | static int recalc_task_prio(task_t *p, unsigned long long now) | 851 | /* |
852 | * Recalculate p->normal_prio and p->prio after having slept, | ||
853 | * updating the sleep-average too: | ||
854 | */ | ||
855 | static int recalc_task_prio(struct task_struct *p, unsigned long long now) | ||
690 | { | 856 | { |
691 | /* Caller must always ensure 'now >= p->timestamp' */ | 857 | /* Caller must always ensure 'now >= p->timestamp' */ |
692 | unsigned long long __sleep_time = now - p->timestamp; | 858 | unsigned long sleep_time = now - p->timestamp; |
693 | unsigned long sleep_time; | ||
694 | 859 | ||
695 | if (batch_task(p)) | 860 | if (batch_task(p)) |
696 | sleep_time = 0; | 861 | sleep_time = 0; |
697 | else { | ||
698 | if (__sleep_time > NS_MAX_SLEEP_AVG) | ||
699 | sleep_time = NS_MAX_SLEEP_AVG; | ||
700 | else | ||
701 | sleep_time = (unsigned long)__sleep_time; | ||
702 | } | ||
703 | 862 | ||
704 | if (likely(sleep_time > 0)) { | 863 | if (likely(sleep_time > 0)) { |
705 | /* | 864 | /* |
706 | * User tasks that sleep a long time are categorised as | 865 | * This ceiling is set to the lowest priority that would allow |
707 | * idle. They will only have their sleep_avg increased to a | 866 | * a task to be reinserted into the active array on timeslice |
708 | * level that makes them just interactive priority to stay | 867 | * completion. |
709 | * active yet prevent them suddenly becoming cpu hogs and | ||
710 | * starving other processes. | ||
711 | */ | 868 | */ |
712 | if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { | 869 | unsigned long ceiling = INTERACTIVE_SLEEP(p); |
713 | unsigned long ceiling; | ||
714 | 870 | ||
715 | ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - | 871 | if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { |
716 | DEF_TIMESLICE); | 872 | /* |
717 | if (p->sleep_avg < ceiling) | 873 | * Prevents user tasks from achieving best priority |
718 | p->sleep_avg = ceiling; | 874 | * with one single large enough sleep. |
875 | */ | ||
876 | p->sleep_avg = ceiling; | ||
877 | /* | ||
878 | * Using INTERACTIVE_SLEEP() as a ceiling places a | ||
879 | * nice(0) task 1ms sleep away from promotion, and | ||
880 | * gives it 700ms to round-robin with no chance of | ||
881 | * being demoted. This is more than generous, so | ||
882 | * mark this sleep as non-interactive to prevent the | ||
883 | * on-runqueue bonus logic from intervening should | ||
884 | * this task not receive cpu immediately. | ||
885 | */ | ||
886 | p->sleep_type = SLEEP_NONINTERACTIVE; | ||
719 | } else { | 887 | } else { |
720 | /* | 888 | /* |
721 | * Tasks waking from uninterruptible sleep are | 889 | * Tasks waking from uninterruptible sleep are |
@@ -723,12 +891,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
723 | * are likely to be waiting on I/O | 891 | * are likely to be waiting on I/O |
724 | */ | 892 | */ |
725 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | 893 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
726 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | 894 | if (p->sleep_avg >= ceiling) |
727 | sleep_time = 0; | 895 | sleep_time = 0; |
728 | else if (p->sleep_avg + sleep_time >= | 896 | else if (p->sleep_avg + sleep_time >= |
729 | INTERACTIVE_SLEEP(p)) { | 897 | ceiling) { |
730 | p->sleep_avg = INTERACTIVE_SLEEP(p); | 898 | p->sleep_avg = ceiling; |
731 | sleep_time = 0; | 899 | sleep_time = 0; |
732 | } | 900 | } |
733 | } | 901 | } |
734 | 902 | ||
@@ -742,9 +910,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
742 | */ | 910 | */ |
743 | p->sleep_avg += sleep_time; | 911 | p->sleep_avg += sleep_time; |
744 | 912 | ||
745 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
746 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
747 | } | 913 | } |
914 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
915 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
748 | } | 916 | } |
749 | 917 | ||
750 | return effective_prio(p); | 918 | return effective_prio(p); |
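
Taken together, the hunks above split a task's priority into three values: static_prio (from nice), normal_prio (the un-boosted scheduling priority, either the RT mapping or the interactivity result), and prio (what the scheduler actually uses, left untouched while the task sits in the RT range due to priority inheritance). A standalone model that ignores the sleep-average bonus so the shape stays visible — not kernel code:

    #include <stdio.h>

    #define MAX_RT_PRIO     100
    #define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)

    struct task_model {
            int static_prio;        /* from nice() */
            int rt_priority;        /* 0 = not an RT task in this model */
            int normal_prio, prio;
    };

    static int normal_prio_model(struct task_model *p)
    {
            if (p->rt_priority)
                    return MAX_RT_PRIO - 1 - p->rt_priority;
            return p->static_prio;  /* interactivity bonus omitted */
    }

    static int effective_prio_model(struct task_model *p)
    {
            p->normal_prio = normal_prio_model(p);
            if (p->prio >= MAX_RT_PRIO)     /* not boosted: track normal_prio */
                    return p->normal_prio;
            return p->prio;                 /* PI-boosted: keep the RT prio   */
    }

    int main(void)
    {
            struct task_model p = { .static_prio = NICE_TO_PRIO(0), .prio = NICE_TO_PRIO(0) };

            printf("unboosted: prio=%d\n", effective_prio_model(&p));       /* 120 */

            p.prio = 50;    /* boosted into the RT range by priority inheritance */
            printf("boosted:   prio=%d (normal_prio=%d)\n",
                   effective_prio_model(&p), p.normal_prio);                /* 50, 120 */
            return 0;
    }
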
@@ -756,7 +924,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
756 | * Update all the scheduling statistics stuff. (sleep average | 924 | * Update all the scheduling statistics stuff. (sleep average |
757 | * calculation, priority modifiers, etc.) | 925 | * calculation, priority modifiers, etc.) |
758 | */ | 926 | */ |
759 | static void activate_task(task_t *p, runqueue_t *rq, int local) | 927 | static void activate_task(struct task_struct *p, struct rq *rq, int local) |
760 | { | 928 | { |
761 | unsigned long long now; | 929 | unsigned long long now; |
762 | 930 | ||
@@ -764,7 +932,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
764 | #ifdef CONFIG_SMP | 932 | #ifdef CONFIG_SMP |
765 | if (!local) { | 933 | if (!local) { |
766 | /* Compensate for drifting sched_clock */ | 934 | /* Compensate for drifting sched_clock */ |
767 | runqueue_t *this_rq = this_rq(); | 935 | struct rq *this_rq = this_rq(); |
768 | now = (now - this_rq->timestamp_last_tick) | 936 | now = (now - this_rq->timestamp_last_tick) |
769 | + rq->timestamp_last_tick; | 937 | + rq->timestamp_last_tick; |
770 | } | 938 | } |
@@ -803,9 +971,9 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
803 | /* | 971 | /* |
804 | * deactivate_task - remove a task from the runqueue. | 972 | * deactivate_task - remove a task from the runqueue. |
805 | */ | 973 | */ |
806 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 974 | static void deactivate_task(struct task_struct *p, struct rq *rq) |
807 | { | 975 | { |
808 | rq->nr_running--; | 976 | dec_nr_running(p, rq); |
809 | dequeue_task(p, p->array); | 977 | dequeue_task(p, p->array); |
810 | p->array = NULL; | 978 | p->array = NULL; |
811 | } | 979 | } |
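
With inc_nr_running()/dec_nr_running() in the activate/deactivate paths above, the run-queue load becomes a sum of per-task weights instead of a bare task count, and LOAD_WEIGHT() scales each task's default timeslice so that a nice-0 task contributes exactly SCHED_LOAD_SCALE. A standalone model of that arithmetic, assuming HZ of 1000 and SCHED_LOAD_SCALE of 128 (constants otherwise copied from the hunks above; not kernel code):

    #include <stdio.h>

    #define HZ                      1000
    #define MAX_RT_PRIO             100
    #define MAX_PRIO                140
    #define MAX_USER_PRIO           40
    #define NICE_TO_PRIO(n)         (MAX_RT_PRIO + (n) + 20)
    #define MIN_TIMESLICE           (5 * HZ / 1000)
    #define DEF_TIMESLICE           (100 * HZ / 1000)
    #define SCHED_LOAD_SCALE        128UL

    #define MAX2(a, b)              ((a) > (b) ? (a) : (b))
    #define SCALE_PRIO(x, prio)     MAX2((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

    static unsigned int static_prio_timeslice(int static_prio)
    {
            if (static_prio < NICE_TO_PRIO(0))
                    return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
            return SCALE_PRIO(DEF_TIMESLICE, static_prio);
    }

    /* nice-0 timeslice == DEF_TIMESLICE, so a nice-0 task weighs SCHED_LOAD_SCALE */
    #define LOAD_WEIGHT(lp)         ((lp) * SCHED_LOAD_SCALE / DEF_TIMESLICE)

    int main(void)
    {
            int nice;

            for (nice = -20; nice <= 19; nice += 13)
                    printf("nice %3d -> timeslice %3u, load weight %4lu\n",
                           nice, static_prio_timeslice(NICE_TO_PRIO(nice)),
                           LOAD_WEIGHT(static_prio_timeslice(NICE_TO_PRIO(nice))));
            return 0;
    }
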
@@ -818,7 +986,12 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) | |||
818 | * the target CPU. | 986 | * the target CPU. |
819 | */ | 987 | */ |
820 | #ifdef CONFIG_SMP | 988 | #ifdef CONFIG_SMP |
821 | static void resched_task(task_t *p) | 989 | |
990 | #ifndef tsk_is_polling | ||
991 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | ||
992 | #endif | ||
993 | |||
994 | static void resched_task(struct task_struct *p) | ||
822 | { | 995 | { |
823 | int cpu; | 996 | int cpu; |
824 | 997 | ||
@@ -833,13 +1006,13 @@ static void resched_task(task_t *p) | |||
833 | if (cpu == smp_processor_id()) | 1006 | if (cpu == smp_processor_id()) |
834 | return; | 1007 | return; |
835 | 1008 | ||
836 | /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ | 1009 | /* NEED_RESCHED must be visible before we test polling */ |
837 | smp_mb(); | 1010 | smp_mb(); |
838 | if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) | 1011 | if (!tsk_is_polling(p)) |
839 | smp_send_reschedule(cpu); | 1012 | smp_send_reschedule(cpu); |
840 | } | 1013 | } |
841 | #else | 1014 | #else |
842 | static inline void resched_task(task_t *p) | 1015 | static inline void resched_task(struct task_struct *p) |
843 | { | 1016 | { |
844 | assert_spin_locked(&task_rq(p)->lock); | 1017 | assert_spin_locked(&task_rq(p)->lock); |
845 | set_tsk_need_resched(p); | 1018 | set_tsk_need_resched(p); |
@@ -850,28 +1023,35 @@ static inline void resched_task(task_t *p) | |||
850 | * task_curr - is this task currently executing on a CPU? | 1023 | * task_curr - is this task currently executing on a CPU? |
851 | * @p: the task in question. | 1024 | * @p: the task in question. |
852 | */ | 1025 | */ |
853 | inline int task_curr(const task_t *p) | 1026 | inline int task_curr(const struct task_struct *p) |
854 | { | 1027 | { |
855 | return cpu_curr(task_cpu(p)) == p; | 1028 | return cpu_curr(task_cpu(p)) == p; |
856 | } | 1029 | } |
857 | 1030 | ||
1031 | /* Used instead of source_load when we know the type == 0 */ | ||
1032 | unsigned long weighted_cpuload(const int cpu) | ||
1033 | { | ||
1034 | return cpu_rq(cpu)->raw_weighted_load; | ||
1035 | } | ||
1036 | |||
858 | #ifdef CONFIG_SMP | 1037 | #ifdef CONFIG_SMP |
859 | typedef struct { | 1038 | struct migration_req { |
860 | struct list_head list; | 1039 | struct list_head list; |
861 | 1040 | ||
862 | task_t *task; | 1041 | struct task_struct *task; |
863 | int dest_cpu; | 1042 | int dest_cpu; |
864 | 1043 | ||
865 | struct completion done; | 1044 | struct completion done; |
866 | } migration_req_t; | 1045 | }; |
867 | 1046 | ||
868 | /* | 1047 | /* |
869 | * The task's runqueue lock must be held. | 1048 | * The task's runqueue lock must be held. |
870 | * Returns true if you have to wait for migration thread. | 1049 | * Returns true if you have to wait for migration thread. |
871 | */ | 1050 | */ |
872 | static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | 1051 | static int |
1052 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | ||
873 | { | 1053 | { |
874 | runqueue_t *rq = task_rq(p); | 1054 | struct rq *rq = task_rq(p); |
875 | 1055 | ||
876 | /* | 1056 | /* |
877 | * If the task is not on a runqueue (and not running), then | 1057 | * If the task is not on a runqueue (and not running), then |
@@ -886,6 +1066,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
886 | req->task = p; | 1066 | req->task = p; |
887 | req->dest_cpu = dest_cpu; | 1067 | req->dest_cpu = dest_cpu; |
888 | list_add(&req->list, &rq->migration_queue); | 1068 | list_add(&req->list, &rq->migration_queue); |
1069 | |||
889 | return 1; | 1070 | return 1; |
890 | } | 1071 | } |
891 | 1072 | ||
@@ -898,10 +1079,10 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
898 | * smp_call_function() if an IPI is sent by the same process we are | 1079 | * smp_call_function() if an IPI is sent by the same process we are |
899 | * waiting to become inactive. | 1080 | * waiting to become inactive. |
900 | */ | 1081 | */ |
901 | void wait_task_inactive(task_t *p) | 1082 | void wait_task_inactive(struct task_struct *p) |
902 | { | 1083 | { |
903 | unsigned long flags; | 1084 | unsigned long flags; |
904 | runqueue_t *rq; | 1085 | struct rq *rq; |
905 | int preempted; | 1086 | int preempted; |
906 | 1087 | ||
907 | repeat: | 1088 | repeat: |
@@ -932,7 +1113,7 @@ repeat: | |||
932 | * to another CPU then no harm is done and the purpose has been | 1113 | * to another CPU then no harm is done and the purpose has been |
933 | * achieved as well. | 1114 | * achieved as well. |
934 | */ | 1115 | */ |
935 | void kick_process(task_t *p) | 1116 | void kick_process(struct task_struct *p) |
936 | { | 1117 | { |
937 | int cpu; | 1118 | int cpu; |
938 | 1119 | ||
@@ -944,32 +1125,45 @@ void kick_process(task_t *p) | |||
944 | } | 1125 | } |
945 | 1126 | ||
946 | /* | 1127 | /* |
947 | * Return a low guess at the load of a migration-source cpu. | 1128 | * Return a low guess at the load of a migration-source cpu weighted |
1129 | * according to the scheduling class and "nice" value. | ||
948 | * | 1130 | * |
949 | * We want to under-estimate the load of migration sources, to | 1131 | * We want to under-estimate the load of migration sources, to |
950 | * balance conservatively. | 1132 | * balance conservatively. |
951 | */ | 1133 | */ |
952 | static inline unsigned long source_load(int cpu, int type) | 1134 | static inline unsigned long source_load(int cpu, int type) |
953 | { | 1135 | { |
954 | runqueue_t *rq = cpu_rq(cpu); | 1136 | struct rq *rq = cpu_rq(cpu); |
955 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1137 | |
956 | if (type == 0) | 1138 | if (type == 0) |
957 | return load_now; | 1139 | return rq->raw_weighted_load; |
958 | 1140 | ||
959 | return min(rq->cpu_load[type-1], load_now); | 1141 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); |
960 | } | 1142 | } |
961 | 1143 | ||
962 | /* | 1144 | /* |
963 | * Return a high guess at the load of a migration-target cpu | 1145 | * Return a high guess at the load of a migration-target cpu weighted |
1146 | * according to the scheduling class and "nice" value. | ||
964 | */ | 1147 | */ |
965 | static inline unsigned long target_load(int cpu, int type) | 1148 | static inline unsigned long target_load(int cpu, int type) |
966 | { | 1149 | { |
967 | runqueue_t *rq = cpu_rq(cpu); | 1150 | struct rq *rq = cpu_rq(cpu); |
968 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1151 | |
969 | if (type == 0) | 1152 | if (type == 0) |
970 | return load_now; | 1153 | return rq->raw_weighted_load; |
971 | 1154 | ||
972 | return max(rq->cpu_load[type-1], load_now); | 1155 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); |
1156 | } | ||
1157 | |||
1158 | /* | ||
1159 | * Return the average load per task on the cpu's run queue | ||
1160 | */ | ||
1161 | static inline unsigned long cpu_avg_load_per_task(int cpu) | ||
1162 | { | ||
1163 | struct rq *rq = cpu_rq(cpu); | ||
1164 | unsigned long n = rq->nr_running; | ||
1165 | |||
1166 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; | ||
973 | } | 1167 | } |
974 | 1168 | ||
975 | /* | 1169 | /* |
@@ -1042,7 +1236,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
1042 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1236 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
1043 | 1237 | ||
1044 | for_each_cpu_mask(i, tmp) { | 1238 | for_each_cpu_mask(i, tmp) { |
1045 | load = source_load(i, 0); | 1239 | load = weighted_cpuload(i); |
1046 | 1240 | ||
1047 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1241 | if (load < min_load || (load == min_load && i == this_cpu)) { |
1048 | min_load = load; | 1242 | min_load = load; |
@@ -1069,9 +1263,15 @@ static int sched_balance_self(int cpu, int flag) | |||
1069 | struct task_struct *t = current; | 1263 | struct task_struct *t = current; |
1070 | struct sched_domain *tmp, *sd = NULL; | 1264 | struct sched_domain *tmp, *sd = NULL; |
1071 | 1265 | ||
1072 | for_each_domain(cpu, tmp) | 1266 | for_each_domain(cpu, tmp) { |
1267 | /* | ||
1268 | * If power savings logic is enabled for a domain, stop there. | ||
1269 | */ | ||
1270 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
1271 | break; | ||
1073 | if (tmp->flags & flag) | 1272 | if (tmp->flags & flag) |
1074 | sd = tmp; | 1273 | sd = tmp; |
1274 | } | ||
1075 | 1275 | ||
1076 | while (sd) { | 1276 | while (sd) { |
1077 | cpumask_t span; | 1277 | cpumask_t span; |
@@ -1116,7 +1316,7 @@ nextlevel: | |||
1116 | * Returns the CPU we should wake onto. | 1316 | * Returns the CPU we should wake onto. |
1117 | */ | 1317 | */ |
1118 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | 1318 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) |
1119 | static int wake_idle(int cpu, task_t *p) | 1319 | static int wake_idle(int cpu, struct task_struct *p) |
1120 | { | 1320 | { |
1121 | cpumask_t tmp; | 1321 | cpumask_t tmp; |
1122 | struct sched_domain *sd; | 1322 | struct sched_domain *sd; |
@@ -1139,7 +1339,7 @@ static int wake_idle(int cpu, task_t *p) | |||
1139 | return cpu; | 1339 | return cpu; |
1140 | } | 1340 | } |
1141 | #else | 1341 | #else |
1142 | static inline int wake_idle(int cpu, task_t *p) | 1342 | static inline int wake_idle(int cpu, struct task_struct *p) |
1143 | { | 1343 | { |
1144 | return cpu; | 1344 | return cpu; |
1145 | } | 1345 | } |
@@ -1159,15 +1359,15 @@ static inline int wake_idle(int cpu, task_t *p) | |||
1159 | * | 1359 | * |
1160 | * returns failure only if the task is already active. | 1360 | * returns failure only if the task is already active. |
1161 | */ | 1361 | */ |
1162 | static int try_to_wake_up(task_t *p, unsigned int state, int sync) | 1362 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
1163 | { | 1363 | { |
1164 | int cpu, this_cpu, success = 0; | 1364 | int cpu, this_cpu, success = 0; |
1165 | unsigned long flags; | 1365 | unsigned long flags; |
1166 | long old_state; | 1366 | long old_state; |
1167 | runqueue_t *rq; | 1367 | struct rq *rq; |
1168 | #ifdef CONFIG_SMP | 1368 | #ifdef CONFIG_SMP |
1169 | unsigned long load, this_load; | ||
1170 | struct sched_domain *sd, *this_sd = NULL; | 1369 | struct sched_domain *sd, *this_sd = NULL; |
1370 | unsigned long load, this_load; | ||
1171 | int new_cpu; | 1371 | int new_cpu; |
1172 | #endif | 1372 | #endif |
1173 | 1373 | ||
@@ -1221,17 +1421,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
1221 | 1421 | ||
1222 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1422 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1223 | unsigned long tl = this_load; | 1423 | unsigned long tl = this_load; |
1424 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1425 | |||
1224 | /* | 1426 | /* |
1225 | * If sync wakeup then subtract the (maximum possible) | 1427 | * If sync wakeup then subtract the (maximum possible) |
1226 | * effect of the currently running task from the load | 1428 | * effect of the currently running task from the load |
1227 | * of the current CPU: | 1429 | * of the current CPU: |
1228 | */ | 1430 | */ |
1229 | if (sync) | 1431 | if (sync) |
1230 | tl -= SCHED_LOAD_SCALE; | 1432 | tl -= current->load_weight; |
1231 | 1433 | ||
1232 | if ((tl <= load && | 1434 | if ((tl <= load && |
1233 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | 1435 | tl + target_load(cpu, idx) <= tl_per_task) || |
1234 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | 1436 | 100*(tl + p->load_weight) <= imbalance*load) { |
1235 | /* | 1437 | /* |
1236 | * This domain has SD_WAKE_AFFINE and | 1438 | * This domain has SD_WAKE_AFFINE and |
1237 | * p is cache cold in this domain, and | 1439 | * p is cache cold in this domain, and |
@@ -1315,15 +1517,14 @@ out: | |||
1315 | return success; | 1517 | return success; |
1316 | } | 1518 | } |
1317 | 1519 | ||
1318 | int fastcall wake_up_process(task_t *p) | 1520 | int fastcall wake_up_process(struct task_struct *p) |
1319 | { | 1521 | { |
1320 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1522 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | |
1321 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | 1523 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); |
1322 | } | 1524 | } |
1323 | |||
1324 | EXPORT_SYMBOL(wake_up_process); | 1525 | EXPORT_SYMBOL(wake_up_process); |
1325 | 1526 | ||
1326 | int fastcall wake_up_state(task_t *p, unsigned int state) | 1527 | int fastcall wake_up_state(struct task_struct *p, unsigned int state) |
1327 | { | 1528 | { |
1328 | return try_to_wake_up(p, state, 0); | 1529 | return try_to_wake_up(p, state, 0); |
1329 | } | 1530 | } |
@@ -1332,7 +1533,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state) | |||
1332 | * Perform scheduler related setup for a newly forked process p. | 1533 | * Perform scheduler related setup for a newly forked process p. |
1333 | * p is forked by current. | 1534 | * p is forked by current. |
1334 | */ | 1535 | */ |
1335 | void fastcall sched_fork(task_t *p, int clone_flags) | 1536 | void fastcall sched_fork(struct task_struct *p, int clone_flags) |
1336 | { | 1537 | { |
1337 | int cpu = get_cpu(); | 1538 | int cpu = get_cpu(); |
1338 | 1539 | ||
@@ -1348,10 +1549,17 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1348 | * event cannot wake it up and insert it on the runqueue either. | 1549 | * event cannot wake it up and insert it on the runqueue either. |
1349 | */ | 1550 | */ |
1350 | p->state = TASK_RUNNING; | 1551 | p->state = TASK_RUNNING; |
1552 | |||
1553 | /* | ||
1554 | * Make sure we do not leak PI boosting priority to the child: | ||
1555 | */ | ||
1556 | p->prio = current->normal_prio; | ||
1557 | |||
1351 | INIT_LIST_HEAD(&p->run_list); | 1558 | INIT_LIST_HEAD(&p->run_list); |
1352 | p->array = NULL; | 1559 | p->array = NULL; |
1353 | #ifdef CONFIG_SCHEDSTATS | 1560 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1354 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1561 | if (unlikely(sched_info_on())) |
1562 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | ||
1355 | #endif | 1563 | #endif |
1356 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1564 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
1357 | p->oncpu = 0; | 1565 | p->oncpu = 0; |
@@ -1394,11 +1602,11 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1394 | * that must be done for every newly created context, then puts the task | 1602 | * that must be done for every newly created context, then puts the task |
1395 | * on the runqueue and wakes it. | 1603 | * on the runqueue and wakes it. |
1396 | */ | 1604 | */ |
1397 | void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | 1605 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
1398 | { | 1606 | { |
1607 | struct rq *rq, *this_rq; | ||
1399 | unsigned long flags; | 1608 | unsigned long flags; |
1400 | int this_cpu, cpu; | 1609 | int this_cpu, cpu; |
1401 | runqueue_t *rq, *this_rq; | ||
1402 | 1610 | ||
1403 | rq = task_rq_lock(p, &flags); | 1611 | rq = task_rq_lock(p, &flags); |
1404 | BUG_ON(p->state != TASK_RUNNING); | 1612 | BUG_ON(p->state != TASK_RUNNING); |
@@ -1427,10 +1635,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
1427 | __activate_task(p, rq); | 1635 | __activate_task(p, rq); |
1428 | else { | 1636 | else { |
1429 | p->prio = current->prio; | 1637 | p->prio = current->prio; |
1638 | p->normal_prio = current->normal_prio; | ||
1430 | list_add_tail(&p->run_list, ¤t->run_list); | 1639 | list_add_tail(&p->run_list, ¤t->run_list); |
1431 | p->array = current->array; | 1640 | p->array = current->array; |
1432 | p->array->nr_active++; | 1641 | p->array->nr_active++; |
1433 | rq->nr_running++; | 1642 | inc_nr_running(p, rq); |
1434 | } | 1643 | } |
1435 | set_need_resched(); | 1644 | set_need_resched(); |
1436 | } else | 1645 | } else |
@@ -1477,10 +1686,10 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
1477 | * artificially, because any timeslice recovered here | 1686 | * artificially, because any timeslice recovered here |
1478 | * was given away by the parent in the first place.) | 1687 | * was given away by the parent in the first place.) |
1479 | */ | 1688 | */ |
1480 | void fastcall sched_exit(task_t *p) | 1689 | void fastcall sched_exit(struct task_struct *p) |
1481 | { | 1690 | { |
1482 | unsigned long flags; | 1691 | unsigned long flags; |
1483 | runqueue_t *rq; | 1692 | struct rq *rq; |
1484 | 1693 | ||
1485 | /* | 1694 | /* |
1486 | * If the child was a (relative-) CPU hog then decrease | 1695 | * If the child was a (relative-) CPU hog then decrease |
@@ -1511,7 +1720,7 @@ void fastcall sched_exit(task_t *p) | |||
1511 | * prepare_task_switch sets up locking and calls architecture specific | 1720 | * prepare_task_switch sets up locking and calls architecture specific |
1512 | * hooks. | 1721 | * hooks. |
1513 | */ | 1722 | */ |
1514 | static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | 1723 | static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) |
1515 | { | 1724 | { |
1516 | prepare_lock_switch(rq, next); | 1725 | prepare_lock_switch(rq, next); |
1517 | prepare_arch_switch(next); | 1726 | prepare_arch_switch(next); |
@@ -1532,7 +1741,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | |||
1532 | * with the lock held can cause deadlocks; see schedule() for | 1741 | * with the lock held can cause deadlocks; see schedule() for |
1533 | * details.) | 1742 | * details.) |
1534 | */ | 1743 | */ |
1535 | static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | 1744 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) |
1536 | __releases(rq->lock) | 1745 | __releases(rq->lock) |
1537 | { | 1746 | { |
1538 | struct mm_struct *mm = rq->prev_mm; | 1747 | struct mm_struct *mm = rq->prev_mm; |
@@ -1570,10 +1779,11 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | |||
1570 | * schedule_tail - first thing a freshly forked thread must call. | 1779 | * schedule_tail - first thing a freshly forked thread must call. |
1571 | * @prev: the thread we just switched away from. | 1780 | * @prev: the thread we just switched away from. |
1572 | */ | 1781 | */ |
1573 | asmlinkage void schedule_tail(task_t *prev) | 1782 | asmlinkage void schedule_tail(struct task_struct *prev) |
1574 | __releases(rq->lock) | 1783 | __releases(rq->lock) |
1575 | { | 1784 | { |
1576 | runqueue_t *rq = this_rq(); | 1785 | struct rq *rq = this_rq(); |
1786 | |||
1577 | finish_task_switch(rq, prev); | 1787 | finish_task_switch(rq, prev); |
1578 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 1788 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
1579 | /* In this case, finish_task_switch does not reenable preemption */ | 1789 | /* In this case, finish_task_switch does not reenable preemption */ |
@@ -1587,8 +1797,9 @@ asmlinkage void schedule_tail(task_t *prev) | |||
1587 | * context_switch - switch to the new MM and the new | 1797 | * context_switch - switch to the new MM and the new |
1588 | * thread's register state. | 1798 | * thread's register state. |
1589 | */ | 1799 | */ |
1590 | static inline | 1800 | static inline struct task_struct * |
1591 | task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) | 1801 | context_switch(struct rq *rq, struct task_struct *prev, |
1802 | struct task_struct *next) | ||
1592 | { | 1803 | { |
1593 | struct mm_struct *mm = next->mm; | 1804 | struct mm_struct *mm = next->mm; |
1594 | struct mm_struct *oldmm = prev->active_mm; | 1805 | struct mm_struct *oldmm = prev->active_mm; |
@@ -1605,6 +1816,15 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) | |||
1605 | WARN_ON(rq->prev_mm); | 1816 | WARN_ON(rq->prev_mm); |
1606 | rq->prev_mm = oldmm; | 1817 | rq->prev_mm = oldmm; |
1607 | } | 1818 | } |
1819 | /* | ||
1820 | * The runqueue lock will be released by the next | ||
1821 | * task (which is an invalid locking op but in the case | ||
1822 | * of the scheduler it's an obvious special-case), so we | ||
1823 | * do an early lockdep release here: | ||
1824 | */ | ||
1825 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
1826 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | ||
1827 | #endif | ||
1608 | 1828 | ||
1609 | /* Here we just switch the register state and the stack. */ | 1829 | /* Here we just switch the register state and the stack. */ |
1610 | switch_to(prev, next, prev); | 1830 | switch_to(prev, next, prev); |
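
Editor's note: the spin_release() annotation added above records an unusual hand-off: the runqueue lock is taken by the previous task before switch_to(), but it will be unlocked by the next task once it resumes in finish_task_switch(). A validator that tracks held locks per task has to be told about that release explicitly. Below is a user-space sketch of the underlying pattern, using a bare C11 atomic-flag spinlock (which, unlike a pthread mutex, may legally be released by a different thread). It is an illustration of the hand-off only, not of lockdep itself.

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_flag rq_lock = ATOMIC_FLAG_INIT;

static void *next_task(void *arg)
{
	(void)arg;
	/* The "next" task finishes the switch and drops the lock that
	 * the "previous" task acquired. */
	atomic_flag_clear(&rq_lock);
	puts("lock released by a different thread");
	return NULL;
}

int main(void)
{
	pthread_t next;

	while (atomic_flag_test_and_set(&rq_lock))
		;			/* "prev" acquires the lock */
	pthread_create(&next, NULL, next_task, NULL);
	pthread_join(&next, NULL);	/* by now the lock is free again */

	while (atomic_flag_test_and_set(&rq_lock))
		;			/* reacquiring proves it was dropped */
	atomic_flag_clear(&rq_lock);
	return 0;
}
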
@@ -1648,7 +1868,8 @@ unsigned long nr_uninterruptible(void) | |||
1648 | 1868 | ||
1649 | unsigned long long nr_context_switches(void) | 1869 | unsigned long long nr_context_switches(void) |
1650 | { | 1870 | { |
1651 | unsigned long long i, sum = 0; | 1871 | int i; |
1872 | unsigned long long sum = 0; | ||
1652 | 1873 | ||
1653 | for_each_possible_cpu(i) | 1874 | for_each_possible_cpu(i) |
1654 | sum += cpu_rq(i)->nr_switches; | 1875 | sum += cpu_rq(i)->nr_switches; |
@@ -1684,15 +1905,21 @@ unsigned long nr_active(void) | |||
1684 | #ifdef CONFIG_SMP | 1905 | #ifdef CONFIG_SMP |
1685 | 1906 | ||
1686 | /* | 1907 | /* |
1908 | * Is this task likely cache-hot: | ||
1909 | */ | ||
1910 | static inline int | ||
1911 | task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) | ||
1912 | { | ||
1913 | return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; | ||
1914 | } | ||
1915 | |||
1916 | /* | ||
1687 | * double_rq_lock - safely lock two runqueues | 1917 | * double_rq_lock - safely lock two runqueues |
1688 | * | 1918 | * |
1689 | * We must take them in cpu order to match code in | ||
1690 | * dependent_sleeper and wake_dependent_sleeper. | ||
1691 | * | ||
1692 | * Note this does not disable interrupts like task_rq_lock, | 1919 | * Note this does not disable interrupts like task_rq_lock, |
1693 | * you need to do so manually before calling. | 1920 | * you need to do so manually before calling. |
1694 | */ | 1921 | */ |
1695 | static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | 1922 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) |
1696 | __acquires(rq1->lock) | 1923 | __acquires(rq1->lock) |
1697 | __acquires(rq2->lock) | 1924 | __acquires(rq2->lock) |
1698 | { | 1925 | { |
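
Editor's note: task_hot(), added at the top of this hunk, packages the cache-affinity test used by the migration code: a task whose last run was more recent than the domain's cache_hot_time is assumed to still have warm caches. A stand-alone sketch with simplified structs and made-up nanosecond values:

#include <stdio.h>

struct task   { unsigned long long last_ran; };		/* ns timestamp */
struct domain { unsigned long long cache_hot_time; };	/* ns */

static int task_hot(const struct task *p, unsigned long long now,
		    const struct domain *sd)
{
	/* The signed cast means a timestamp slightly ahead of "now"
	 * (e.g. taken on another CPU) still counts as hot. */
	return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
}

int main(void)
{
	struct domain sd = { .cache_hot_time = 10000000ULL };	/* 10 ms */
	struct task p = { .last_ran = 5000000ULL };

	printf("hot at t=7ms:  %d\n", task_hot(&p,  7000000ULL, &sd));	/* 1 */
	printf("hot at t=20ms: %d\n", task_hot(&p, 20000000ULL, &sd));	/* 0 */
	return 0;
}
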
@@ -1700,7 +1927,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
1700 | spin_lock(&rq1->lock); | 1927 | spin_lock(&rq1->lock); |
1701 | __acquire(rq2->lock); /* Fake it out ;) */ | 1928 | __acquire(rq2->lock); /* Fake it out ;) */ |
1702 | } else { | 1929 | } else { |
1703 | if (rq1->cpu < rq2->cpu) { | 1930 | if (rq1 < rq2) { |
1704 | spin_lock(&rq1->lock); | 1931 | spin_lock(&rq1->lock); |
1705 | spin_lock(&rq2->lock); | 1932 | spin_lock(&rq2->lock); |
1706 | } else { | 1933 | } else { |
@@ -1716,7 +1943,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
1716 | * Note this does not restore interrupts like task_rq_unlock, | 1943 | * Note this does not restore interrupts like task_rq_unlock, |
1717 | * you need to do so manually after calling. | 1944 | * you need to do so manually after calling. |
1718 | */ | 1945 | */ |
1719 | static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | 1946 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) |
1720 | __releases(rq1->lock) | 1947 | __releases(rq1->lock) |
1721 | __releases(rq2->lock) | 1948 | __releases(rq2->lock) |
1722 | { | 1949 | { |
@@ -1730,13 +1957,13 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) | |||
1730 | /* | 1957 | /* |
1731 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 1958 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
1732 | */ | 1959 | */ |
1733 | static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | 1960 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) |
1734 | __releases(this_rq->lock) | 1961 | __releases(this_rq->lock) |
1735 | __acquires(busiest->lock) | 1962 | __acquires(busiest->lock) |
1736 | __acquires(this_rq->lock) | 1963 | __acquires(this_rq->lock) |
1737 | { | 1964 | { |
1738 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1965 | if (unlikely(!spin_trylock(&busiest->lock))) { |
1739 | if (busiest->cpu < this_rq->cpu) { | 1966 | if (busiest < this_rq) { |
1740 | spin_unlock(&this_rq->lock); | 1967 | spin_unlock(&this_rq->lock); |
1741 | spin_lock(&busiest->lock); | 1968 | spin_lock(&busiest->lock); |
1742 | spin_lock(&this_rq->lock); | 1969 | spin_lock(&this_rq->lock); |
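
Editor's note: both double_rq_lock() and double_lock_balance() now order the two locks by runqueue address instead of by CPU number. Any single global order works; the point is that two CPUs locking the same pair can never each end up holding one half. A user-space sketch with pthread mutexes standing in for rq->lock (the kernel version additionally fakes the second acquisition when both arguments are the same queue):

#include <pthread.h>
#include <stdio.h>

struct rq { pthread_mutex_t lock; };

static void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
	if (rq1 == rq2) {
		pthread_mutex_lock(&rq1->lock);		/* only one lock to take */
	} else if (rq1 < rq2) {				/* lower address first */
		pthread_mutex_lock(&rq1->lock);
		pthread_mutex_lock(&rq2->lock);
	} else {
		pthread_mutex_lock(&rq2->lock);
		pthread_mutex_lock(&rq1->lock);
	}
}

static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
{
	pthread_mutex_unlock(&rq1->lock);
	if (rq1 != rq2)
		pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
	struct rq a, b;

	pthread_mutex_init(&a.lock, NULL);
	pthread_mutex_init(&b.lock, NULL);

	double_rq_lock(&b, &a);		/* argument order does not matter */
	double_rq_unlock(&b, &a);
	puts("locked and unlocked both runqueues in one global order");
	return 0;
}
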
@@ -1751,11 +1978,11 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
1751 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 1978 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
1752 | * the cpu_allowed mask is restored. | 1979 | * the cpu_allowed mask is restored. |
1753 | */ | 1980 | */ |
1754 | static void sched_migrate_task(task_t *p, int dest_cpu) | 1981 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) |
1755 | { | 1982 | { |
1756 | migration_req_t req; | 1983 | struct migration_req req; |
1757 | runqueue_t *rq; | ||
1758 | unsigned long flags; | 1984 | unsigned long flags; |
1985 | struct rq *rq; | ||
1759 | 1986 | ||
1760 | rq = task_rq_lock(p, &flags); | 1987 | rq = task_rq_lock(p, &flags); |
1761 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | 1988 | if (!cpu_isset(dest_cpu, p->cpus_allowed) |
@@ -1766,11 +1993,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu) | |||
1766 | if (migrate_task(p, dest_cpu, &req)) { | 1993 | if (migrate_task(p, dest_cpu, &req)) { |
1767 | /* Need to wait for migration thread (might exit: take ref). */ | 1994 | /* Need to wait for migration thread (might exit: take ref). */ |
1768 | struct task_struct *mt = rq->migration_thread; | 1995 | struct task_struct *mt = rq->migration_thread; |
1996 | |||
1769 | get_task_struct(mt); | 1997 | get_task_struct(mt); |
1770 | task_rq_unlock(rq, &flags); | 1998 | task_rq_unlock(rq, &flags); |
1771 | wake_up_process(mt); | 1999 | wake_up_process(mt); |
1772 | put_task_struct(mt); | 2000 | put_task_struct(mt); |
1773 | wait_for_completion(&req.done); | 2001 | wait_for_completion(&req.done); |
2002 | |||
1774 | return; | 2003 | return; |
1775 | } | 2004 | } |
1776 | out: | 2005 | out: |
@@ -1794,14 +2023,14 @@ void sched_exec(void) | |||
1794 | * pull_task - move a task from a remote runqueue to the local runqueue. | 2023 | * pull_task - move a task from a remote runqueue to the local runqueue. |
1795 | * Both runqueues must be locked. | 2024 | * Both runqueues must be locked. |
1796 | */ | 2025 | */ |
1797 | static | 2026 | static void pull_task(struct rq *src_rq, struct prio_array *src_array, |
1798 | void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | 2027 | struct task_struct *p, struct rq *this_rq, |
1799 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 2028 | struct prio_array *this_array, int this_cpu) |
1800 | { | 2029 | { |
1801 | dequeue_task(p, src_array); | 2030 | dequeue_task(p, src_array); |
1802 | src_rq->nr_running--; | 2031 | dec_nr_running(p, src_rq); |
1803 | set_task_cpu(p, this_cpu); | 2032 | set_task_cpu(p, this_cpu); |
1804 | this_rq->nr_running++; | 2033 | inc_nr_running(p, this_rq); |
1805 | enqueue_task(p, this_array); | 2034 | enqueue_task(p, this_array); |
1806 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 2035 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1807 | + this_rq->timestamp_last_tick; | 2036 | + this_rq->timestamp_last_tick; |
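
Editor's note: pull_task() now goes through inc_nr_running()/dec_nr_running() instead of touching nr_running directly. Their definitions sit outside this hunk; presumably they also keep the runqueue's raw_weighted_load in step with the moved task's load_weight, which is what this stand-alone sketch assumes:

#include <stdio.h>

struct task { unsigned int load_weight; };
struct rq   { unsigned long nr_running, raw_weighted_load; };

static void inc_nr_running(const struct task *p, struct rq *rq)
{
	rq->nr_running++;
	rq->raw_weighted_load += p->load_weight;
}

static void dec_nr_running(const struct task *p, struct rq *rq)
{
	rq->nr_running--;
	rq->raw_weighted_load -= p->load_weight;
}

int main(void)
{
	struct rq src = { 3, 3072 }, dst = { 1, 1024 };
	struct task p = { .load_weight = 1024 };

	dec_nr_running(&p, &src);	/* task leaves the busy queue */
	inc_nr_running(&p, &dst);	/* and arrives on this one */
	printf("src: %lu tasks, load %lu; dst: %lu tasks, load %lu\n",
	       src.nr_running, src.raw_weighted_load,
	       dst.nr_running, dst.raw_weighted_load);
	return 0;
}
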
@@ -1817,7 +2046,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1817 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 2046 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
1818 | */ | 2047 | */ |
1819 | static | 2048 | static |
1820 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 2049 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
1821 | struct sched_domain *sd, enum idle_type idle, | 2050 | struct sched_domain *sd, enum idle_type idle, |
1822 | int *all_pinned) | 2051 | int *all_pinned) |
1823 | { | 2052 | { |
@@ -1848,26 +2077,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
1848 | return 1; | 2077 | return 1; |
1849 | } | 2078 | } |
1850 | 2079 | ||
2080 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | ||
2081 | |||
1851 | /* | 2082 | /* |
1852 | * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, | 2083 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted |
1853 | * as part of a balancing operation within "domain". Returns the number of | 2084 | * load from busiest to this_rq, as part of a balancing operation within |
1854 | * tasks moved. | 2085 | * "domain". Returns the number of tasks moved. |
1855 | * | 2086 | * |
1856 | * Called with both runqueues locked. | 2087 | * Called with both runqueues locked. |
1857 | */ | 2088 | */ |
1858 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 2089 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
1859 | unsigned long max_nr_move, struct sched_domain *sd, | 2090 | unsigned long max_nr_move, unsigned long max_load_move, |
1860 | enum idle_type idle, int *all_pinned) | 2091 | struct sched_domain *sd, enum idle_type idle, |
2092 | int *all_pinned) | ||
1861 | { | 2093 | { |
1862 | prio_array_t *array, *dst_array; | 2094 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, |
2095 | best_prio_seen, skip_for_load; | ||
2096 | struct prio_array *array, *dst_array; | ||
1863 | struct list_head *head, *curr; | 2097 | struct list_head *head, *curr; |
1864 | int idx, pulled = 0, pinned = 0; | 2098 | struct task_struct *tmp; |
1865 | task_t *tmp; | 2099 | long rem_load_move; |
1866 | 2100 | ||
1867 | if (max_nr_move == 0) | 2101 | if (max_nr_move == 0 || max_load_move == 0) |
1868 | goto out; | 2102 | goto out; |
1869 | 2103 | ||
2104 | rem_load_move = max_load_move; | ||
1870 | pinned = 1; | 2105 | pinned = 1; |
2106 | this_best_prio = rq_best_prio(this_rq); | ||
2107 | best_prio = rq_best_prio(busiest); | ||
2108 | /* | ||
2109 | * Enable handling of the case where there is more than one task | ||
2110 | * with the best priority. If the current running task is one | ||
2111 | * of those with prio==best_prio we know it won't be moved | ||
2112 | * and therefore it's safe to override the skip (based on load) of | ||
2113 | * any task we find with that prio. | ||
2114 | */ | ||
2115 | best_prio_seen = best_prio == busiest->curr->prio; | ||
1871 | 2116 | ||
1872 | /* | 2117 | /* |
1873 | * We first consider expired tasks. Those will likely not be | 2118 | * We first consider expired tasks. Those will likely not be |
@@ -1903,11 +2148,22 @@ skip_bitmap: | |||
1903 | head = array->queue + idx; | 2148 | head = array->queue + idx; |
1904 | curr = head->prev; | 2149 | curr = head->prev; |
1905 | skip_queue: | 2150 | skip_queue: |
1906 | tmp = list_entry(curr, task_t, run_list); | 2151 | tmp = list_entry(curr, struct task_struct, run_list); |
1907 | 2152 | ||
1908 | curr = curr->prev; | 2153 | curr = curr->prev; |
1909 | 2154 | ||
1910 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2155 | /* |
2156 | * To help distribute high priority tasks across CPUs we don't | ||
2157 | * skip a task if it will be the highest priority task (i.e. smallest | ||
2158 | * prio value) on its new queue regardless of its load weight | ||
2159 | */ | ||
2160 | skip_for_load = tmp->load_weight > rem_load_move; | ||
2161 | if (skip_for_load && idx < this_best_prio) | ||
2162 | skip_for_load = !best_prio_seen && idx == best_prio; | ||
2163 | if (skip_for_load || | ||
2164 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | ||
2165 | |||
2166 | best_prio_seen |= idx == best_prio; | ||
1911 | if (curr != head) | 2167 | if (curr != head) |
1912 | goto skip_queue; | 2168 | goto skip_queue; |
1913 | idx++; | 2169 | idx++; |
@@ -1921,9 +2177,15 @@ skip_queue: | |||
1921 | 2177 | ||
1922 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2178 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
1923 | pulled++; | 2179 | pulled++; |
2180 | rem_load_move -= tmp->load_weight; | ||
1924 | 2181 | ||
1925 | /* We only want to steal up to the prescribed number of tasks. */ | 2182 | /* |
1926 | if (pulled < max_nr_move) { | 2183 | * We only want to steal up to the prescribed number of tasks |
2184 | * and the prescribed amount of weighted load. | ||
2185 | */ | ||
2186 | if (pulled < max_nr_move && rem_load_move > 0) { | ||
2187 | if (idx < this_best_prio) | ||
2188 | this_best_prio = idx; | ||
1927 | if (curr != head) | 2189 | if (curr != head) |
1928 | goto skip_queue; | 2190 | goto skip_queue; |
1929 | idx++; | 2191 | idx++; |
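
Editor's note: move_tasks() is now bounded by a weighted-load budget (max_load_move / rem_load_move) as well as a task count, and the hunks above add a skip test for candidates too heavy for the remaining budget, with an escape hatch so the highest-priority task can still be spread across CPUs. A simplified stand-alone version of just that decision; the values in main() are invented:

#include <stdio.h>

static int skip_candidate(unsigned long load_weight, unsigned long rem_load_move,
			  int idx, int this_best_prio,
			  int best_prio, int best_prio_seen)
{
	int skip_for_load = load_weight > rem_load_move;

	/* A heavy task that would still be the best (lowest) prio on the
	 * destination is pulled anyway, except that the source keeps the
	 * first best-prio task seen when its running task is not one. */
	if (skip_for_load && idx < this_best_prio)
		skip_for_load = !best_prio_seen && idx == best_prio;
	return skip_for_load;
}

int main(void)
{
	/* Heavy candidate: weight 2048 against a remaining budget of 1024. */
	printf("%d\n", skip_candidate(2048, 1024, 90, 100, 90, 1));	/* 0: pulled */
	printf("%d\n", skip_candidate(2048, 1024, 90, 100, 90, 0));	/* 1: stays on source */
	/* Light candidate always fits the budget. */
	printf("%d\n", skip_candidate(512, 1024, 110, 100, 90, 0));	/* 0: pulled */
	return 0;
}
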
@@ -1944,8 +2206,8 @@ out: | |||
1944 | 2206 | ||
1945 | /* | 2207 | /* |
1946 | * find_busiest_group finds and returns the busiest CPU group within the | 2208 | * find_busiest_group finds and returns the busiest CPU group within the |
1947 | * domain. It calculates and returns the number of tasks which should be | 2209 | * domain. It calculates and returns the amount of weighted load which |
1948 | * moved to restore balance via the imbalance parameter. | 2210 | * should be moved to restore balance via the imbalance parameter. |
1949 | */ | 2211 | */ |
1950 | static struct sched_group * | 2212 | static struct sched_group * |
1951 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2213 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
@@ -1954,9 +2216,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1954 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2216 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
1955 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2217 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
1956 | unsigned long max_pull; | 2218 | unsigned long max_pull; |
2219 | unsigned long busiest_load_per_task, busiest_nr_running; | ||
2220 | unsigned long this_load_per_task, this_nr_running; | ||
1957 | int load_idx; | 2221 | int load_idx; |
2222 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2223 | int power_savings_balance = 1; | ||
2224 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | ||
2225 | unsigned long min_nr_running = ULONG_MAX; | ||
2226 | struct sched_group *group_min = NULL, *group_leader = NULL; | ||
2227 | #endif | ||
1958 | 2228 | ||
1959 | max_load = this_load = total_load = total_pwr = 0; | 2229 | max_load = this_load = total_load = total_pwr = 0; |
2230 | busiest_load_per_task = busiest_nr_running = 0; | ||
2231 | this_load_per_task = this_nr_running = 0; | ||
1960 | if (idle == NOT_IDLE) | 2232 | if (idle == NOT_IDLE) |
1961 | load_idx = sd->busy_idx; | 2233 | load_idx = sd->busy_idx; |
1962 | else if (idle == NEWLY_IDLE) | 2234 | else if (idle == NEWLY_IDLE) |
@@ -1965,16 +2237,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1965 | load_idx = sd->idle_idx; | 2237 | load_idx = sd->idle_idx; |
1966 | 2238 | ||
1967 | do { | 2239 | do { |
1968 | unsigned long load; | 2240 | unsigned long load, group_capacity; |
1969 | int local_group; | 2241 | int local_group; |
1970 | int i; | 2242 | int i; |
2243 | unsigned long sum_nr_running, sum_weighted_load; | ||
1971 | 2244 | ||
1972 | local_group = cpu_isset(this_cpu, group->cpumask); | 2245 | local_group = cpu_isset(this_cpu, group->cpumask); |
1973 | 2246 | ||
1974 | /* Tally up the load of all CPUs in the group */ | 2247 | /* Tally up the load of all CPUs in the group */ |
1975 | avg_load = 0; | 2248 | sum_weighted_load = sum_nr_running = avg_load = 0; |
1976 | 2249 | ||
1977 | for_each_cpu_mask(i, group->cpumask) { | 2250 | for_each_cpu_mask(i, group->cpumask) { |
2251 | struct rq *rq = cpu_rq(i); | ||
2252 | |||
1978 | if (*sd_idle && !idle_cpu(i)) | 2253 | if (*sd_idle && !idle_cpu(i)) |
1979 | *sd_idle = 0; | 2254 | *sd_idle = 0; |
1980 | 2255 | ||
@@ -1985,6 +2260,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1985 | load = source_load(i, load_idx); | 2260 | load = source_load(i, load_idx); |
1986 | 2261 | ||
1987 | avg_load += load; | 2262 | avg_load += load; |
2263 | sum_nr_running += rq->nr_running; | ||
2264 | sum_weighted_load += rq->raw_weighted_load; | ||
1988 | } | 2265 | } |
1989 | 2266 | ||
1990 | total_load += avg_load; | 2267 | total_load += avg_load; |
@@ -1993,17 +2270,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1993 | /* Adjust by relative CPU power of the group */ | 2270 | /* Adjust by relative CPU power of the group */ |
1994 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2271 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
1995 | 2272 | ||
2273 | group_capacity = group->cpu_power / SCHED_LOAD_SCALE; | ||
2274 | |||
1996 | if (local_group) { | 2275 | if (local_group) { |
1997 | this_load = avg_load; | 2276 | this_load = avg_load; |
1998 | this = group; | 2277 | this = group; |
1999 | } else if (avg_load > max_load) { | 2278 | this_nr_running = sum_nr_running; |
2279 | this_load_per_task = sum_weighted_load; | ||
2280 | } else if (avg_load > max_load && | ||
2281 | sum_nr_running > group_capacity) { | ||
2000 | max_load = avg_load; | 2282 | max_load = avg_load; |
2001 | busiest = group; | 2283 | busiest = group; |
2284 | busiest_nr_running = sum_nr_running; | ||
2285 | busiest_load_per_task = sum_weighted_load; | ||
2002 | } | 2286 | } |
2287 | |||
2288 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2289 | /* | ||
2290 | * Busy processors will not participate in power savings | ||
2291 | * balance. | ||
2292 | */ | ||
2293 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2294 | goto group_next; | ||
2295 | |||
2296 | /* | ||
2297 | * If the local group is idle or completely loaded | ||
2298 | * no need to do power savings balance at this domain | ||
2299 | */ | ||
2300 | if (local_group && (this_nr_running >= group_capacity || | ||
2301 | !this_nr_running)) | ||
2302 | power_savings_balance = 0; | ||
2303 | |||
2304 | /* | ||
2305 | * If a group is already running at full capacity or idle, | ||
2306 | * don't include that group in power savings calculations | ||
2307 | */ | ||
2308 | if (!power_savings_balance || sum_nr_running >= group_capacity | ||
2309 | || !sum_nr_running) | ||
2310 | goto group_next; | ||
2311 | |||
2312 | /* | ||
2313 | * Calculate the group which has the least non-idle load. | ||
2314 | * This is the group from where we need to pick up the load | ||
2315 | * for saving power | ||
2316 | */ | ||
2317 | if ((sum_nr_running < min_nr_running) || | ||
2318 | (sum_nr_running == min_nr_running && | ||
2319 | first_cpu(group->cpumask) < | ||
2320 | first_cpu(group_min->cpumask))) { | ||
2321 | group_min = group; | ||
2322 | min_nr_running = sum_nr_running; | ||
2323 | min_load_per_task = sum_weighted_load / | ||
2324 | sum_nr_running; | ||
2325 | } | ||
2326 | |||
2327 | /* | ||
2328 | * Calculate the group which is almost near its | ||
2329 | * capacity but still has some space to pick up some load | ||
2330 | * from other group and save more power | ||
2331 | */ | ||
2332 | if (sum_nr_running <= group_capacity - 1) { | ||
2333 | if (sum_nr_running > leader_nr_running || | ||
2334 | (sum_nr_running == leader_nr_running && | ||
2335 | first_cpu(group->cpumask) > | ||
2336 | first_cpu(group_leader->cpumask))) { | ||
2337 | group_leader = group; | ||
2338 | leader_nr_running = sum_nr_running; | ||
2339 | } | ||
2340 | } | ||
2341 | group_next: | ||
2342 | #endif | ||
2003 | group = group->next; | 2343 | group = group->next; |
2004 | } while (group != sd->groups); | 2344 | } while (group != sd->groups); |
2005 | 2345 | ||
2006 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) | 2346 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) |
2007 | goto out_balanced; | 2347 | goto out_balanced; |
2008 | 2348 | ||
2009 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 2349 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
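
Editor's note: the CONFIG_SCHED_MC/CONFIG_SCHED_SMT block above adds a power-savings pass to find_busiest_group(): among groups that are neither idle nor at capacity it remembers the most lightly loaded one (group_min, to be drained) and the one closest to full (group_leader, to absorb that load). Below is a heavily simplified stand-alone sketch of that selection; the group data and tie-breaking are invented, and the real code also checks SD_POWERSAVINGS_BALANCE and the local group's state before using the result.

#include <stdio.h>
#include <limits.h>

struct group { const char *name; unsigned long nr_running, capacity; };

int main(void)
{
	struct group groups[] = {
		{ "A", 0, 2 },	/* idle: ignored */
		{ "B", 1, 2 },	/* lightly loaded: candidate to be drained */
		{ "C", 2, 2 },	/* already full: ignored */
		{ "D", 2, 4 },	/* has head-room: candidate leader */
	};
	struct group *group_min = NULL, *group_leader = NULL;
	unsigned long min_nr = ULONG_MAX, leader_nr = 0;

	for (unsigned int i = 0; i < sizeof(groups) / sizeof(groups[0]); i++) {
		struct group *g = &groups[i];

		if (!g->nr_running || g->nr_running >= g->capacity)
			continue;	/* idle or full groups take no part */
		if (g->nr_running < min_nr) {
			group_min = g;
			min_nr = g->nr_running;
		}
		if (g->nr_running <= g->capacity - 1 && g->nr_running >= leader_nr) {
			group_leader = g;
			leader_nr = g->nr_running;
		}
	}
	if (group_min && group_leader && group_min != group_leader)
		printf("power savings: drain group %s into group %s\n",
		       group_min->name, group_leader->name);
	return 0;
}
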
@@ -2012,6 +2352,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2012 | 100*max_load <= sd->imbalance_pct*this_load) | 2352 | 100*max_load <= sd->imbalance_pct*this_load) |
2013 | goto out_balanced; | 2353 | goto out_balanced; |
2014 | 2354 | ||
2355 | busiest_load_per_task /= busiest_nr_running; | ||
2015 | /* | 2356 | /* |
2016 | * We're trying to get all the cpus to the average_load, so we don't | 2357 | * We're trying to get all the cpus to the average_load, so we don't |
2017 | * want to push ourselves above the average load, nor do we wish to | 2358 | * want to push ourselves above the average load, nor do we wish to |
@@ -2023,21 +2364,49 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2023 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2364 | * by pulling tasks to us. Be careful of negative numbers as they'll |
2024 | * appear as very large values with unsigned longs. | 2365 | * appear as very large values with unsigned longs. |
2025 | */ | 2366 | */ |
2367 | if (max_load <= busiest_load_per_task) | ||
2368 | goto out_balanced; | ||
2369 | |||
2370 | /* | ||
2371 | * In the presence of smp nice balancing, certain scenarios can have | ||
2372 | * max load less than avg load(as we skip the groups at or below | ||
2373 | * its cpu_power, while calculating max_load..) | ||
2374 | */ | ||
2375 | if (max_load < avg_load) { | ||
2376 | *imbalance = 0; | ||
2377 | goto small_imbalance; | ||
2378 | } | ||
2026 | 2379 | ||
2027 | /* Don't want to pull so many tasks that a group would go idle */ | 2380 | /* Don't want to pull so many tasks that a group would go idle */ |
2028 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); | 2381 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
2029 | 2382 | ||
2030 | /* How much load to actually move to equalise the imbalance */ | 2383 | /* How much load to actually move to equalise the imbalance */ |
2031 | *imbalance = min(max_pull * busiest->cpu_power, | 2384 | *imbalance = min(max_pull * busiest->cpu_power, |
2032 | (avg_load - this_load) * this->cpu_power) | 2385 | (avg_load - this_load) * this->cpu_power) |
2033 | / SCHED_LOAD_SCALE; | 2386 | / SCHED_LOAD_SCALE; |
2034 | 2387 | ||
2035 | if (*imbalance < SCHED_LOAD_SCALE) { | 2388 | /* |
2036 | unsigned long pwr_now = 0, pwr_move = 0; | 2389 | * if *imbalance is less than the average load per runnable task |
2037 | unsigned long tmp; | 2390 | * there is no guarantee that any tasks will be moved so we'll have |
2391 | * a think about bumping its value to force at least one task to be | ||
2392 | * moved | ||
2393 | */ | ||
2394 | if (*imbalance < busiest_load_per_task) { | ||
2395 | unsigned long tmp, pwr_now, pwr_move; | ||
2396 | unsigned int imbn; | ||
2397 | |||
2398 | small_imbalance: | ||
2399 | pwr_move = pwr_now = 0; | ||
2400 | imbn = 2; | ||
2401 | if (this_nr_running) { | ||
2402 | this_load_per_task /= this_nr_running; | ||
2403 | if (busiest_load_per_task > this_load_per_task) | ||
2404 | imbn = 1; | ||
2405 | } else | ||
2406 | this_load_per_task = SCHED_LOAD_SCALE; | ||
2038 | 2407 | ||
2039 | if (max_load - this_load >= SCHED_LOAD_SCALE*2) { | 2408 | if (max_load - this_load >= busiest_load_per_task * imbn) { |
2040 | *imbalance = 1; | 2409 | *imbalance = busiest_load_per_task; |
2041 | return busiest; | 2410 | return busiest; |
2042 | } | 2411 | } |
2043 | 2412 | ||
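
Editor's note: the new small-imbalance branch above handles the case its comment describes: an imbalance smaller than the busiest queue's average load per task would never move a whole task. A toy illustration of the rounding-up idea; the real code additionally compares pwr_now against pwr_move before committing to the bump:

#include <stdio.h>

static unsigned long fixup_imbalance(unsigned long imbalance,
				     unsigned long busiest_load_per_task)
{
	/* Moving less than one average task's weight is a no-op, so raise
	 * the target to exactly one task's worth of load. */
	if (imbalance < busiest_load_per_task)
		return busiest_load_per_task;
	return imbalance;
}

int main(void)
{
	printf("%lu\n", fixup_imbalance(300, 1024));	/* bumped to 1024 */
	printf("%lu\n", fixup_imbalance(2048, 1024));	/* left at 2048 */
	return 0;
}
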
@@ -2047,39 +2416,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2047 | * moving them. | 2416 | * moving them. |
2048 | */ | 2417 | */ |
2049 | 2418 | ||
2050 | pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); | 2419 | pwr_now += busiest->cpu_power * |
2051 | pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); | 2420 | min(busiest_load_per_task, max_load); |
2421 | pwr_now += this->cpu_power * | ||
2422 | min(this_load_per_task, this_load); | ||
2052 | pwr_now /= SCHED_LOAD_SCALE; | 2423 | pwr_now /= SCHED_LOAD_SCALE; |
2053 | 2424 | ||
2054 | /* Amount of load we'd subtract */ | 2425 | /* Amount of load we'd subtract */ |
2055 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; | 2426 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; |
2056 | if (max_load > tmp) | 2427 | if (max_load > tmp) |
2057 | pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, | 2428 | pwr_move += busiest->cpu_power * |
2058 | max_load - tmp); | 2429 | min(busiest_load_per_task, max_load - tmp); |
2059 | 2430 | ||
2060 | /* Amount of load we'd add */ | 2431 | /* Amount of load we'd add */ |
2061 | if (max_load*busiest->cpu_power < | 2432 | if (max_load*busiest->cpu_power < |
2062 | SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) | 2433 | busiest_load_per_task*SCHED_LOAD_SCALE) |
2063 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2434 | tmp = max_load*busiest->cpu_power/this->cpu_power; |
2064 | else | 2435 | else |
2065 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; | 2436 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; |
2066 | pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); | 2437 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); |
2067 | pwr_move /= SCHED_LOAD_SCALE; | 2438 | pwr_move /= SCHED_LOAD_SCALE; |
2068 | 2439 | ||
2069 | /* Move if we gain throughput */ | 2440 | /* Move if we gain throughput */ |
2070 | if (pwr_move <= pwr_now) | 2441 | if (pwr_move <= pwr_now) |
2071 | goto out_balanced; | 2442 | goto out_balanced; |
2072 | 2443 | ||
2073 | *imbalance = 1; | 2444 | *imbalance = busiest_load_per_task; |
2074 | return busiest; | ||
2075 | } | 2445 | } |
2076 | 2446 | ||
2077 | /* Get rid of the scaling factor, rounding down as we divide */ | ||
2078 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | ||
2079 | return busiest; | 2447 | return busiest; |
2080 | 2448 | ||
2081 | out_balanced: | 2449 | out_balanced: |
2450 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2451 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2452 | goto ret; | ||
2082 | 2453 | ||
2454 | if (this == group_leader && group_leader != group_min) { | ||
2455 | *imbalance = min_load_per_task; | ||
2456 | return group_min; | ||
2457 | } | ||
2458 | ret: | ||
2459 | #endif | ||
2083 | *imbalance = 0; | 2460 | *imbalance = 0; |
2084 | return NULL; | 2461 | return NULL; |
2085 | } | 2462 | } |
@@ -2087,19 +2464,23 @@ out_balanced: | |||
2087 | /* | 2464 | /* |
2088 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2465 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2089 | */ | 2466 | */ |
2090 | static runqueue_t *find_busiest_queue(struct sched_group *group, | 2467 | static struct rq * |
2091 | enum idle_type idle) | 2468 | find_busiest_queue(struct sched_group *group, enum idle_type idle, |
2469 | unsigned long imbalance) | ||
2092 | { | 2470 | { |
2093 | unsigned long load, max_load = 0; | 2471 | struct rq *busiest = NULL, *rq; |
2094 | runqueue_t *busiest = NULL; | 2472 | unsigned long max_load = 0; |
2095 | int i; | 2473 | int i; |
2096 | 2474 | ||
2097 | for_each_cpu_mask(i, group->cpumask) { | 2475 | for_each_cpu_mask(i, group->cpumask) { |
2098 | load = source_load(i, 0); | 2476 | rq = cpu_rq(i); |
2099 | 2477 | ||
2100 | if (load > max_load) { | 2478 | if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) |
2101 | max_load = load; | 2479 | continue; |
2102 | busiest = cpu_rq(i); | 2480 | |
2481 | if (rq->raw_weighted_load > max_load) { | ||
2482 | max_load = rq->raw_weighted_load; | ||
2483 | busiest = rq; | ||
2103 | } | 2484 | } |
2104 | } | 2485 | } |
2105 | 2486 | ||
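
Editor's note: find_busiest_queue() now ranks runqueues by raw_weighted_load rather than source_load(), and skips any queue whose single runnable task is heavier than the imbalance being corrected, since pulling that one task would overshoot. A stand-alone sketch with invented numbers:

#include <stdio.h>

struct rq { unsigned long nr_running, raw_weighted_load; };

static const struct rq *busiest_queue(const struct rq *rqs, int n,
				      unsigned long imbalance)
{
	const struct rq *busiest = NULL;
	unsigned long max_load = 0;

	for (int i = 0; i < n; i++) {
		/* A lone task heavier than the imbalance is left alone. */
		if (rqs[i].nr_running == 1 && rqs[i].raw_weighted_load > imbalance)
			continue;
		if (rqs[i].raw_weighted_load > max_load) {
			max_load = rqs[i].raw_weighted_load;
			busiest = &rqs[i];
		}
	}
	return busiest;
}

int main(void)
{
	struct rq rqs[] = {
		{ 1, 3000 },	/* one big task: skipped for imbalance 1000 */
		{ 3, 2500 },	/* picked: several movable tasks */
		{ 2, 1200 },
	};
	const struct rq *b = busiest_queue(rqs, 3, 1000);

	printf("busiest queue has weighted load %lu\n",
	       b ? b->raw_weighted_load : 0UL);
	return 0;
}
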
@@ -2112,23 +2493,27 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, | |||
2112 | */ | 2493 | */ |
2113 | #define MAX_PINNED_INTERVAL 512 | 2494 | #define MAX_PINNED_INTERVAL 512 |
2114 | 2495 | ||
2496 | static inline unsigned long minus_1_or_zero(unsigned long n) | ||
2497 | { | ||
2498 | return n > 0 ? n - 1 : 0; | ||
2499 | } | ||
2500 | |||
2115 | /* | 2501 | /* |
2116 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2502 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2117 | * tasks if there is an imbalance. | 2503 | * tasks if there is an imbalance. |
2118 | * | 2504 | * |
2119 | * Called with this_rq unlocked. | 2505 | * Called with this_rq unlocked. |
2120 | */ | 2506 | */ |
2121 | static int load_balance(int this_cpu, runqueue_t *this_rq, | 2507 | static int load_balance(int this_cpu, struct rq *this_rq, |
2122 | struct sched_domain *sd, enum idle_type idle) | 2508 | struct sched_domain *sd, enum idle_type idle) |
2123 | { | 2509 | { |
2510 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | ||
2124 | struct sched_group *group; | 2511 | struct sched_group *group; |
2125 | runqueue_t *busiest; | ||
2126 | unsigned long imbalance; | 2512 | unsigned long imbalance; |
2127 | int nr_moved, all_pinned = 0; | 2513 | struct rq *busiest; |
2128 | int active_balance = 0; | ||
2129 | int sd_idle = 0; | ||
2130 | 2514 | ||
2131 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) | 2515 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
2516 | !sched_smt_power_savings) | ||
2132 | sd_idle = 1; | 2517 | sd_idle = 1; |
2133 | 2518 | ||
2134 | schedstat_inc(sd, lb_cnt[idle]); | 2519 | schedstat_inc(sd, lb_cnt[idle]); |
@@ -2139,7 +2524,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2139 | goto out_balanced; | 2524 | goto out_balanced; |
2140 | } | 2525 | } |
2141 | 2526 | ||
2142 | busiest = find_busiest_queue(group, idle); | 2527 | busiest = find_busiest_queue(group, idle, imbalance); |
2143 | if (!busiest) { | 2528 | if (!busiest) { |
2144 | schedstat_inc(sd, lb_nobusyq[idle]); | 2529 | schedstat_inc(sd, lb_nobusyq[idle]); |
2145 | goto out_balanced; | 2530 | goto out_balanced; |
@@ -2159,7 +2544,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2159 | */ | 2544 | */ |
2160 | double_rq_lock(this_rq, busiest); | 2545 | double_rq_lock(this_rq, busiest); |
2161 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2546 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2162 | imbalance, sd, idle, &all_pinned); | 2547 | minus_1_or_zero(busiest->nr_running), |
2548 | imbalance, sd, idle, &all_pinned); | ||
2163 | double_rq_unlock(this_rq, busiest); | 2549 | double_rq_unlock(this_rq, busiest); |
2164 | 2550 | ||
2165 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2551 | /* All tasks on this runqueue were pinned by CPU affinity */ |
@@ -2216,7 +2602,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2216 | sd->balance_interval *= 2; | 2602 | sd->balance_interval *= 2; |
2217 | } | 2603 | } |
2218 | 2604 | ||
2219 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2605 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2606 | !sched_smt_power_savings) | ||
2220 | return -1; | 2607 | return -1; |
2221 | return nr_moved; | 2608 | return nr_moved; |
2222 | 2609 | ||
@@ -2231,7 +2618,8 @@ out_one_pinned: | |||
2231 | (sd->balance_interval < sd->max_interval)) | 2618 | (sd->balance_interval < sd->max_interval)) |
2232 | sd->balance_interval *= 2; | 2619 | sd->balance_interval *= 2; |
2233 | 2620 | ||
2234 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2621 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2622 | !sched_smt_power_savings) | ||
2235 | return -1; | 2623 | return -1; |
2236 | return 0; | 2624 | return 0; |
2237 | } | 2625 | } |
@@ -2243,16 +2631,16 @@ out_one_pinned: | |||
2243 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). | 2631 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). |
2244 | * this_rq is locked. | 2632 | * this_rq is locked. |
2245 | */ | 2633 | */ |
2246 | static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | 2634 | static int |
2247 | struct sched_domain *sd) | 2635 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) |
2248 | { | 2636 | { |
2249 | struct sched_group *group; | 2637 | struct sched_group *group; |
2250 | runqueue_t *busiest = NULL; | 2638 | struct rq *busiest = NULL; |
2251 | unsigned long imbalance; | 2639 | unsigned long imbalance; |
2252 | int nr_moved = 0; | 2640 | int nr_moved = 0; |
2253 | int sd_idle = 0; | 2641 | int sd_idle = 0; |
2254 | 2642 | ||
2255 | if (sd->flags & SD_SHARE_CPUPOWER) | 2643 | if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2256 | sd_idle = 1; | 2644 | sd_idle = 1; |
2257 | 2645 | ||
2258 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2646 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
@@ -2262,7 +2650,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2262 | goto out_balanced; | 2650 | goto out_balanced; |
2263 | } | 2651 | } |
2264 | 2652 | ||
2265 | busiest = find_busiest_queue(group, NEWLY_IDLE); | 2653 | busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); |
2266 | if (!busiest) { | 2654 | if (!busiest) { |
2267 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2655 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2268 | goto out_balanced; | 2656 | goto out_balanced; |
@@ -2277,6 +2665,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2277 | /* Attempt to move tasks */ | 2665 | /* Attempt to move tasks */ |
2278 | double_lock_balance(this_rq, busiest); | 2666 | double_lock_balance(this_rq, busiest); |
2279 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2667 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2668 | minus_1_or_zero(busiest->nr_running), | ||
2280 | imbalance, sd, NEWLY_IDLE, NULL); | 2669 | imbalance, sd, NEWLY_IDLE, NULL); |
2281 | spin_unlock(&busiest->lock); | 2670 | spin_unlock(&busiest->lock); |
2282 | } | 2671 | } |
@@ -2292,9 +2681,11 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2292 | 2681 | ||
2293 | out_balanced: | 2682 | out_balanced: |
2294 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2683 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
2295 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2684 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2685 | !sched_smt_power_savings) | ||
2296 | return -1; | 2686 | return -1; |
2297 | sd->nr_balance_failed = 0; | 2687 | sd->nr_balance_failed = 0; |
2688 | |||
2298 | return 0; | 2689 | return 0; |
2299 | } | 2690 | } |
2300 | 2691 | ||
@@ -2302,16 +2693,15 @@ out_balanced: | |||
2302 | * idle_balance is called by schedule() if this_cpu is about to become | 2693 | * idle_balance is called by schedule() if this_cpu is about to become |
2303 | * idle. Attempts to pull tasks from other CPUs. | 2694 | * idle. Attempts to pull tasks from other CPUs. |
2304 | */ | 2695 | */ |
2305 | static void idle_balance(int this_cpu, runqueue_t *this_rq) | 2696 | static void idle_balance(int this_cpu, struct rq *this_rq) |
2306 | { | 2697 | { |
2307 | struct sched_domain *sd; | 2698 | struct sched_domain *sd; |
2308 | 2699 | ||
2309 | for_each_domain(this_cpu, sd) { | 2700 | for_each_domain(this_cpu, sd) { |
2310 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 2701 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
2311 | if (load_balance_newidle(this_cpu, this_rq, sd)) { | 2702 | /* If we've pulled tasks over stop searching: */ |
2312 | /* We've pulled tasks over so stop searching */ | 2703 | if (load_balance_newidle(this_cpu, this_rq, sd)) |
2313 | break; | 2704 | break; |
2314 | } | ||
2315 | } | 2705 | } |
2316 | } | 2706 | } |
2317 | } | 2707 | } |
@@ -2324,14 +2714,14 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq) | |||
2324 | * | 2714 | * |
2325 | * Called with busiest_rq locked. | 2715 | * Called with busiest_rq locked. |
2326 | */ | 2716 | */ |
2327 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | 2717 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) |
2328 | { | 2718 | { |
2329 | struct sched_domain *sd; | ||
2330 | runqueue_t *target_rq; | ||
2331 | int target_cpu = busiest_rq->push_cpu; | 2719 | int target_cpu = busiest_rq->push_cpu; |
2720 | struct sched_domain *sd; | ||
2721 | struct rq *target_rq; | ||
2332 | 2722 | ||
2723 | /* Is there any task to move? */ | ||
2333 | if (busiest_rq->nr_running <= 1) | 2724 | if (busiest_rq->nr_running <= 1) |
2334 | /* no task to move */ | ||
2335 | return; | 2725 | return; |
2336 | 2726 | ||
2337 | target_rq = cpu_rq(target_cpu); | 2727 | target_rq = cpu_rq(target_cpu); |
@@ -2347,21 +2737,22 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | |||
2347 | double_lock_balance(busiest_rq, target_rq); | 2737 | double_lock_balance(busiest_rq, target_rq); |
2348 | 2738 | ||
2349 | /* Search for an sd spanning us and the target CPU. */ | 2739 | /* Search for an sd spanning us and the target CPU. */ |
2350 | for_each_domain(target_cpu, sd) | 2740 | for_each_domain(target_cpu, sd) { |
2351 | if ((sd->flags & SD_LOAD_BALANCE) && | 2741 | if ((sd->flags & SD_LOAD_BALANCE) && |
2352 | cpu_isset(busiest_cpu, sd->span)) | 2742 | cpu_isset(busiest_cpu, sd->span)) |
2353 | break; | 2743 | break; |
2744 | } | ||
2354 | 2745 | ||
2355 | if (unlikely(sd == NULL)) | 2746 | if (likely(sd)) { |
2356 | goto out; | 2747 | schedstat_inc(sd, alb_cnt); |
2357 | |||
2358 | schedstat_inc(sd, alb_cnt); | ||
2359 | 2748 | ||
2360 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | 2749 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, |
2361 | schedstat_inc(sd, alb_pushed); | 2750 | RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, |
2362 | else | 2751 | NULL)) |
2363 | schedstat_inc(sd, alb_failed); | 2752 | schedstat_inc(sd, alb_pushed); |
2364 | out: | 2753 | else |
2754 | schedstat_inc(sd, alb_failed); | ||
2755 | } | ||
2365 | spin_unlock(&target_rq->lock); | 2756 | spin_unlock(&target_rq->lock); |
2366 | } | 2757 | } |
2367 | 2758 | ||
@@ -2374,23 +2765,27 @@ out: | |||
2374 | * Balancing parameters are set up in arch_init_sched_domains. | 2765 | * Balancing parameters are set up in arch_init_sched_domains. |
2375 | */ | 2766 | */ |
2376 | 2767 | ||
2377 | /* Don't have all balancing operations going off at once */ | 2768 | /* Don't have all balancing operations going off at once: */ |
2378 | #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) | 2769 | static inline unsigned long cpu_offset(int cpu) |
2770 | { | ||
2771 | return jiffies + cpu * HZ / NR_CPUS; | ||
2772 | } | ||
2379 | 2773 | ||
2380 | static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | 2774 | static void |
2381 | enum idle_type idle) | 2775 | rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) |
2382 | { | 2776 | { |
2383 | unsigned long old_load, this_load; | 2777 | unsigned long this_load, interval, j = cpu_offset(this_cpu); |
2384 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); | ||
2385 | struct sched_domain *sd; | 2778 | struct sched_domain *sd; |
2386 | int i; | 2779 | int i, scale; |
2780 | |||
2781 | this_load = this_rq->raw_weighted_load; | ||
2782 | |||
2783 | /* Update our load: */ | ||
2784 | for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { | ||
2785 | unsigned long old_load, new_load; | ||
2387 | 2786 | ||
2388 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | ||
2389 | /* Update our load */ | ||
2390 | for (i = 0; i < 3; i++) { | ||
2391 | unsigned long new_load = this_load; | ||
2392 | int scale = 1 << i; | ||
2393 | old_load = this_rq->cpu_load[i]; | 2787 | old_load = this_rq->cpu_load[i]; |
2788 | new_load = this_load; | ||
2394 | /* | 2789 | /* |
2395 | * Round up the averaging division if load is increasing. This | 2790 | * Round up the averaging division if load is increasing. This |
2396 | * prevents us from getting stuck on 9 if the load is 10, for | 2791 | * prevents us from getting stuck on 9 if the load is 10, for |
@@ -2402,8 +2797,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
2402 | } | 2797 | } |
2403 | 2798 | ||
2404 | for_each_domain(this_cpu, sd) { | 2799 | for_each_domain(this_cpu, sd) { |
2405 | unsigned long interval; | ||
2406 | |||
2407 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2800 | if (!(sd->flags & SD_LOAD_BALANCE)) |
2408 | continue; | 2801 | continue; |
2409 | 2802 | ||
@@ -2433,17 +2826,18 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
2433 | /* | 2826 | /* |
2434 | * on UP we do not need to balance between CPUs: | 2827 | * on UP we do not need to balance between CPUs: |
2435 | */ | 2828 | */ |
2436 | static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) | 2829 | static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) |
2437 | { | 2830 | { |
2438 | } | 2831 | } |
2439 | static inline void idle_balance(int cpu, runqueue_t *rq) | 2832 | static inline void idle_balance(int cpu, struct rq *rq) |
2440 | { | 2833 | { |
2441 | } | 2834 | } |
2442 | #endif | 2835 | #endif |
2443 | 2836 | ||
2444 | static inline int wake_priority_sleeper(runqueue_t *rq) | 2837 | static inline int wake_priority_sleeper(struct rq *rq) |
2445 | { | 2838 | { |
2446 | int ret = 0; | 2839 | int ret = 0; |
2840 | |||
2447 | #ifdef CONFIG_SCHED_SMT | 2841 | #ifdef CONFIG_SCHED_SMT |
2448 | spin_lock(&rq->lock); | 2842 | spin_lock(&rq->lock); |
2449 | /* | 2843 | /* |
@@ -2467,25 +2861,26 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
2467 | * This is called on clock ticks and on context switches. | 2861 | * This is called on clock ticks and on context switches. |
2468 | * Bank in p->sched_time the ns elapsed since the last tick or switch. | 2862 | * Bank in p->sched_time the ns elapsed since the last tick or switch. |
2469 | */ | 2863 | */ |
2470 | static inline void update_cpu_clock(task_t *p, runqueue_t *rq, | 2864 | static inline void |
2471 | unsigned long long now) | 2865 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) |
2472 | { | 2866 | { |
2473 | unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); | 2867 | p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); |
2474 | p->sched_time += now - last; | ||
2475 | } | 2868 | } |
2476 | 2869 | ||
2477 | /* | 2870 | /* |
2478 | * Return current->sched_time plus any more ns on the sched_clock | 2871 | * Return current->sched_time plus any more ns on the sched_clock |
2479 | * that have not yet been banked. | 2872 | * that have not yet been banked. |
2480 | */ | 2873 | */ |
2481 | unsigned long long current_sched_time(const task_t *tsk) | 2874 | unsigned long long current_sched_time(const struct task_struct *p) |
2482 | { | 2875 | { |
2483 | unsigned long long ns; | 2876 | unsigned long long ns; |
2484 | unsigned long flags; | 2877 | unsigned long flags; |
2878 | |||
2485 | local_irq_save(flags); | 2879 | local_irq_save(flags); |
2486 | ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); | 2880 | ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); |
2487 | ns = tsk->sched_time + (sched_clock() - ns); | 2881 | ns = p->sched_time + sched_clock() - ns; |
2488 | local_irq_restore(flags); | 2882 | local_irq_restore(flags); |
2883 | |||
2489 | return ns; | 2884 | return ns; |
2490 | } | 2885 | } |
2491 | 2886 | ||
@@ -2499,11 +2894,16 @@ unsigned long long current_sched_time(const task_t *tsk) | |||
2499 | * increasing number of running tasks. We also ignore the interactivity | 2894 | * increasing number of running tasks. We also ignore the interactivity |
2500 | * if a better static_prio task has expired: | 2895 | * if a better static_prio task has expired: |
2501 | */ | 2896 | */ |
2502 | #define EXPIRED_STARVING(rq) \ | 2897 | static inline int expired_starving(struct rq *rq) |
2503 | ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ | 2898 | { |
2504 | (jiffies - (rq)->expired_timestamp >= \ | 2899 | if (rq->curr->static_prio > rq->best_expired_prio) |
2505 | STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ | 2900 | return 1; |
2506 | ((rq)->curr->static_prio > (rq)->best_expired_prio)) | 2901 | if (!STARVATION_LIMIT || !rq->expired_timestamp) |
2902 | return 0; | ||
2903 | if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) | ||
2904 | return 1; | ||
2905 | return 0; | ||
2906 | } | ||
2507 | 2907 | ||
2508 | /* | 2908 | /* |
2509 | * Account user cpu time to a process. | 2909 | * Account user cpu time to a process. |
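
Editor's note: the EXPIRED_STARVING() macro is rewritten above as the inline function expired_starving(), which is easier to read and type-check but keeps the same two triggers: a better static priority already waiting in the expired array, or expired tasks that have waited longer than STARVATION_LIMIT jiffies per runnable task. A stand-alone version with the runqueue reduced to the fields the check reads and an illustrative STARVATION_LIMIT:

#include <stdio.h>

#define STARVATION_LIMIT 100	/* jiffies per runnable task (illustrative) */

struct rq {
	int curr_static_prio;
	int best_expired_prio;
	unsigned long expired_timestamp;
	unsigned long nr_running;
};

static int expired_starving(const struct rq *rq, unsigned long jiffies)
{
	/* A better (lower) static priority already sits in the expired
	 * array: stop favouring the interactive current task right away. */
	if (rq->curr_static_prio > rq->best_expired_prio)
		return 1;
	if (!STARVATION_LIMIT || !rq->expired_timestamp)
		return 0;
	/* Expired tasks have waited too long relative to queue length. */
	if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
		return 1;
	return 0;
}

int main(void)
{
	struct rq rq = {
		.curr_static_prio = 120, .best_expired_prio = 125,
		.expired_timestamp = 1000, .nr_running = 2,
	};

	printf("%d\n", expired_starving(&rq, 1100));	/* 0: within limit */
	printf("%d\n", expired_starving(&rq, 1300));	/* 1: starving */
	return 0;
}
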
@@ -2536,7 +2936,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
2536 | cputime_t cputime) | 2936 | cputime_t cputime) |
2537 | { | 2937 | { |
2538 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2938 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
2539 | runqueue_t *rq = this_rq(); | 2939 | struct rq *rq = this_rq(); |
2540 | cputime64_t tmp; | 2940 | cputime64_t tmp; |
2541 | 2941 | ||
2542 | p->stime = cputime_add(p->stime, cputime); | 2942 | p->stime = cputime_add(p->stime, cputime); |
@@ -2566,7 +2966,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
2566 | { | 2966 | { |
2567 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2967 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
2568 | cputime64_t tmp = cputime_to_cputime64(steal); | 2968 | cputime64_t tmp = cputime_to_cputime64(steal); |
2569 | runqueue_t *rq = this_rq(); | 2969 | struct rq *rq = this_rq(); |
2570 | 2970 | ||
2571 | if (p == rq->idle) { | 2971 | if (p == rq->idle) { |
2572 | p->stime = cputime_add(p->stime, steal); | 2972 | p->stime = cputime_add(p->stime, steal); |
@@ -2587,10 +2987,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
2587 | */ | 2987 | */ |
2588 | void scheduler_tick(void) | 2988 | void scheduler_tick(void) |
2589 | { | 2989 | { |
2590 | int cpu = smp_processor_id(); | ||
2591 | runqueue_t *rq = this_rq(); | ||
2592 | task_t *p = current; | ||
2593 | unsigned long long now = sched_clock(); | 2990 | unsigned long long now = sched_clock(); |
2991 | struct task_struct *p = current; | ||
2992 | int cpu = smp_processor_id(); | ||
2993 | struct rq *rq = cpu_rq(cpu); | ||
2594 | 2994 | ||
2595 | update_cpu_clock(p, rq, now); | 2995 | update_cpu_clock(p, rq, now); |
2596 | 2996 | ||
@@ -2640,7 +3040,7 @@ void scheduler_tick(void) | |||
2640 | 3040 | ||
2641 | if (!rq->expired_timestamp) | 3041 | if (!rq->expired_timestamp) |
2642 | rq->expired_timestamp = jiffies; | 3042 | rq->expired_timestamp = jiffies; |
2643 | if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { | 3043 | if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { |
2644 | enqueue_task(p, rq->expired); | 3044 | enqueue_task(p, rq->expired); |
2645 | if (p->static_prio < rq->best_expired_prio) | 3045 | if (p->static_prio < rq->best_expired_prio) |
2646 | rq->best_expired_prio = p->static_prio; | 3046 | rq->best_expired_prio = p->static_prio; |
@@ -2679,55 +3079,42 @@ out: | |||
2679 | } | 3079 | } |
2680 | 3080 | ||
2681 | #ifdef CONFIG_SCHED_SMT | 3081 | #ifdef CONFIG_SCHED_SMT |
2682 | static inline void wakeup_busy_runqueue(runqueue_t *rq) | 3082 | static inline void wakeup_busy_runqueue(struct rq *rq) |
2683 | { | 3083 | { |
2684 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ | 3084 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ |
2685 | if (rq->curr == rq->idle && rq->nr_running) | 3085 | if (rq->curr == rq->idle && rq->nr_running) |
2686 | resched_task(rq->idle); | 3086 | resched_task(rq->idle); |
2687 | } | 3087 | } |
2688 | 3088 | ||
2689 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3089 | /* |
3090 | * Called with interrupt disabled and this_rq's runqueue locked. | ||
3091 | */ | ||
3092 | static void wake_sleeping_dependent(int this_cpu) | ||
2690 | { | 3093 | { |
2691 | struct sched_domain *tmp, *sd = NULL; | 3094 | struct sched_domain *tmp, *sd = NULL; |
2692 | cpumask_t sibling_map; | ||
2693 | int i; | 3095 | int i; |
2694 | 3096 | ||
2695 | for_each_domain(this_cpu, tmp) | 3097 | for_each_domain(this_cpu, tmp) { |
2696 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3098 | if (tmp->flags & SD_SHARE_CPUPOWER) { |
2697 | sd = tmp; | 3099 | sd = tmp; |
3100 | break; | ||
3101 | } | ||
3102 | } | ||
2698 | 3103 | ||
2699 | if (!sd) | 3104 | if (!sd) |
2700 | return; | 3105 | return; |
2701 | 3106 | ||
2702 | /* | 3107 | for_each_cpu_mask(i, sd->span) { |
2703 | * Unlock the current runqueue because we have to lock in | 3108 | struct rq *smt_rq = cpu_rq(i); |
2704 | * CPU order to avoid deadlocks. Caller knows that we might | ||
2705 | * unlock. We keep IRQs disabled. | ||
2706 | */ | ||
2707 | spin_unlock(&this_rq->lock); | ||
2708 | |||
2709 | sibling_map = sd->span; | ||
2710 | 3109 | ||
2711 | for_each_cpu_mask(i, sibling_map) | 3110 | if (i == this_cpu) |
2712 | spin_lock(&cpu_rq(i)->lock); | 3111 | continue; |
2713 | /* | 3112 | if (unlikely(!spin_trylock(&smt_rq->lock))) |
2714 | * We clear this CPU from the mask. This both simplifies the | 3113 | continue; |
2715 | * inner loop and keps this_rq locked when we exit: | ||
2716 | */ | ||
2717 | cpu_clear(this_cpu, sibling_map); | ||
2718 | |||
2719 | for_each_cpu_mask(i, sibling_map) { | ||
2720 | runqueue_t *smt_rq = cpu_rq(i); | ||
2721 | 3114 | ||
2722 | wakeup_busy_runqueue(smt_rq); | 3115 | wakeup_busy_runqueue(smt_rq); |
3116 | spin_unlock(&smt_rq->lock); | ||
2723 | } | 3117 | } |
2724 | |||
2725 | for_each_cpu_mask(i, sibling_map) | ||
2726 | spin_unlock(&cpu_rq(i)->lock); | ||
2727 | /* | ||
2728 | * We exit with this_cpu's rq still held and IRQs | ||
2729 | * still disabled: | ||
2730 | */ | ||
2731 | } | 3118 | } |
2732 | 3119 | ||
2733 | /* | 3120 | /* |
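
Editor's note: wake_sleeping_dependent() no longer drops this_rq->lock and takes every sibling lock in CPU order; it simply trylocks each sibling runqueue and bypasses the ones it cannot get, and the dependent_sleeper() hunk below adopts the same pattern. A user-space sketch of that trylock-and-bypass loop, with pthread mutexes standing in for runqueue locks and a plain flag standing in for "currently running the idle task":

#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 4

struct rq { pthread_mutex_t lock; int nr_running; int idle; };

static struct rq sibling[NR_SIBLINGS];

static void wake_sleeping_dependent(int this_cpu)
{
	for (int i = 0; i < NR_SIBLINGS; i++) {
		struct rq *smt_rq = &sibling[i];

		if (i == this_cpu)
			continue;
		if (pthread_mutex_trylock(&smt_rq->lock))
			continue;	/* contended: bypass this sibling */

		/* A sibling idling despite having queued work would be
		 * rescheduled here. */
		if (smt_rq->idle && smt_rq->nr_running)
			printf("cpu %d: would resched idle sibling %d\n",
			       this_cpu, i);
		pthread_mutex_unlock(&smt_rq->lock);
	}
}

int main(void)
{
	for (int i = 0; i < NR_SIBLINGS; i++) {
		pthread_mutex_init(&sibling[i].lock, NULL);
		sibling[i].idle = 1;
	}
	sibling[2].nr_running = 1;	/* a sibling sleeping on work */

	wake_sleeping_dependent(0);
	return 0;
}
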
@@ -2735,57 +3122,53 @@ static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
2735 | * utilize, if another task runs on a sibling. This models the | 3122 | * utilize, if another task runs on a sibling. This models the |
2736 | * slowdown effect of other tasks running on siblings: | 3123 | * slowdown effect of other tasks running on siblings: |
2737 | */ | 3124 | */ |
2738 | static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | 3125 | static inline unsigned long |
3126 | smt_slice(struct task_struct *p, struct sched_domain *sd) | ||
2739 | { | 3127 | { |
2740 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 3128 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
2741 | } | 3129 | } |
2742 | 3130 | ||
2743 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3131 | /* |
3132 | * To minimise lock contention and not have to drop this_rq's runlock we only | ||
3133 | * trylock the sibling runqueues and bypass those runqueues if we fail to | ||
3134 | * acquire their lock. As we only trylock the normal locking order does not | ||
3135 | * need to be obeyed. | ||
3136 | */ | ||
3137 | static int | ||
3138 | dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) | ||
2744 | { | 3139 | { |
2745 | struct sched_domain *tmp, *sd = NULL; | 3140 | struct sched_domain *tmp, *sd = NULL; |
2746 | cpumask_t sibling_map; | ||
2747 | prio_array_t *array; | ||
2748 | int ret = 0, i; | 3141 | int ret = 0, i; |
2749 | task_t *p; | ||
2750 | 3142 | ||
2751 | for_each_domain(this_cpu, tmp) | 3143 | /* kernel/rt threads do not participate in dependent sleeping */ |
2752 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3144 | if (!p->mm || rt_task(p)) |
3145 | return 0; | ||
3146 | |||
3147 | for_each_domain(this_cpu, tmp) { | ||
3148 | if (tmp->flags & SD_SHARE_CPUPOWER) { | ||
2753 | sd = tmp; | 3149 | sd = tmp; |
3150 | break; | ||
3151 | } | ||
3152 | } | ||
2754 | 3153 | ||
2755 | if (!sd) | 3154 | if (!sd) |
2756 | return 0; | 3155 | return 0; |
2757 | 3156 | ||
2758 | /* | 3157 | for_each_cpu_mask(i, sd->span) { |
2759 | * The same locking rules and details apply as for | 3158 | struct task_struct *smt_curr; |
2760 | * wake_sleeping_dependent(): | 3159 | struct rq *smt_rq; |
2761 | */ | ||
2762 | spin_unlock(&this_rq->lock); | ||
2763 | sibling_map = sd->span; | ||
2764 | for_each_cpu_mask(i, sibling_map) | ||
2765 | spin_lock(&cpu_rq(i)->lock); | ||
2766 | cpu_clear(this_cpu, sibling_map); | ||
2767 | 3160 | ||
2768 | /* | 3161 | if (i == this_cpu) |
2769 | * Establish next task to be run - it might have gone away because | 3162 | continue; |
2770 | * we released the runqueue lock above: | ||
2771 | */ | ||
2772 | if (!this_rq->nr_running) | ||
2773 | goto out_unlock; | ||
2774 | array = this_rq->active; | ||
2775 | if (!array->nr_active) | ||
2776 | array = this_rq->expired; | ||
2777 | BUG_ON(!array->nr_active); | ||
2778 | 3163 | ||
2779 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | 3164 | smt_rq = cpu_rq(i); |
2780 | task_t, run_list); | 3165 | if (unlikely(!spin_trylock(&smt_rq->lock))) |
3166 | continue; | ||
2781 | 3167 | ||
2782 | for_each_cpu_mask(i, sibling_map) { | 3168 | smt_curr = smt_rq->curr; |
2783 | runqueue_t *smt_rq = cpu_rq(i); | ||
2784 | task_t *smt_curr = smt_rq->curr; | ||
2785 | 3169 | ||
2786 | /* Kernel threads do not participate in dependent sleeping */ | 3170 | if (!smt_curr->mm) |
2787 | if (!p->mm || !smt_curr->mm || rt_task(p)) | 3171 | goto unlock; |
2788 | goto check_smt_task; | ||
2789 | 3172 | ||
2790 | /* | 3173 | /* |
2791 | * If a user task with lower static priority than the | 3174 | * If a user task with lower static priority than the |
@@ -2803,49 +3186,23 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
2803 | if ((jiffies % DEF_TIMESLICE) > | 3186 | if ((jiffies % DEF_TIMESLICE) > |
2804 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 3187 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
2805 | ret = 1; | 3188 | ret = 1; |
2806 | } else | 3189 | } else { |
2807 | if (smt_curr->static_prio < p->static_prio && | 3190 | if (smt_curr->static_prio < p->static_prio && |
2808 | !TASK_PREEMPTS_CURR(p, smt_rq) && | 3191 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
2809 | smt_slice(smt_curr, sd) > task_timeslice(p)) | 3192 | smt_slice(smt_curr, sd) > task_timeslice(p)) |
2810 | ret = 1; | 3193 | ret = 1; |
2811 | |||
2812 | check_smt_task: | ||
2813 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
2814 | rt_task(smt_curr)) | ||
2815 | continue; | ||
2816 | if (!p->mm) { | ||
2817 | wakeup_busy_runqueue(smt_rq); | ||
2818 | continue; | ||
2819 | } | ||
2820 | |||
2821 | /* | ||
2822 | * Reschedule a lower priority task on the SMT sibling for | ||
2823 | * it to be put to sleep, or wake it up if it has been put to | ||
2824 | * sleep for priority reasons to see if it should run now. | ||
2825 | */ | ||
2826 | if (rt_task(p)) { | ||
2827 | if ((jiffies % DEF_TIMESLICE) > | ||
2828 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
2829 | resched_task(smt_curr); | ||
2830 | } else { | ||
2831 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
2832 | smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
2833 | resched_task(smt_curr); | ||
2834 | else | ||
2835 | wakeup_busy_runqueue(smt_rq); | ||
2836 | } | 3194 | } |
3195 | unlock: | ||
3196 | spin_unlock(&smt_rq->lock); | ||
2837 | } | 3197 | } |
2838 | out_unlock: | ||
2839 | for_each_cpu_mask(i, sibling_map) | ||
2840 | spin_unlock(&cpu_rq(i)->lock); | ||
2841 | return ret; | 3198 | return ret; |
2842 | } | 3199 | } |
2843 | #else | 3200 | #else |
2844 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3201 | static inline void wake_sleeping_dependent(int this_cpu) |
2845 | { | 3202 | { |
2846 | } | 3203 | } |
2847 | 3204 | static inline int | |
2848 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3205 | dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p) |
2849 | { | 3206 | { |
2850 | return 0; | 3207 | return 0; |
2851 | } | 3208 | } |
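When the sibling runs a real-time task, dependent_sleeper() above only reports 1 for the tail of each DEF_TIMESLICE window, i.e. the fraction beyond per_cpu_gain percent. A small sketch of that window test, with made-up constants standing in for the scheduler's:

    #include <stdio.h>

    #define DEF_TIMESLICE 100       /* assumed: ticks per window */
    #define PER_CPU_GAIN   25       /* assumed: percent kept by this CPU */

    /* 1 when this CPU should idle in favour of the RT task on its sibling */
    static int rt_sibling_forces_idle(unsigned long jiffies)
    {
            return (jiffies % DEF_TIMESLICE) >
                   (PER_CPU_GAIN * DEF_TIMESLICE / 100);
    }

    int main(void)
    {
            unsigned long j;
            int idle_ticks = 0;

            for (j = 0; j < DEF_TIMESLICE; j++)
                    idle_ticks += rt_sibling_forces_idle(j);
            /* with these values, 74 of every 100 ticks are yielded */
            printf("%d/%d ticks yielded\n", idle_ticks, DEF_TIMESLICE);
            return 0;
    }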
@@ -2858,12 +3215,13 @@ void fastcall add_preempt_count(int val) | |||
2858 | /* | 3215 | /* |
2859 | * Underflow? | 3216 | * Underflow? |
2860 | */ | 3217 | */ |
2861 | BUG_ON((preempt_count() < 0)); | 3218 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
3219 | return; | ||
2862 | preempt_count() += val; | 3220 | preempt_count() += val; |
2863 | /* | 3221 | /* |
2864 | * Spinlock count overflowing soon? | 3222 | * Spinlock count overflowing soon? |
2865 | */ | 3223 | */ |
2866 | BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); | 3224 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); |
2867 | } | 3225 | } |
2868 | EXPORT_SYMBOL(add_preempt_count); | 3226 | EXPORT_SYMBOL(add_preempt_count); |
2869 | 3227 | ||
@@ -2872,11 +3230,15 @@ void fastcall sub_preempt_count(int val) | |||
2872 | /* | 3230 | /* |
2873 | * Underflow? | 3231 | * Underflow? |
2874 | */ | 3232 | */ |
2875 | BUG_ON(val > preempt_count()); | 3233 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) |
3234 | return; | ||
2876 | /* | 3235 | /* |
2877 | * Is the spinlock portion underflowing? | 3236 | * Is the spinlock portion underflowing? |
2878 | */ | 3237 | */ |
2879 | BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); | 3238 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
3239 | !(preempt_count() & PREEMPT_MASK))) | ||
3240 | return; | ||
3241 | |||
2880 | preempt_count() -= val; | 3242 | preempt_count() -= val; |
2881 | } | 3243 | } |
2882 | EXPORT_SYMBOL(sub_preempt_count); | 3244 | EXPORT_SYMBOL(sub_preempt_count); |
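The hunks above turn the preempt-count sanity checks from fatal BUG_ON()s into DEBUG_LOCKS_WARN_ON()s that warn and let the function bail out early. A userspace sketch of that warn-and-return shape, with a hypothetical debug_warn_on() standing in for the lockdep helper:

    #include <stdio.h>

    /* hypothetical stand-in: report the violation, then keep going */
    static int debug_warn_on(int cond, const char *what)
    {
            static int warned;

            if (cond && !warned) {
                    warned = 1;
                    fprintf(stderr, "WARNING: %s\n", what);
            }
            return cond;
    }

    static int preempt_count;

    static void add_count(int val)
    {
            if (debug_warn_on(preempt_count < 0, "preempt count underflow"))
                    return;                 /* bail out instead of crashing */
            preempt_count += val;
    }

    int main(void)
    {
            add_count(1);
            printf("preempt_count=%d\n", preempt_count);
            return 0;
    }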
@@ -2894,14 +3256,14 @@ static inline int interactive_sleep(enum sleep_type sleep_type) | |||
2894 | */ | 3256 | */ |
2895 | asmlinkage void __sched schedule(void) | 3257 | asmlinkage void __sched schedule(void) |
2896 | { | 3258 | { |
2897 | long *switch_count; | 3259 | struct task_struct *prev, *next; |
2898 | task_t *prev, *next; | 3260 | struct prio_array *array; |
2899 | runqueue_t *rq; | ||
2900 | prio_array_t *array; | ||
2901 | struct list_head *queue; | 3261 | struct list_head *queue; |
2902 | unsigned long long now; | 3262 | unsigned long long now; |
2903 | unsigned long run_time; | 3263 | unsigned long run_time; |
2904 | int cpu, idx, new_prio; | 3264 | int cpu, idx, new_prio; |
3265 | long *switch_count; | ||
3266 | struct rq *rq; | ||
2905 | 3267 | ||
2906 | /* | 3268 | /* |
2907 | * Test if we are atomic. Since do_exit() needs to call into | 3269 | * Test if we are atomic. Since do_exit() needs to call into |
@@ -2967,32 +3329,13 @@ need_resched_nonpreemptible: | |||
2967 | 3329 | ||
2968 | cpu = smp_processor_id(); | 3330 | cpu = smp_processor_id(); |
2969 | if (unlikely(!rq->nr_running)) { | 3331 | if (unlikely(!rq->nr_running)) { |
2970 | go_idle: | ||
2971 | idle_balance(cpu, rq); | 3332 | idle_balance(cpu, rq); |
2972 | if (!rq->nr_running) { | 3333 | if (!rq->nr_running) { |
2973 | next = rq->idle; | 3334 | next = rq->idle; |
2974 | rq->expired_timestamp = 0; | 3335 | rq->expired_timestamp = 0; |
2975 | wake_sleeping_dependent(cpu, rq); | 3336 | wake_sleeping_dependent(cpu); |
2976 | /* | ||
2977 | * wake_sleeping_dependent() might have released | ||
2978 | * the runqueue, so break out if we got new | ||
2979 | * tasks meanwhile: | ||
2980 | */ | ||
2981 | if (!rq->nr_running) | ||
2982 | goto switch_tasks; | ||
2983 | } | ||
2984 | } else { | ||
2985 | if (dependent_sleeper(cpu, rq)) { | ||
2986 | next = rq->idle; | ||
2987 | goto switch_tasks; | 3337 | goto switch_tasks; |
2988 | } | 3338 | } |
2989 | /* | ||
2990 | * dependent_sleeper() releases and reacquires the runqueue | ||
2991 | * lock, hence go into the idle loop if the rq went | ||
2992 | * empty meanwhile: | ||
2993 | */ | ||
2994 | if (unlikely(!rq->nr_running)) | ||
2995 | goto go_idle; | ||
2996 | } | 3339 | } |
2997 | 3340 | ||
2998 | array = rq->active; | 3341 | array = rq->active; |
@@ -3010,7 +3353,7 @@ go_idle: | |||
3010 | 3353 | ||
3011 | idx = sched_find_first_bit(array->bitmap); | 3354 | idx = sched_find_first_bit(array->bitmap); |
3012 | queue = array->queue + idx; | 3355 | queue = array->queue + idx; |
3013 | next = list_entry(queue->next, task_t, run_list); | 3356 | next = list_entry(queue->next, struct task_struct, run_list); |
3014 | 3357 | ||
3015 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { | 3358 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { |
3016 | unsigned long long delta = now - next->timestamp; | 3359 | unsigned long long delta = now - next->timestamp; |
@@ -3030,6 +3373,8 @@ go_idle: | |||
3030 | } | 3373 | } |
3031 | } | 3374 | } |
3032 | next->sleep_type = SLEEP_NORMAL; | 3375 | next->sleep_type = SLEEP_NORMAL; |
3376 | if (dependent_sleeper(cpu, rq, next)) | ||
3377 | next = rq->idle; | ||
3033 | switch_tasks: | 3378 | switch_tasks: |
3034 | if (next == rq->idle) | 3379 | if (next == rq->idle) |
3035 | schedstat_inc(rq, sched_goidle); | 3380 | schedstat_inc(rq, sched_goidle); |
@@ -3071,12 +3416,11 @@ switch_tasks: | |||
3071 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3416 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3072 | goto need_resched; | 3417 | goto need_resched; |
3073 | } | 3418 | } |
3074 | |||
3075 | EXPORT_SYMBOL(schedule); | 3419 | EXPORT_SYMBOL(schedule); |
3076 | 3420 | ||
3077 | #ifdef CONFIG_PREEMPT | 3421 | #ifdef CONFIG_PREEMPT |
3078 | /* | 3422 | /* |
3079 | * this is is the entry point to schedule() from in-kernel preemption | 3423 | * this is the entry point to schedule() from in-kernel preemption |
3080 | * off of preempt_enable. Kernel preemptions off return from interrupt | 3424 | * off of preempt_enable. Kernel preemptions off return from interrupt |
3081 | * occur there and call schedule directly. | 3425 | * occur there and call schedule directly. |
3082 | */ | 3426 | */ |
@@ -3116,11 +3460,10 @@ need_resched: | |||
3116 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3460 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3117 | goto need_resched; | 3461 | goto need_resched; |
3118 | } | 3462 | } |
3119 | |||
3120 | EXPORT_SYMBOL(preempt_schedule); | 3463 | EXPORT_SYMBOL(preempt_schedule); |
3121 | 3464 | ||
3122 | /* | 3465 | /* |
3123 | * this is is the entry point to schedule() from kernel preemption | 3466 | * this is the entry point to schedule() from kernel preemption |
3124 | * off of irq context. | 3467 | * off of irq context. |
3125 | * Note, that this is called and return with irqs disabled. This will | 3468 | * Note, that this is called and return with irqs disabled. This will |
3126 | * protect us against recursive calling from irq. | 3469 | * protect us against recursive calling from irq. |
@@ -3132,7 +3475,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3132 | struct task_struct *task = current; | 3475 | struct task_struct *task = current; |
3133 | int saved_lock_depth; | 3476 | int saved_lock_depth; |
3134 | #endif | 3477 | #endif |
3135 | /* Catch callers which need to be fixed*/ | 3478 | /* Catch callers which need to be fixed */ |
3136 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3479 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3137 | 3480 | ||
3138 | need_resched: | 3481 | need_resched: |
@@ -3165,10 +3508,8 @@ need_resched: | |||
3165 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 3508 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, |
3166 | void *key) | 3509 | void *key) |
3167 | { | 3510 | { |
3168 | task_t *p = curr->private; | 3511 | return try_to_wake_up(curr->private, mode, sync); |
3169 | return try_to_wake_up(p, mode, sync); | ||
3170 | } | 3512 | } |
3171 | |||
3172 | EXPORT_SYMBOL(default_wake_function); | 3513 | EXPORT_SYMBOL(default_wake_function); |
3173 | 3514 | ||
3174 | /* | 3515 | /* |
@@ -3186,13 +3527,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | |||
3186 | struct list_head *tmp, *next; | 3527 | struct list_head *tmp, *next; |
3187 | 3528 | ||
3188 | list_for_each_safe(tmp, next, &q->task_list) { | 3529 | list_for_each_safe(tmp, next, &q->task_list) { |
3189 | wait_queue_t *curr; | 3530 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); |
3190 | unsigned flags; | 3531 | unsigned flags = curr->flags; |
3191 | curr = list_entry(tmp, wait_queue_t, task_list); | 3532 | |
3192 | flags = curr->flags; | ||
3193 | if (curr->func(curr, mode, sync, key) && | 3533 | if (curr->func(curr, mode, sync, key) && |
3194 | (flags & WQ_FLAG_EXCLUSIVE) && | 3534 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
3195 | !--nr_exclusive) | ||
3196 | break; | 3535 | break; |
3197 | } | 3536 | } |
3198 | } | 3537 | } |
@@ -3213,7 +3552,6 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | |||
3213 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 3552 | __wake_up_common(q, mode, nr_exclusive, 0, key); |
3214 | spin_unlock_irqrestore(&q->lock, flags); | 3553 | spin_unlock_irqrestore(&q->lock, flags); |
3215 | } | 3554 | } |
3216 | |||
3217 | EXPORT_SYMBOL(__wake_up); | 3555 | EXPORT_SYMBOL(__wake_up); |
3218 | 3556 | ||
3219 | /* | 3557 | /* |
@@ -3282,6 +3620,7 @@ EXPORT_SYMBOL(complete_all); | |||
3282 | void fastcall __sched wait_for_completion(struct completion *x) | 3620 | void fastcall __sched wait_for_completion(struct completion *x) |
3283 | { | 3621 | { |
3284 | might_sleep(); | 3622 | might_sleep(); |
3623 | |||
3285 | spin_lock_irq(&x->wait.lock); | 3624 | spin_lock_irq(&x->wait.lock); |
3286 | if (!x->done) { | 3625 | if (!x->done) { |
3287 | DECLARE_WAITQUEUE(wait, current); | 3626 | DECLARE_WAITQUEUE(wait, current); |
@@ -3426,7 +3765,6 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) | |||
3426 | schedule(); | 3765 | schedule(); |
3427 | SLEEP_ON_TAIL | 3766 | SLEEP_ON_TAIL |
3428 | } | 3767 | } |
3429 | |||
3430 | EXPORT_SYMBOL(interruptible_sleep_on); | 3768 | EXPORT_SYMBOL(interruptible_sleep_on); |
3431 | 3769 | ||
3432 | long fastcall __sched | 3770 | long fastcall __sched |
@@ -3442,7 +3780,6 @@ interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
3442 | 3780 | ||
3443 | return timeout; | 3781 | return timeout; |
3444 | } | 3782 | } |
3445 | |||
3446 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3783 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
3447 | 3784 | ||
3448 | void fastcall __sched sleep_on(wait_queue_head_t *q) | 3785 | void fastcall __sched sleep_on(wait_queue_head_t *q) |
@@ -3455,7 +3792,6 @@ void fastcall __sched sleep_on(wait_queue_head_t *q) | |||
3455 | schedule(); | 3792 | schedule(); |
3456 | SLEEP_ON_TAIL | 3793 | SLEEP_ON_TAIL |
3457 | } | 3794 | } |
3458 | |||
3459 | EXPORT_SYMBOL(sleep_on); | 3795 | EXPORT_SYMBOL(sleep_on); |
3460 | 3796 | ||
3461 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3797 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
@@ -3473,12 +3809,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
3473 | 3809 | ||
3474 | EXPORT_SYMBOL(sleep_on_timeout); | 3810 | EXPORT_SYMBOL(sleep_on_timeout); |
3475 | 3811 | ||
3476 | void set_user_nice(task_t *p, long nice) | 3812 | #ifdef CONFIG_RT_MUTEXES |
3813 | |||
3814 | /* | ||
3815 | * rt_mutex_setprio - set the current priority of a task | ||
3816 | * @p: task | ||
3817 | * @prio: prio value (kernel-internal form) | ||
3818 | * | ||
3819 | * This function changes the 'effective' priority of a task. It does | ||
3820 | * not touch ->normal_prio like __setscheduler(). | ||
3821 | * | ||
3822 | * Used by the rt_mutex code to implement priority inheritance logic. | ||
3823 | */ | ||
3824 | void rt_mutex_setprio(struct task_struct *p, int prio) | ||
3477 | { | 3825 | { |
3826 | struct prio_array *array; | ||
3478 | unsigned long flags; | 3827 | unsigned long flags; |
3479 | prio_array_t *array; | 3828 | struct rq *rq; |
3480 | runqueue_t *rq; | 3829 | int oldprio; |
3481 | int old_prio, new_prio, delta; | 3830 | |
3831 | BUG_ON(prio < 0 || prio > MAX_PRIO); | ||
3832 | |||
3833 | rq = task_rq_lock(p, &flags); | ||
3834 | |||
3835 | oldprio = p->prio; | ||
3836 | array = p->array; | ||
3837 | if (array) | ||
3838 | dequeue_task(p, array); | ||
3839 | p->prio = prio; | ||
3840 | |||
3841 | if (array) { | ||
3842 | /* | ||
3843 | * If changing to an RT priority then queue it | ||
3844 | * in the active array! | ||
3845 | */ | ||
3846 | if (rt_task(p)) | ||
3847 | array = rq->active; | ||
3848 | enqueue_task(p, array); | ||
3849 | /* | ||
3850 | * Reschedule if we are currently running on this runqueue and | ||
3851 | * our priority decreased, or if we are not currently running on | ||
3852 | * this runqueue and our priority is higher than the current's | ||
3853 | */ | ||
3854 | if (task_running(rq, p)) { | ||
3855 | if (p->prio > oldprio) | ||
3856 | resched_task(rq->curr); | ||
3857 | } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
3858 | resched_task(rq->curr); | ||
3859 | } | ||
3860 | task_rq_unlock(rq, &flags); | ||
3861 | } | ||
3862 | |||
3863 | #endif | ||
3864 | |||
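rt_mutex_setprio() above is the hook the new rt_mutex code uses for priority inheritance: boost a lock owner to its top waiter's (numerically lower) priority, then restore normal_prio when the lock is released. A simplified userspace illustration of that boost/restore cycle; the struct and helper names are hypothetical, not the rtmutex.c implementation:

    #include <stdio.h>

    struct task {
            int normal_prio;        /* priority derived from policy/nice */
            int prio;               /* effective priority, may be boosted */
    };

    /* analogue of rt_mutex_setprio(): change only the effective priority */
    static void setprio(struct task *p, int prio)
    {
            p->prio = prio;
    }

    int main(void)
    {
            struct task owner  = { .normal_prio = 120, .prio = 120 };
            struct task waiter = { .normal_prio =  90, .prio =  90 };

            /* waiter blocks on a lock the owner holds: boost the owner */
            if (waiter.prio < owner.prio)
                    setprio(&owner, waiter.prio);
            printf("boosted owner prio: %d\n", owner.prio);     /* 90 */

            /* owner drops the lock: fall back to its normal priority */
            setprio(&owner, owner.normal_prio);
            printf("restored owner prio: %d\n", owner.prio);    /* 120 */
            return 0;
    }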
3865 | void set_user_nice(struct task_struct *p, long nice) | ||
3866 | { | ||
3867 | struct prio_array *array; | ||
3868 | int old_prio, delta; | ||
3869 | unsigned long flags; | ||
3870 | struct rq *rq; | ||
3482 | 3871 | ||
3483 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3872 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
3484 | return; | 3873 | return; |
@@ -3493,22 +3882,25 @@ void set_user_nice(task_t *p, long nice) | |||
3493 | * it won't have any effect on scheduling until the task is | 3882 | * it won't have any effect on scheduling until the task is |
3494 | * not SCHED_NORMAL/SCHED_BATCH: | 3883 | * not SCHED_NORMAL/SCHED_BATCH: |
3495 | */ | 3884 | */ |
3496 | if (rt_task(p)) { | 3885 | if (has_rt_policy(p)) { |
3497 | p->static_prio = NICE_TO_PRIO(nice); | 3886 | p->static_prio = NICE_TO_PRIO(nice); |
3498 | goto out_unlock; | 3887 | goto out_unlock; |
3499 | } | 3888 | } |
3500 | array = p->array; | 3889 | array = p->array; |
3501 | if (array) | 3890 | if (array) { |
3502 | dequeue_task(p, array); | 3891 | dequeue_task(p, array); |
3892 | dec_raw_weighted_load(rq, p); | ||
3893 | } | ||
3503 | 3894 | ||
3504 | old_prio = p->prio; | ||
3505 | new_prio = NICE_TO_PRIO(nice); | ||
3506 | delta = new_prio - old_prio; | ||
3507 | p->static_prio = NICE_TO_PRIO(nice); | 3895 | p->static_prio = NICE_TO_PRIO(nice); |
3508 | p->prio += delta; | 3896 | set_load_weight(p); |
3897 | old_prio = p->prio; | ||
3898 | p->prio = effective_prio(p); | ||
3899 | delta = p->prio - old_prio; | ||
3509 | 3900 | ||
3510 | if (array) { | 3901 | if (array) { |
3511 | enqueue_task(p, array); | 3902 | enqueue_task(p, array); |
3903 | inc_raw_weighted_load(rq, p); | ||
3512 | /* | 3904 | /* |
3513 | * If the task increased its priority or is running and | 3905 | * If the task increased its priority or is running and |
3514 | * lowered its priority, then reschedule its CPU: | 3906 | * lowered its priority, then reschedule its CPU: |
@@ -3519,7 +3911,6 @@ void set_user_nice(task_t *p, long nice) | |||
3519 | out_unlock: | 3911 | out_unlock: |
3520 | task_rq_unlock(rq, &flags); | 3912 | task_rq_unlock(rq, &flags); |
3521 | } | 3913 | } |
3522 | |||
3523 | EXPORT_SYMBOL(set_user_nice); | 3914 | EXPORT_SYMBOL(set_user_nice); |
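set_user_nice() now recomputes the load weight and goes through effective_prio() instead of applying a hand-rolled prio delta, but the static-priority mapping itself is untouched. A standalone check of that mapping, with MAX_RT_PRIO = 100 assumed as in this kernel series:

    #include <assert.h>

    #define MAX_RT_PRIO 100                          /* assumed value */
    #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)

    int main(void)
    {
            assert(NICE_TO_PRIO(-20) == 100);        /* highest nice level */
            assert(NICE_TO_PRIO(0)   == 120);        /* default            */
            assert(NICE_TO_PRIO(19)  == 139);        /* lowest nice level  */
            return 0;
    }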
3524 | 3915 | ||
3525 | /* | 3916 | /* |
@@ -3527,10 +3918,11 @@ EXPORT_SYMBOL(set_user_nice); | |||
3527 | * @p: task | 3918 | * @p: task |
3528 | * @nice: nice value | 3919 | * @nice: nice value |
3529 | */ | 3920 | */ |
3530 | int can_nice(const task_t *p, const int nice) | 3921 | int can_nice(const struct task_struct *p, const int nice) |
3531 | { | 3922 | { |
3532 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3923 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
3533 | int nice_rlim = 20 - nice; | 3924 | int nice_rlim = 20 - nice; |
3925 | |||
3534 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 3926 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || |
3535 | capable(CAP_SYS_NICE)); | 3927 | capable(CAP_SYS_NICE)); |
3536 | } | 3928 | } |
@@ -3546,8 +3938,7 @@ int can_nice(const task_t *p, const int nice) | |||
3546 | */ | 3938 | */ |
3547 | asmlinkage long sys_nice(int increment) | 3939 | asmlinkage long sys_nice(int increment) |
3548 | { | 3940 | { |
3549 | int retval; | 3941 | long nice, retval; |
3550 | long nice; | ||
3551 | 3942 | ||
3552 | /* | 3943 | /* |
3553 | * Setpriority might change our priority at the same moment. | 3944 | * Setpriority might change our priority at the same moment. |
@@ -3586,7 +3977,7 @@ asmlinkage long sys_nice(int increment) | |||
3586 | * RT tasks are offset by -200. Normal tasks are centered | 3977 | * RT tasks are offset by -200. Normal tasks are centered |
3587 | * around 0, value goes from -16 to +15. | 3978 | * around 0, value goes from -16 to +15. |
3588 | */ | 3979 | */ |
3589 | int task_prio(const task_t *p) | 3980 | int task_prio(const struct task_struct *p) |
3590 | { | 3981 | { |
3591 | return p->prio - MAX_RT_PRIO; | 3982 | return p->prio - MAX_RT_PRIO; |
3592 | } | 3983 | } |
@@ -3595,7 +3986,7 @@ int task_prio(const task_t *p) | |||
3595 | * task_nice - return the nice value of a given task. | 3986 | * task_nice - return the nice value of a given task. |
3596 | * @p: the task in question. | 3987 | * @p: the task in question. |
3597 | */ | 3988 | */ |
3598 | int task_nice(const task_t *p) | 3989 | int task_nice(const struct task_struct *p) |
3599 | { | 3990 | { |
3600 | return TASK_NICE(p); | 3991 | return TASK_NICE(p); |
3601 | } | 3992 | } |
@@ -3614,7 +4005,7 @@ int idle_cpu(int cpu) | |||
3614 | * idle_task - return the idle task for a given cpu. | 4005 | * idle_task - return the idle task for a given cpu. |
3615 | * @cpu: the processor in question. | 4006 | * @cpu: the processor in question. |
3616 | */ | 4007 | */ |
3617 | task_t *idle_task(int cpu) | 4008 | struct task_struct *idle_task(int cpu) |
3618 | { | 4009 | { |
3619 | return cpu_rq(cpu)->idle; | 4010 | return cpu_rq(cpu)->idle; |
3620 | } | 4011 | } |
@@ -3623,7 +4014,7 @@ task_t *idle_task(int cpu) | |||
3623 | * find_process_by_pid - find a process with a matching PID value. | 4014 | * find_process_by_pid - find a process with a matching PID value. |
3624 | * @pid: the pid in question. | 4015 | * @pid: the pid in question. |
3625 | */ | 4016 | */ |
3626 | static inline task_t *find_process_by_pid(pid_t pid) | 4017 | static inline struct task_struct *find_process_by_pid(pid_t pid) |
3627 | { | 4018 | { |
3628 | return pid ? find_task_by_pid(pid) : current; | 4019 | return pid ? find_task_by_pid(pid) : current; |
3629 | } | 4020 | } |
@@ -3632,18 +4023,18 @@ static inline task_t *find_process_by_pid(pid_t pid) | |||
3632 | static void __setscheduler(struct task_struct *p, int policy, int prio) | 4023 | static void __setscheduler(struct task_struct *p, int policy, int prio) |
3633 | { | 4024 | { |
3634 | BUG_ON(p->array); | 4025 | BUG_ON(p->array); |
4026 | |||
3635 | p->policy = policy; | 4027 | p->policy = policy; |
3636 | p->rt_priority = prio; | 4028 | p->rt_priority = prio; |
3637 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { | 4029 | p->normal_prio = normal_prio(p); |
3638 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; | 4030 | /* we are holding p->pi_lock already */ |
3639 | } else { | 4031 | p->prio = rt_mutex_getprio(p); |
3640 | p->prio = p->static_prio; | 4032 | /* |
3641 | /* | 4033 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
3642 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | 4034 | */ |
3643 | */ | 4035 | if (policy == SCHED_BATCH) |
3644 | if (policy == SCHED_BATCH) | 4036 | p->sleep_avg = 0; |
3645 | p->sleep_avg = 0; | 4037 | set_load_weight(p); |
3646 | } | ||
3647 | } | 4038 | } |
3648 | 4039 | ||
3649 | /** | 4040 | /** |
@@ -3656,12 +4047,13 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
3656 | int sched_setscheduler(struct task_struct *p, int policy, | 4047 | int sched_setscheduler(struct task_struct *p, int policy, |
3657 | struct sched_param *param) | 4048 | struct sched_param *param) |
3658 | { | 4049 | { |
3659 | int retval; | 4050 | int retval, oldprio, oldpolicy = -1; |
3660 | int oldprio, oldpolicy = -1; | 4051 | struct prio_array *array; |
3661 | prio_array_t *array; | ||
3662 | unsigned long flags; | 4052 | unsigned long flags; |
3663 | runqueue_t *rq; | 4053 | struct rq *rq; |
3664 | 4054 | ||
4055 | /* may grab non-irq protected spin_locks */ | ||
4056 | BUG_ON(in_interrupt()); | ||
3665 | recheck: | 4057 | recheck: |
3666 | /* double check policy once rq lock held */ | 4058 | /* double check policy once rq lock held */ |
3667 | if (policy < 0) | 4059 | if (policy < 0) |
@@ -3710,14 +4102,20 @@ recheck: | |||
3710 | if (retval) | 4102 | if (retval) |
3711 | return retval; | 4103 | return retval; |
3712 | /* | 4104 | /* |
4105 | * make sure no PI-waiters arrive (or leave) while we are | ||
4106 | * changing the priority of the task: | ||
4107 | */ | ||
4108 | spin_lock_irqsave(&p->pi_lock, flags); | ||
4109 | /* | ||
3713 | * To be able to change p->policy safely, the appropriate | 4110 | * To be able to change p->policy safely, the appropriate |
3714 | * runqueue lock must be held. | 4111 | * runqueue lock must be held. |
3715 | */ | 4112 | */ |
3716 | rq = task_rq_lock(p, &flags); | 4113 | rq = __task_rq_lock(p); |
3717 | /* recheck policy now with rq lock held */ | 4114 | /* recheck policy now with rq lock held */ |
3718 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4115 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
3719 | policy = oldpolicy = -1; | 4116 | policy = oldpolicy = -1; |
3720 | task_rq_unlock(rq, &flags); | 4117 | __task_rq_unlock(rq); |
4118 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
3721 | goto recheck; | 4119 | goto recheck; |
3722 | } | 4120 | } |
3723 | array = p->array; | 4121 | array = p->array; |
@@ -3738,7 +4136,11 @@ recheck: | |||
3738 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4136 | } else if (TASK_PREEMPTS_CURR(p, rq)) |
3739 | resched_task(rq->curr); | 4137 | resched_task(rq->curr); |
3740 | } | 4138 | } |
3741 | task_rq_unlock(rq, &flags); | 4139 | __task_rq_unlock(rq); |
4140 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4141 | |||
4142 | rt_mutex_adjust_pi(p); | ||
4143 | |||
3742 | return 0; | 4144 | return 0; |
3743 | } | 4145 | } |
3744 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 4146 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
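sched_setscheduler() now takes p->pi_lock and readjusts PI boosting after the policy change. From userspace the same path is reached through the sched_setscheduler(2) syscall; a minimal caller that switches itself to SCHED_FIFO (needs root or CAP_SYS_NICE to succeed):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp = { .sched_priority = 10 };

            /* pid 0 means the calling process */
            if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0)
                    perror("sched_setscheduler");
            else
                    printf("now SCHED_FIFO, rt priority %d\n", sp.sched_priority);
            return 0;
    }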
@@ -3746,9 +4148,9 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
3746 | static int | 4148 | static int |
3747 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 4149 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
3748 | { | 4150 | { |
3749 | int retval; | ||
3750 | struct sched_param lparam; | 4151 | struct sched_param lparam; |
3751 | struct task_struct *p; | 4152 | struct task_struct *p; |
4153 | int retval; | ||
3752 | 4154 | ||
3753 | if (!param || pid < 0) | 4155 | if (!param || pid < 0) |
3754 | return -EINVAL; | 4156 | return -EINVAL; |
@@ -3760,8 +4162,11 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
3760 | read_unlock_irq(&tasklist_lock); | 4162 | read_unlock_irq(&tasklist_lock); |
3761 | return -ESRCH; | 4163 | return -ESRCH; |
3762 | } | 4164 | } |
3763 | retval = sched_setscheduler(p, policy, &lparam); | 4165 | get_task_struct(p); |
3764 | read_unlock_irq(&tasklist_lock); | 4166 | read_unlock_irq(&tasklist_lock); |
4167 | retval = sched_setscheduler(p, policy, &lparam); | ||
4168 | put_task_struct(p); | ||
4169 | |||
3765 | return retval; | 4170 | return retval; |
3766 | } | 4171 | } |
3767 | 4172 | ||
@@ -3797,8 +4202,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | |||
3797 | */ | 4202 | */ |
3798 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4203 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
3799 | { | 4204 | { |
4205 | struct task_struct *p; | ||
3800 | int retval = -EINVAL; | 4206 | int retval = -EINVAL; |
3801 | task_t *p; | ||
3802 | 4207 | ||
3803 | if (pid < 0) | 4208 | if (pid < 0) |
3804 | goto out_nounlock; | 4209 | goto out_nounlock; |
@@ -3825,8 +4230,8 @@ out_nounlock: | |||
3825 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | 4230 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) |
3826 | { | 4231 | { |
3827 | struct sched_param lp; | 4232 | struct sched_param lp; |
4233 | struct task_struct *p; | ||
3828 | int retval = -EINVAL; | 4234 | int retval = -EINVAL; |
3829 | task_t *p; | ||
3830 | 4235 | ||
3831 | if (!param || pid < 0) | 4236 | if (!param || pid < 0) |
3832 | goto out_nounlock; | 4237 | goto out_nounlock; |
@@ -3859,9 +4264,9 @@ out_unlock: | |||
3859 | 4264 | ||
3860 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 4265 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) |
3861 | { | 4266 | { |
3862 | task_t *p; | ||
3863 | int retval; | ||
3864 | cpumask_t cpus_allowed; | 4267 | cpumask_t cpus_allowed; |
4268 | struct task_struct *p; | ||
4269 | int retval; | ||
3865 | 4270 | ||
3866 | lock_cpu_hotplug(); | 4271 | lock_cpu_hotplug(); |
3867 | read_lock(&tasklist_lock); | 4272 | read_lock(&tasklist_lock); |
@@ -3886,6 +4291,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
3886 | !capable(CAP_SYS_NICE)) | 4291 | !capable(CAP_SYS_NICE)) |
3887 | goto out_unlock; | 4292 | goto out_unlock; |
3888 | 4293 | ||
4294 | retval = security_task_setscheduler(p, 0, NULL); | ||
4295 | if (retval) | ||
4296 | goto out_unlock; | ||
4297 | |||
3889 | cpus_allowed = cpuset_cpus_allowed(p); | 4298 | cpus_allowed = cpuset_cpus_allowed(p); |
3890 | cpus_and(new_mask, new_mask, cpus_allowed); | 4299 | cpus_and(new_mask, new_mask, cpus_allowed); |
3891 | retval = set_cpus_allowed(p, new_mask); | 4300 | retval = set_cpus_allowed(p, new_mask); |
@@ -3943,8 +4352,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | |||
3943 | 4352 | ||
3944 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 4353 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
3945 | { | 4354 | { |
4355 | struct task_struct *p; | ||
3946 | int retval; | 4356 | int retval; |
3947 | task_t *p; | ||
3948 | 4357 | ||
3949 | lock_cpu_hotplug(); | 4358 | lock_cpu_hotplug(); |
3950 | read_lock(&tasklist_lock); | 4359 | read_lock(&tasklist_lock); |
@@ -3954,7 +4363,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
3954 | if (!p) | 4363 | if (!p) |
3955 | goto out_unlock; | 4364 | goto out_unlock; |
3956 | 4365 | ||
3957 | retval = 0; | 4366 | retval = security_task_getscheduler(p); |
4367 | if (retval) | ||
4368 | goto out_unlock; | ||
4369 | |||
3958 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); | 4370 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); |
3959 | 4371 | ||
3960 | out_unlock: | 4372 | out_unlock: |
@@ -4000,9 +4412,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
4000 | */ | 4412 | */ |
4001 | asmlinkage long sys_sched_yield(void) | 4413 | asmlinkage long sys_sched_yield(void) |
4002 | { | 4414 | { |
4003 | runqueue_t *rq = this_rq_lock(); | 4415 | struct rq *rq = this_rq_lock(); |
4004 | prio_array_t *array = current->array; | 4416 | struct prio_array *array = current->array, *target = rq->expired; |
4005 | prio_array_t *target = rq->expired; | ||
4006 | 4417 | ||
4007 | schedstat_inc(rq, yld_cnt); | 4418 | schedstat_inc(rq, yld_cnt); |
4008 | /* | 4419 | /* |
@@ -4036,6 +4447,7 @@ asmlinkage long sys_sched_yield(void) | |||
4036 | * no need to preempt or enable interrupts: | 4447 | * no need to preempt or enable interrupts: |
4037 | */ | 4448 | */ |
4038 | __release(rq->lock); | 4449 | __release(rq->lock); |
4450 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | ||
4039 | _raw_spin_unlock(&rq->lock); | 4451 | _raw_spin_unlock(&rq->lock); |
4040 | preempt_enable_no_resched(); | 4452 | preempt_enable_no_resched(); |
4041 | 4453 | ||
@@ -4044,17 +4456,25 @@ asmlinkage long sys_sched_yield(void) | |||
4044 | return 0; | 4456 | return 0; |
4045 | } | 4457 | } |
4046 | 4458 | ||
4047 | static inline void __cond_resched(void) | 4459 | static inline int __resched_legal(void) |
4048 | { | 4460 | { |
4461 | if (unlikely(preempt_count())) | ||
4462 | return 0; | ||
4463 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
4464 | return 0; | ||
4465 | return 1; | ||
4466 | } | ||
4467 | |||
4468 | static void __cond_resched(void) | ||
4469 | { | ||
4470 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | ||
4471 | __might_sleep(__FILE__, __LINE__); | ||
4472 | #endif | ||
4049 | /* | 4473 | /* |
4050 | * The BKS might be reacquired before we have dropped | 4474 | * The BKS might be reacquired before we have dropped |
4051 | * PREEMPT_ACTIVE, which could trigger a second | 4475 | * PREEMPT_ACTIVE, which could trigger a second |
4052 | * cond_resched() call. | 4476 | * cond_resched() call. |
4053 | */ | 4477 | */ |
4054 | if (unlikely(preempt_count())) | ||
4055 | return; | ||
4056 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
4057 | return; | ||
4058 | do { | 4478 | do { |
4059 | add_preempt_count(PREEMPT_ACTIVE); | 4479 | add_preempt_count(PREEMPT_ACTIVE); |
4060 | schedule(); | 4480 | schedule(); |
@@ -4064,13 +4484,12 @@ static inline void __cond_resched(void) | |||
4064 | 4484 | ||
4065 | int __sched cond_resched(void) | 4485 | int __sched cond_resched(void) |
4066 | { | 4486 | { |
4067 | if (need_resched()) { | 4487 | if (need_resched() && __resched_legal()) { |
4068 | __cond_resched(); | 4488 | __cond_resched(); |
4069 | return 1; | 4489 | return 1; |
4070 | } | 4490 | } |
4071 | return 0; | 4491 | return 0; |
4072 | } | 4492 | } |
4073 | |||
4074 | EXPORT_SYMBOL(cond_resched); | 4493 | EXPORT_SYMBOL(cond_resched); |
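__resched_legal() factors the old preempt-count and system-state early returns out of __cond_resched(), so every cond_resched*() variant applies them before yielding. The caller-side pattern, sketched in userspace with hypothetical stand-ins for the kernel helpers:

    #include <stdio.h>

    /* hypothetical stand-ins for need_resched()/__resched_legal()/schedule() */
    static int need_resched(void)  { return 1; }
    static int resched_legal(void) { return 1; }
    static void reschedule(void)   { /* the kernel would call schedule() */ }

    /* mirrors cond_resched(): yield only when both checks pass */
    static int cond_resched(void)
    {
            if (need_resched() && resched_legal()) {
                    reschedule();
                    return 1;
            }
            return 0;
    }

    int main(void)
    {
            long i, yielded = 0;

            for (i = 0; i < 1000000; i++)
                    yielded += cond_resched();      /* keeps a long loop preemptible */
            printf("yielded %ld times\n", yielded);
            return 0;
    }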
4075 | 4494 | ||
4076 | /* | 4495 | /* |
@@ -4091,7 +4510,8 @@ int cond_resched_lock(spinlock_t *lock) | |||
4091 | ret = 1; | 4510 | ret = 1; |
4092 | spin_lock(lock); | 4511 | spin_lock(lock); |
4093 | } | 4512 | } |
4094 | if (need_resched()) { | 4513 | if (need_resched() && __resched_legal()) { |
4514 | spin_release(&lock->dep_map, 1, _THIS_IP_); | ||
4095 | _raw_spin_unlock(lock); | 4515 | _raw_spin_unlock(lock); |
4096 | preempt_enable_no_resched(); | 4516 | preempt_enable_no_resched(); |
4097 | __cond_resched(); | 4517 | __cond_resched(); |
@@ -4100,25 +4520,24 @@ int cond_resched_lock(spinlock_t *lock) | |||
4100 | } | 4520 | } |
4101 | return ret; | 4521 | return ret; |
4102 | } | 4522 | } |
4103 | |||
4104 | EXPORT_SYMBOL(cond_resched_lock); | 4523 | EXPORT_SYMBOL(cond_resched_lock); |
4105 | 4524 | ||
4106 | int __sched cond_resched_softirq(void) | 4525 | int __sched cond_resched_softirq(void) |
4107 | { | 4526 | { |
4108 | BUG_ON(!in_softirq()); | 4527 | BUG_ON(!in_softirq()); |
4109 | 4528 | ||
4110 | if (need_resched()) { | 4529 | if (need_resched() && __resched_legal()) { |
4111 | __local_bh_enable(); | 4530 | raw_local_irq_disable(); |
4531 | _local_bh_enable(); | ||
4532 | raw_local_irq_enable(); | ||
4112 | __cond_resched(); | 4533 | __cond_resched(); |
4113 | local_bh_disable(); | 4534 | local_bh_disable(); |
4114 | return 1; | 4535 | return 1; |
4115 | } | 4536 | } |
4116 | return 0; | 4537 | return 0; |
4117 | } | 4538 | } |
4118 | |||
4119 | EXPORT_SYMBOL(cond_resched_softirq); | 4539 | EXPORT_SYMBOL(cond_resched_softirq); |
4120 | 4540 | ||
4121 | |||
4122 | /** | 4541 | /** |
4123 | * yield - yield the current processor to other threads. | 4542 | * yield - yield the current processor to other threads. |
4124 | * | 4543 | * |
@@ -4130,7 +4549,6 @@ void __sched yield(void) | |||
4130 | set_current_state(TASK_RUNNING); | 4549 | set_current_state(TASK_RUNNING); |
4131 | sys_sched_yield(); | 4550 | sys_sched_yield(); |
4132 | } | 4551 | } |
4133 | |||
4134 | EXPORT_SYMBOL(yield); | 4552 | EXPORT_SYMBOL(yield); |
4135 | 4553 | ||
4136 | /* | 4554 | /* |
@@ -4142,23 +4560,26 @@ EXPORT_SYMBOL(yield); | |||
4142 | */ | 4560 | */ |
4143 | void __sched io_schedule(void) | 4561 | void __sched io_schedule(void) |
4144 | { | 4562 | { |
4145 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4563 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
4146 | 4564 | ||
4565 | delayacct_blkio_start(); | ||
4147 | atomic_inc(&rq->nr_iowait); | 4566 | atomic_inc(&rq->nr_iowait); |
4148 | schedule(); | 4567 | schedule(); |
4149 | atomic_dec(&rq->nr_iowait); | 4568 | atomic_dec(&rq->nr_iowait); |
4569 | delayacct_blkio_end(); | ||
4150 | } | 4570 | } |
4151 | |||
4152 | EXPORT_SYMBOL(io_schedule); | 4571 | EXPORT_SYMBOL(io_schedule); |
4153 | 4572 | ||
4154 | long __sched io_schedule_timeout(long timeout) | 4573 | long __sched io_schedule_timeout(long timeout) |
4155 | { | 4574 | { |
4156 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4575 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
4157 | long ret; | 4576 | long ret; |
4158 | 4577 | ||
4578 | delayacct_blkio_start(); | ||
4159 | atomic_inc(&rq->nr_iowait); | 4579 | atomic_inc(&rq->nr_iowait); |
4160 | ret = schedule_timeout(timeout); | 4580 | ret = schedule_timeout(timeout); |
4161 | atomic_dec(&rq->nr_iowait); | 4581 | atomic_dec(&rq->nr_iowait); |
4582 | delayacct_blkio_end(); | ||
4162 | return ret; | 4583 | return ret; |
4163 | } | 4584 | } |
4164 | 4585 | ||
@@ -4220,9 +4641,9 @@ asmlinkage long sys_sched_get_priority_min(int policy) | |||
4220 | asmlinkage | 4641 | asmlinkage |
4221 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4642 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
4222 | { | 4643 | { |
4644 | struct task_struct *p; | ||
4223 | int retval = -EINVAL; | 4645 | int retval = -EINVAL; |
4224 | struct timespec t; | 4646 | struct timespec t; |
4225 | task_t *p; | ||
4226 | 4647 | ||
4227 | if (pid < 0) | 4648 | if (pid < 0) |
4228 | goto out_nounlock; | 4649 | goto out_nounlock; |
@@ -4237,7 +4658,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
4237 | if (retval) | 4658 | if (retval) |
4238 | goto out_unlock; | 4659 | goto out_unlock; |
4239 | 4660 | ||
4240 | jiffies_to_timespec(p->policy & SCHED_FIFO ? | 4661 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
4241 | 0 : task_timeslice(p), &t); | 4662 | 0 : task_timeslice(p), &t); |
4242 | read_unlock(&tasklist_lock); | 4663 | read_unlock(&tasklist_lock); |
4243 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4664 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
@@ -4250,35 +4671,36 @@ out_unlock: | |||
4250 | 4671 | ||
4251 | static inline struct task_struct *eldest_child(struct task_struct *p) | 4672 | static inline struct task_struct *eldest_child(struct task_struct *p) |
4252 | { | 4673 | { |
4253 | if (list_empty(&p->children)) return NULL; | 4674 | if (list_empty(&p->children)) |
4675 | return NULL; | ||
4254 | return list_entry(p->children.next,struct task_struct,sibling); | 4676 | return list_entry(p->children.next,struct task_struct,sibling); |
4255 | } | 4677 | } |
4256 | 4678 | ||
4257 | static inline struct task_struct *older_sibling(struct task_struct *p) | 4679 | static inline struct task_struct *older_sibling(struct task_struct *p) |
4258 | { | 4680 | { |
4259 | if (p->sibling.prev==&p->parent->children) return NULL; | 4681 | if (p->sibling.prev==&p->parent->children) |
4682 | return NULL; | ||
4260 | return list_entry(p->sibling.prev,struct task_struct,sibling); | 4683 | return list_entry(p->sibling.prev,struct task_struct,sibling); |
4261 | } | 4684 | } |
4262 | 4685 | ||
4263 | static inline struct task_struct *younger_sibling(struct task_struct *p) | 4686 | static inline struct task_struct *younger_sibling(struct task_struct *p) |
4264 | { | 4687 | { |
4265 | if (p->sibling.next==&p->parent->children) return NULL; | 4688 | if (p->sibling.next==&p->parent->children) |
4689 | return NULL; | ||
4266 | return list_entry(p->sibling.next,struct task_struct,sibling); | 4690 | return list_entry(p->sibling.next,struct task_struct,sibling); |
4267 | } | 4691 | } |
4268 | 4692 | ||
4269 | static void show_task(task_t *p) | 4693 | static const char stat_nam[] = "RSDTtZX"; |
4694 | |||
4695 | static void show_task(struct task_struct *p) | ||
4270 | { | 4696 | { |
4271 | task_t *relative; | 4697 | struct task_struct *relative; |
4272 | unsigned state; | ||
4273 | unsigned long free = 0; | 4698 | unsigned long free = 0; |
4274 | static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; | 4699 | unsigned state; |
4275 | 4700 | ||
4276 | printk("%-13.13s ", p->comm); | ||
4277 | state = p->state ? __ffs(p->state) + 1 : 0; | 4701 | state = p->state ? __ffs(p->state) + 1 : 0; |
4278 | if (state < ARRAY_SIZE(stat_nam)) | 4702 | printk("%-13.13s %c", p->comm, |
4279 | printk(stat_nam[state]); | 4703 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
4280 | else | ||
4281 | printk("?"); | ||
4282 | #if (BITS_PER_LONG == 32) | 4704 | #if (BITS_PER_LONG == 32) |
4283 | if (state == TASK_RUNNING) | 4705 | if (state == TASK_RUNNING) |
4284 | printk(" running "); | 4706 | printk(" running "); |
@@ -4322,7 +4744,7 @@ static void show_task(task_t *p) | |||
4322 | 4744 | ||
4323 | void show_state(void) | 4745 | void show_state(void) |
4324 | { | 4746 | { |
4325 | task_t *g, *p; | 4747 | struct task_struct *g, *p; |
4326 | 4748 | ||
4327 | #if (BITS_PER_LONG == 32) | 4749 | #if (BITS_PER_LONG == 32) |
4328 | printk("\n" | 4750 | printk("\n" |
@@ -4344,7 +4766,7 @@ void show_state(void) | |||
4344 | } while_each_thread(g, p); | 4766 | } while_each_thread(g, p); |
4345 | 4767 | ||
4346 | read_unlock(&tasklist_lock); | 4768 | read_unlock(&tasklist_lock); |
4347 | mutex_debug_show_all_locks(); | 4769 | debug_show_all_locks(); |
4348 | } | 4770 | } |
4349 | 4771 | ||
4350 | /** | 4772 | /** |
@@ -4355,15 +4777,15 @@ void show_state(void) | |||
4355 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 4777 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
4356 | * flag, to make booting more robust. | 4778 | * flag, to make booting more robust. |
4357 | */ | 4779 | */ |
4358 | void __devinit init_idle(task_t *idle, int cpu) | 4780 | void __devinit init_idle(struct task_struct *idle, int cpu) |
4359 | { | 4781 | { |
4360 | runqueue_t *rq = cpu_rq(cpu); | 4782 | struct rq *rq = cpu_rq(cpu); |
4361 | unsigned long flags; | 4783 | unsigned long flags; |
4362 | 4784 | ||
4363 | idle->timestamp = sched_clock(); | 4785 | idle->timestamp = sched_clock(); |
4364 | idle->sleep_avg = 0; | 4786 | idle->sleep_avg = 0; |
4365 | idle->array = NULL; | 4787 | idle->array = NULL; |
4366 | idle->prio = MAX_PRIO; | 4788 | idle->prio = idle->normal_prio = MAX_PRIO; |
4367 | idle->state = TASK_RUNNING; | 4789 | idle->state = TASK_RUNNING; |
4368 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4790 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
4369 | set_task_cpu(idle, cpu); | 4791 | set_task_cpu(idle, cpu); |
@@ -4396,7 +4818,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
4396 | /* | 4818 | /* |
4397 | * This is how migration works: | 4819 | * This is how migration works: |
4398 | * | 4820 | * |
4399 | * 1) we queue a migration_req_t structure in the source CPU's | 4821 | * 1) we queue a struct migration_req structure in the source CPU's |
4400 | * runqueue and wake up that CPU's migration thread. | 4822 | * runqueue and wake up that CPU's migration thread. |
4401 | * 2) we down() the locked semaphore => thread blocks. | 4823 | * 2) we down() the locked semaphore => thread blocks. |
4402 | * 3) migration thread wakes up (implicitly it forces the migrated | 4824 | * 3) migration thread wakes up (implicitly it forces the migrated |
@@ -4418,12 +4840,12 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
4418 | * task must not exit() & deallocate itself prematurely. The | 4840 | * task must not exit() & deallocate itself prematurely. The |
4419 | * call is not atomic; no spinlocks may be held. | 4841 | * call is not atomic; no spinlocks may be held. |
4420 | */ | 4842 | */ |
4421 | int set_cpus_allowed(task_t *p, cpumask_t new_mask) | 4843 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
4422 | { | 4844 | { |
4845 | struct migration_req req; | ||
4423 | unsigned long flags; | 4846 | unsigned long flags; |
4847 | struct rq *rq; | ||
4424 | int ret = 0; | 4848 | int ret = 0; |
4425 | migration_req_t req; | ||
4426 | runqueue_t *rq; | ||
4427 | 4849 | ||
4428 | rq = task_rq_lock(p, &flags); | 4850 | rq = task_rq_lock(p, &flags); |
4429 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 4851 | if (!cpus_intersects(new_mask, cpu_online_map)) { |
@@ -4446,9 +4868,9 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask) | |||
4446 | } | 4868 | } |
4447 | out: | 4869 | out: |
4448 | task_rq_unlock(rq, &flags); | 4870 | task_rq_unlock(rq, &flags); |
4871 | |||
4449 | return ret; | 4872 | return ret; |
4450 | } | 4873 | } |
4451 | |||
4452 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 4874 | EXPORT_SYMBOL_GPL(set_cpus_allowed); |
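set_cpus_allowed() is the kernel end of the affinity path that sys_sched_setaffinity() (changed earlier in this patch to call the security hook) funnels into. From userspace the equivalent request is sched_setaffinity(2); a minimal example that pins the caller to CPU 0:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t mask;

            CPU_ZERO(&mask);
            CPU_SET(0, &mask);      /* allow only CPU 0 */

            /* pid 0 means the calling task */
            if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
                    perror("sched_setaffinity");
            else
                    printf("pinned to CPU 0\n");
            return 0;
    }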
4453 | 4875 | ||
4454 | /* | 4876 | /* |
@@ -4459,13 +4881,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); | |||
4459 | * | 4881 | * |
4460 | * So we race with normal scheduler movements, but that's OK, as long | 4882 | * So we race with normal scheduler movements, but that's OK, as long |
4461 | * as the task is no longer on this CPU. | 4883 | * as the task is no longer on this CPU. |
4884 | * | ||
4885 | * Returns non-zero if task was successfully migrated. | ||
4462 | */ | 4886 | */ |
4463 | static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4887 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
4464 | { | 4888 | { |
4465 | runqueue_t *rq_dest, *rq_src; | 4889 | struct rq *rq_dest, *rq_src; |
4890 | int ret = 0; | ||
4466 | 4891 | ||
4467 | if (unlikely(cpu_is_offline(dest_cpu))) | 4892 | if (unlikely(cpu_is_offline(dest_cpu))) |
4468 | return; | 4893 | return ret; |
4469 | 4894 | ||
4470 | rq_src = cpu_rq(src_cpu); | 4895 | rq_src = cpu_rq(src_cpu); |
4471 | rq_dest = cpu_rq(dest_cpu); | 4896 | rq_dest = cpu_rq(dest_cpu); |
@@ -4489,13 +4914,14 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
4489 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick | 4914 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick |
4490 | + rq_dest->timestamp_last_tick; | 4915 | + rq_dest->timestamp_last_tick; |
4491 | deactivate_task(p, rq_src); | 4916 | deactivate_task(p, rq_src); |
4492 | activate_task(p, rq_dest, 0); | 4917 | __activate_task(p, rq_dest); |
4493 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 4918 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
4494 | resched_task(rq_dest->curr); | 4919 | resched_task(rq_dest->curr); |
4495 | } | 4920 | } |
4496 | 4921 | ret = 1; | |
4497 | out: | 4922 | out: |
4498 | double_rq_unlock(rq_src, rq_dest); | 4923 | double_rq_unlock(rq_src, rq_dest); |
4924 | return ret; | ||
4499 | } | 4925 | } |
4500 | 4926 | ||
4501 | /* | 4927 | /* |
@@ -4505,16 +4931,16 @@ out: | |||
4505 | */ | 4931 | */ |
4506 | static int migration_thread(void *data) | 4932 | static int migration_thread(void *data) |
4507 | { | 4933 | { |
4508 | runqueue_t *rq; | ||
4509 | int cpu = (long)data; | 4934 | int cpu = (long)data; |
4935 | struct rq *rq; | ||
4510 | 4936 | ||
4511 | rq = cpu_rq(cpu); | 4937 | rq = cpu_rq(cpu); |
4512 | BUG_ON(rq->migration_thread != current); | 4938 | BUG_ON(rq->migration_thread != current); |
4513 | 4939 | ||
4514 | set_current_state(TASK_INTERRUPTIBLE); | 4940 | set_current_state(TASK_INTERRUPTIBLE); |
4515 | while (!kthread_should_stop()) { | 4941 | while (!kthread_should_stop()) { |
4942 | struct migration_req *req; | ||
4516 | struct list_head *head; | 4943 | struct list_head *head; |
4517 | migration_req_t *req; | ||
4518 | 4944 | ||
4519 | try_to_freeze(); | 4945 | try_to_freeze(); |
4520 | 4946 | ||
@@ -4538,7 +4964,7 @@ static int migration_thread(void *data) | |||
4538 | set_current_state(TASK_INTERRUPTIBLE); | 4964 | set_current_state(TASK_INTERRUPTIBLE); |
4539 | continue; | 4965 | continue; |
4540 | } | 4966 | } |
4541 | req = list_entry(head->next, migration_req_t, list); | 4967 | req = list_entry(head->next, struct migration_req, list); |
4542 | list_del_init(head->next); | 4968 | list_del_init(head->next); |
4543 | 4969 | ||
4544 | spin_unlock(&rq->lock); | 4970 | spin_unlock(&rq->lock); |
@@ -4563,36 +4989,42 @@ wait_to_die: | |||
4563 | 4989 | ||
4564 | #ifdef CONFIG_HOTPLUG_CPU | 4990 | #ifdef CONFIG_HOTPLUG_CPU |
4565 | /* Figure out where task on dead CPU should go, use force if necessary. */ | 4991 | /* Figure out where task on dead CPU should go, use force if necessary. */ |
4566 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | 4992 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
4567 | { | 4993 | { |
4568 | int dest_cpu; | 4994 | unsigned long flags; |
4569 | cpumask_t mask; | 4995 | cpumask_t mask; |
4996 | struct rq *rq; | ||
4997 | int dest_cpu; | ||
4570 | 4998 | ||
4999 | restart: | ||
4571 | /* On same node? */ | 5000 | /* On same node? */ |
4572 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 5001 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
4573 | cpus_and(mask, mask, tsk->cpus_allowed); | 5002 | cpus_and(mask, mask, p->cpus_allowed); |
4574 | dest_cpu = any_online_cpu(mask); | 5003 | dest_cpu = any_online_cpu(mask); |
4575 | 5004 | ||
4576 | /* On any allowed CPU? */ | 5005 | /* On any allowed CPU? */ |
4577 | if (dest_cpu == NR_CPUS) | 5006 | if (dest_cpu == NR_CPUS) |
4578 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 5007 | dest_cpu = any_online_cpu(p->cpus_allowed); |
4579 | 5008 | ||
4580 | /* No more Mr. Nice Guy. */ | 5009 | /* No more Mr. Nice Guy. */ |
4581 | if (dest_cpu == NR_CPUS) { | 5010 | if (dest_cpu == NR_CPUS) { |
4582 | cpus_setall(tsk->cpus_allowed); | 5011 | rq = task_rq_lock(p, &flags); |
4583 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 5012 | cpus_setall(p->cpus_allowed); |
5013 | dest_cpu = any_online_cpu(p->cpus_allowed); | ||
5014 | task_rq_unlock(rq, &flags); | ||
4584 | 5015 | ||
4585 | /* | 5016 | /* |
4586 | * Don't tell them about moving exiting tasks or | 5017 | * Don't tell them about moving exiting tasks or |
4587 | * kernel threads (both mm NULL), since they never | 5018 | * kernel threads (both mm NULL), since they never |
4588 | * leave kernel. | 5019 | * leave kernel. |
4589 | */ | 5020 | */ |
4590 | if (tsk->mm && printk_ratelimit()) | 5021 | if (p->mm && printk_ratelimit()) |
4591 | printk(KERN_INFO "process %d (%s) no " | 5022 | printk(KERN_INFO "process %d (%s) no " |
4592 | "longer affine to cpu%d\n", | 5023 | "longer affine to cpu%d\n", |
4593 | tsk->pid, tsk->comm, dead_cpu); | 5024 | p->pid, p->comm, dead_cpu); |
4594 | } | 5025 | } |
4595 | __migrate_task(tsk, dead_cpu, dest_cpu); | 5026 | if (!__migrate_task(p, dead_cpu, dest_cpu)) |
5027 | goto restart; | ||
4596 | } | 5028 | } |
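__migrate_task() now reports whether the move actually happened, and move_task_off_dead_cpu() reacts to a failed attempt (for example when the chosen destination goes offline as well) by recomputing a destination and retrying from the restart label. The retry shape in isolation, with hypothetical pick/try helpers:

    #include <stdio.h>
    #include <stdlib.h>

    /* hypothetical helpers: choose a destination, attempt the move */
    static int pick_dest_cpu(void)   { return rand() % 4; }
    static int try_migrate(int dest) { return dest != 3; }  /* pretend CPU 3 died too */

    int main(void)
    {
            int dest;

            do {
                    dest = pick_dest_cpu();         /* recompute on every attempt */
            } while (!try_migrate(dest));           /* retry until the move sticks */

            printf("migrated to CPU %d\n", dest);
            return 0;
    }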
4597 | 5029 | ||
4598 | /* | 5030 | /* |
@@ -4602,9 +5034,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | |||
4602 | * their home CPUs. So we just add the counter to another CPU's counter, | 5034 | * their home CPUs. So we just add the counter to another CPU's counter, |
4603 | * to keep the global sum constant after CPU-down: | 5035 | * to keep the global sum constant after CPU-down: |
4604 | */ | 5036 | */ |
4605 | static void migrate_nr_uninterruptible(runqueue_t *rq_src) | 5037 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
4606 | { | 5038 | { |
4607 | runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 5039 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); |
4608 | unsigned long flags; | 5040 | unsigned long flags; |
4609 | 5041 | ||
4610 | local_irq_save(flags); | 5042 | local_irq_save(flags); |
@@ -4618,48 +5050,51 @@ static void migrate_nr_uninterruptible(runqueue_t *rq_src) | |||
4618 | /* Run through task list and migrate tasks from the dead cpu. */ | 5050 | /* Run through task list and migrate tasks from the dead cpu. */ |
4619 | static void migrate_live_tasks(int src_cpu) | 5051 | static void migrate_live_tasks(int src_cpu) |
4620 | { | 5052 | { |
4621 | struct task_struct *tsk, *t; | 5053 | struct task_struct *p, *t; |
4622 | 5054 | ||
4623 | write_lock_irq(&tasklist_lock); | 5055 | write_lock_irq(&tasklist_lock); |
4624 | 5056 | ||
4625 | do_each_thread(t, tsk) { | 5057 | do_each_thread(t, p) { |
4626 | if (tsk == current) | 5058 | if (p == current) |
4627 | continue; | 5059 | continue; |
4628 | 5060 | ||
4629 | if (task_cpu(tsk) == src_cpu) | 5061 | if (task_cpu(p) == src_cpu) |
4630 | move_task_off_dead_cpu(src_cpu, tsk); | 5062 | move_task_off_dead_cpu(src_cpu, p); |
4631 | } while_each_thread(t, tsk); | 5063 | } while_each_thread(t, p); |
4632 | 5064 | ||
4633 | write_unlock_irq(&tasklist_lock); | 5065 | write_unlock_irq(&tasklist_lock); |
4634 | } | 5066 | } |
4635 | 5067 | ||
4636 | /* Schedules idle task to be the next runnable task on current CPU. | 5068 | /* Schedules idle task to be the next runnable task on current CPU. |
4637 | * It does so by boosting its priority to highest possible and adding it to | 5069 | * It does so by boosting its priority to highest possible and adding it to |
4638 | * the _front_ of runqueue. Used by CPU offline code. | 5070 | * the _front_ of the runqueue. Used by CPU offline code. |
4639 | */ | 5071 | */ |
4640 | void sched_idle_next(void) | 5072 | void sched_idle_next(void) |
4641 | { | 5073 | { |
4642 | int cpu = smp_processor_id(); | 5074 | int this_cpu = smp_processor_id(); |
4643 | runqueue_t *rq = this_rq(); | 5075 | struct rq *rq = cpu_rq(this_cpu); |
4644 | struct task_struct *p = rq->idle; | 5076 | struct task_struct *p = rq->idle; |
4645 | unsigned long flags; | 5077 | unsigned long flags; |
4646 | 5078 | ||
4647 | /* cpu has to be offline */ | 5079 | /* cpu has to be offline */ |
4648 | BUG_ON(cpu_online(cpu)); | 5080 | BUG_ON(cpu_online(this_cpu)); |
4649 | 5081 | ||
4650 | /* Strictly not necessary since rest of the CPUs are stopped by now | 5082 | /* |
4651 | * and interrupts disabled on current cpu. | 5083 | * Strictly not necessary since rest of the CPUs are stopped by now |
5084 | * and interrupts disabled on the current cpu. | ||
4652 | */ | 5085 | */ |
4653 | spin_lock_irqsave(&rq->lock, flags); | 5086 | spin_lock_irqsave(&rq->lock, flags); |
4654 | 5087 | ||
4655 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5088 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); |
4656 | /* Add idle task to _front_ of it's priority queue */ | 5089 | |
5090 | /* Add idle task to the _front_ of its priority queue: */ | ||
4657 | __activate_idle_task(p, rq); | 5091 | __activate_idle_task(p, rq); |
4658 | 5092 | ||
4659 | spin_unlock_irqrestore(&rq->lock, flags); | 5093 | spin_unlock_irqrestore(&rq->lock, flags); |
4660 | } | 5094 | } |
4661 | 5095 | ||
4662 | /* Ensures that the idle task is using init_mm right before its cpu goes | 5096 | /* |
5097 | * Ensures that the idle task is using init_mm right before its cpu goes | ||
4663 | * offline. | 5098 | * offline. |
4664 | */ | 5099 | */ |
4665 | void idle_task_exit(void) | 5100 | void idle_task_exit(void) |
@@ -4673,17 +5108,17 @@ void idle_task_exit(void) | |||
4673 | mmdrop(mm); | 5108 | mmdrop(mm); |
4674 | } | 5109 | } |
4675 | 5110 | ||
4676 | static void migrate_dead(unsigned int dead_cpu, task_t *tsk) | 5111 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
4677 | { | 5112 | { |
4678 | struct runqueue *rq = cpu_rq(dead_cpu); | 5113 | struct rq *rq = cpu_rq(dead_cpu); |
4679 | 5114 | ||
4680 | /* Must be exiting, otherwise would be on tasklist. */ | 5115 | /* Must be exiting, otherwise would be on tasklist. */ |
4681 | BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); | 5116 | BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); |
4682 | 5117 | ||
4683 | /* Cannot have done final schedule yet: would have vanished. */ | 5118 | /* Cannot have done final schedule yet: would have vanished. */ |
4684 | BUG_ON(tsk->flags & PF_DEAD); | 5119 | BUG_ON(p->flags & PF_DEAD); |
4685 | 5120 | ||
4686 | get_task_struct(tsk); | 5121 | get_task_struct(p); |
4687 | 5122 | ||
4688 | /* | 5123 | /* |
4689 | * Drop lock around migration; if someone else moves it, | 5124 | * Drop lock around migration; if someone else moves it, |
@@ -4691,25 +5126,25 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk) | |||
4691 | * fine. | 5126 | * fine. |
4692 | */ | 5127 | */ |
4693 | spin_unlock_irq(&rq->lock); | 5128 | spin_unlock_irq(&rq->lock); |
4694 | move_task_off_dead_cpu(dead_cpu, tsk); | 5129 | move_task_off_dead_cpu(dead_cpu, p); |
4695 | spin_lock_irq(&rq->lock); | 5130 | spin_lock_irq(&rq->lock); |
4696 | 5131 | ||
4697 | put_task_struct(tsk); | 5132 | put_task_struct(p); |
4698 | } | 5133 | } |
4699 | 5134 | ||
4700 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | 5135 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ |
4701 | static void migrate_dead_tasks(unsigned int dead_cpu) | 5136 | static void migrate_dead_tasks(unsigned int dead_cpu) |
4702 | { | 5137 | { |
4703 | unsigned arr, i; | 5138 | struct rq *rq = cpu_rq(dead_cpu); |
4704 | struct runqueue *rq = cpu_rq(dead_cpu); | 5139 | unsigned int arr, i; |
4705 | 5140 | ||
4706 | for (arr = 0; arr < 2; arr++) { | 5141 | for (arr = 0; arr < 2; arr++) { |
4707 | for (i = 0; i < MAX_PRIO; i++) { | 5142 | for (i = 0; i < MAX_PRIO; i++) { |
4708 | struct list_head *list = &rq->arrays[arr].queue[i]; | 5143 | struct list_head *list = &rq->arrays[arr].queue[i]; |
5144 | |||
4709 | while (!list_empty(list)) | 5145 | while (!list_empty(list)) |
4710 | migrate_dead(dead_cpu, | 5146 | migrate_dead(dead_cpu, list_entry(list->next, |
4711 | list_entry(list->next, task_t, | 5147 | struct task_struct, run_list)); |
4712 | run_list)); | ||
4713 | } | 5148 | } |
4714 | } | 5149 | } |
4715 | } | 5150 | } |
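The migrate_dead_tasks() loop above pulls each exiting task off the dead CPU by turning the embedded run_list node back into its owning task_struct with list_entry(), which is just the kernel's container_of() pattern. A minimal stand-alone sketch of that pattern in plain C (the struct and field names here are invented for illustration, not kernel types):

    #include <stddef.h>
    #include <stdio.h>

    /* same idea as the kernel's container_of()/list_entry() */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct list_node { struct list_node *next, *prev; };

    struct task {                      /* hypothetical stand-in for task_struct */
        int pid;
        struct list_node run_list;     /* embedded, like p->run_list above */
    };

    int main(void)
    {
        struct task t = { .pid = 42 };
        struct list_node *node = &t.run_list;   /* what the list walk hands us */

        /* recover the enclosing structure from the embedded node */
        struct task *p = container_of(node, struct task, run_list);
        printf("pid = %d\n", p->pid);           /* prints 42 */
        return 0;
    }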
@@ -4719,13 +5154,13 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
4719 | * migration_call - callback that gets triggered when a CPU is added. | 5154 | * migration_call - callback that gets triggered when a CPU is added. |
4720 | * Here we can start up the necessary migration thread for the new CPU. | 5155 | * Here we can start up the necessary migration thread for the new CPU. |
4721 | */ | 5156 | */ |
4722 | static int migration_call(struct notifier_block *nfb, unsigned long action, | 5157 | static int __cpuinit |
4723 | void *hcpu) | 5158 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
4724 | { | 5159 | { |
4725 | int cpu = (long)hcpu; | ||
4726 | struct task_struct *p; | 5160 | struct task_struct *p; |
4727 | struct runqueue *rq; | 5161 | int cpu = (long)hcpu; |
4728 | unsigned long flags; | 5162 | unsigned long flags; |
5163 | struct rq *rq; | ||
4729 | 5164 | ||
4730 | switch (action) { | 5165 | switch (action) { |
4731 | case CPU_UP_PREPARE: | 5166 | case CPU_UP_PREPARE: |
@@ -4740,18 +5175,23 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4740 | task_rq_unlock(rq, &flags); | 5175 | task_rq_unlock(rq, &flags); |
4741 | cpu_rq(cpu)->migration_thread = p; | 5176 | cpu_rq(cpu)->migration_thread = p; |
4742 | break; | 5177 | break; |
5178 | |||
4743 | case CPU_ONLINE: | 5179 | case CPU_ONLINE: |
4744 | /* Strictly unnecessary, as first user will wake it. */ | 5180 | /* Strictly unnecessary, as first user will wake it. */ |
4745 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5181 | wake_up_process(cpu_rq(cpu)->migration_thread); |
4746 | break; | 5182 | break; |
5183 | |||
4747 | #ifdef CONFIG_HOTPLUG_CPU | 5184 | #ifdef CONFIG_HOTPLUG_CPU |
4748 | case CPU_UP_CANCELED: | 5185 | case CPU_UP_CANCELED: |
5186 | if (!cpu_rq(cpu)->migration_thread) | ||
5187 | break; | ||
4749 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 5188 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
4750 | kthread_bind(cpu_rq(cpu)->migration_thread, | 5189 | kthread_bind(cpu_rq(cpu)->migration_thread, |
4751 | any_online_cpu(cpu_online_map)); | 5190 | any_online_cpu(cpu_online_map)); |
4752 | kthread_stop(cpu_rq(cpu)->migration_thread); | 5191 | kthread_stop(cpu_rq(cpu)->migration_thread); |
4753 | cpu_rq(cpu)->migration_thread = NULL; | 5192 | cpu_rq(cpu)->migration_thread = NULL; |
4754 | break; | 5193 | break; |
5194 | |||
4755 | case CPU_DEAD: | 5195 | case CPU_DEAD: |
4756 | migrate_live_tasks(cpu); | 5196 | migrate_live_tasks(cpu); |
4757 | rq = cpu_rq(cpu); | 5197 | rq = cpu_rq(cpu); |
@@ -4772,9 +5212,10 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4772 | * the requestors. */ | 5212 | * the requestors. */ |
4773 | spin_lock_irq(&rq->lock); | 5213 | spin_lock_irq(&rq->lock); |
4774 | while (!list_empty(&rq->migration_queue)) { | 5214 | while (!list_empty(&rq->migration_queue)) { |
4775 | migration_req_t *req; | 5215 | struct migration_req *req; |
5216 | |||
4776 | req = list_entry(rq->migration_queue.next, | 5217 | req = list_entry(rq->migration_queue.next, |
4777 | migration_req_t, list); | 5218 | struct migration_req, list); |
4778 | list_del_init(&req->list); | 5219 | list_del_init(&req->list); |
4779 | complete(&req->done); | 5220 | complete(&req->done); |
4780 | } | 5221 | } |
@@ -4788,7 +5229,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4788 | /* Register at highest priority so that task migration (migrate_all_tasks) | 5229 | /* Register at highest priority so that task migration (migrate_all_tasks) |
4789 | * happens before everything else. | 5230 | * happens before everything else. |
4790 | */ | 5231 | */ |
4791 | static struct notifier_block migration_notifier = { | 5232 | static struct notifier_block __cpuinitdata migration_notifier = { |
4792 | .notifier_call = migration_call, | 5233 | .notifier_call = migration_call, |
4793 | .priority = 10 | 5234 | .priority = 10 |
4794 | }; | 5235 | }; |
@@ -4796,10 +5237,12 @@ static struct notifier_block migration_notifier = { | |||
4796 | int __init migration_init(void) | 5237 | int __init migration_init(void) |
4797 | { | 5238 | { |
4798 | void *cpu = (void *)(long)smp_processor_id(); | 5239 | void *cpu = (void *)(long)smp_processor_id(); |
4799 | /* Start one for boot CPU. */ | 5240 | |
5241 | /* Start one for the boot CPU: */ | ||
4800 | migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 5242 | migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
4801 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 5243 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
4802 | register_cpu_notifier(&migration_notifier); | 5244 | register_cpu_notifier(&migration_notifier); |
5245 | |||
4803 | return 0; | 5246 | return 0; |
4804 | } | 5247 | } |
4805 | #endif | 5248 | #endif |
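migration_init() above shows the usual bootstrap-then-register shape for CPU hotplug notifiers: the callback is invoked by hand for the already-running boot CPU (CPU_UP_PREPARE, then CPU_ONLINE) and only afterwards registered for future hotplug events. A hedged sketch of the same shape for a hypothetical per-CPU facility (my_cpu_callback, my_cpu_nfb and my_init are made-up names; the notifier calls and annotations are the ones used in this hunk):

    #include <linux/init.h>
    #include <linux/notifier.h>
    #include <linux/cpu.h>
    #include <linux/smp.h>

    static int __cpuinit my_cpu_callback(struct notifier_block *nfb,
                                         unsigned long action, void *hcpu)
    {
        switch (action) {
        case CPU_UP_PREPARE:
            /* allocate per-cpu state for CPU (long)hcpu here; may fail */
            break;
        case CPU_ONLINE:
            /* kick the per-cpu worker, as wake_up_process() does above */
            break;
        }
        return NOTIFY_OK;
    }

    static struct notifier_block __cpuinitdata my_cpu_nfb = {
        .notifier_call = my_cpu_callback,
    };

    static int __init my_init(void)
    {
        void *cpu = (void *)(long)smp_processor_id();

        /* run the callback by hand for the boot CPU ... */
        my_cpu_callback(&my_cpu_nfb, CPU_UP_PREPARE, cpu);
        my_cpu_callback(&my_cpu_nfb, CPU_ONLINE, cpu);
        /* ... then register for CPUs that come and go later */
        register_cpu_notifier(&my_cpu_nfb);
        return 0;
    }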
@@ -4895,7 +5338,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
4895 | } while (sd); | 5338 | } while (sd); |
4896 | } | 5339 | } |
4897 | #else | 5340 | #else |
4898 | #define sched_domain_debug(sd, cpu) {} | 5341 | # define sched_domain_debug(sd, cpu) do { } while (0) |
4899 | #endif | 5342 | #endif |
4900 | 5343 | ||
4901 | static int sd_degenerate(struct sched_domain *sd) | 5344 | static int sd_degenerate(struct sched_domain *sd) |
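The sched_domain_debug() stub changing from "{}" to "do { } while (0)" is the standard fix for statement-like empty macros: a bare {} plus the caller's semicolon leaves a stray empty statement that detaches a following else, whereas do { } while (0) absorbs the semicolon and still behaves as a single statement. A tiny illustration (not kernel code):

    #define DEBUG_EMPTY_BAD(x)  {}                 /* breaks if/else nesting   */
    #define DEBUG_EMPTY_GOOD(x) do { } while (0)   /* expands to one statement */

    void f(int c)
    {
        if (c)
            DEBUG_EMPTY_GOOD(c);   /* "do { } while (0);" -- still one statement  */
        else                       /* with DEBUG_EMPTY_BAD(c); the "{}" ends the  */
            c = 0;                 /* if, the ';' is an extra statement, and this */
    }                              /* else no longer matches: compile error       */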
@@ -4921,8 +5364,8 @@ static int sd_degenerate(struct sched_domain *sd) | |||
4921 | return 1; | 5364 | return 1; |
4922 | } | 5365 | } |
4923 | 5366 | ||
4924 | static int sd_parent_degenerate(struct sched_domain *sd, | 5367 | static int |
4925 | struct sched_domain *parent) | 5368 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) |
4926 | { | 5369 | { |
4927 | unsigned long cflags = sd->flags, pflags = parent->flags; | 5370 | unsigned long cflags = sd->flags, pflags = parent->flags; |
4928 | 5371 | ||
@@ -4955,7 +5398,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, | |||
4955 | */ | 5398 | */ |
4956 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 5399 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) |
4957 | { | 5400 | { |
4958 | runqueue_t *rq = cpu_rq(cpu); | 5401 | struct rq *rq = cpu_rq(cpu); |
4959 | struct sched_domain *tmp; | 5402 | struct sched_domain *tmp; |
4960 | 5403 | ||
4961 | /* Remove the sched domains which do not contribute to scheduling. */ | 5404 | /* Remove the sched domains which do not contribute to scheduling. */ |
@@ -5217,8 +5660,8 @@ static void touch_cache(void *__cache, unsigned long __size) | |||
5217 | /* | 5660 | /* |
5218 | * Measure the cache-cost of one task migration. Returns in units of nsec. | 5661 | * Measure the cache-cost of one task migration. Returns in units of nsec. |
5219 | */ | 5662 | */ |
5220 | static unsigned long long measure_one(void *cache, unsigned long size, | 5663 | static unsigned long long |
5221 | int source, int target) | 5664 | measure_one(void *cache, unsigned long size, int source, int target) |
5222 | { | 5665 | { |
5223 | cpumask_t mask, saved_mask; | 5666 | cpumask_t mask, saved_mask; |
5224 | unsigned long long t0, t1, t2, t3, cost; | 5667 | unsigned long long t0, t1, t2, t3, cost; |
@@ -5370,7 +5813,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
5370 | cache = vmalloc(max_size); | 5813 | cache = vmalloc(max_size); |
5371 | if (!cache) { | 5814 | if (!cache) { |
5372 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | 5815 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); |
5373 | return 1000000; // return 1 msec on very small boxen | 5816 | return 1000000; /* return 1 msec on very small boxen */ |
5374 | } | 5817 | } |
5375 | 5818 | ||
5376 | while (size <= max_size) { | 5819 | while (size <= max_size) { |
@@ -5568,9 +6011,9 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
5568 | */ | 6011 | */ |
5569 | static cpumask_t sched_domain_node_span(int node) | 6012 | static cpumask_t sched_domain_node_span(int node) |
5570 | { | 6013 | { |
5571 | int i; | ||
5572 | cpumask_t span, nodemask; | ||
5573 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 6014 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); |
6015 | cpumask_t span, nodemask; | ||
6016 | int i; | ||
5574 | 6017 | ||
5575 | cpus_clear(span); | 6018 | cpus_clear(span); |
5576 | bitmap_zero(used_nodes, MAX_NUMNODES); | 6019 | bitmap_zero(used_nodes, MAX_NUMNODES); |
@@ -5581,6 +6024,7 @@ static cpumask_t sched_domain_node_span(int node) | |||
5581 | 6024 | ||
5582 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6025 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
5583 | int next_node = find_next_best_node(node, used_nodes); | 6026 | int next_node = find_next_best_node(node, used_nodes); |
6027 | |||
5584 | nodemask = node_to_cpumask(next_node); | 6028 | nodemask = node_to_cpumask(next_node); |
5585 | cpus_or(span, span, nodemask); | 6029 | cpus_or(span, span, nodemask); |
5586 | } | 6030 | } |
@@ -5589,22 +6033,27 @@ static cpumask_t sched_domain_node_span(int node) | |||
5589 | } | 6033 | } |
5590 | #endif | 6034 | #endif |
5591 | 6035 | ||
6036 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
6037 | |||
5592 | /* | 6038 | /* |
5593 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | 6039 | * SMT sched-domains: |
5594 | * can switch it on easily if needed. | ||
5595 | */ | 6040 | */ |
5596 | #ifdef CONFIG_SCHED_SMT | 6041 | #ifdef CONFIG_SCHED_SMT |
5597 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6042 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
5598 | static struct sched_group sched_group_cpus[NR_CPUS]; | 6043 | static struct sched_group sched_group_cpus[NR_CPUS]; |
6044 | |||
5599 | static int cpu_to_cpu_group(int cpu) | 6045 | static int cpu_to_cpu_group(int cpu) |
5600 | { | 6046 | { |
5601 | return cpu; | 6047 | return cpu; |
5602 | } | 6048 | } |
5603 | #endif | 6049 | #endif |
5604 | 6050 | ||
6051 | /* | ||
6052 | * multi-core sched-domains: | ||
6053 | */ | ||
5605 | #ifdef CONFIG_SCHED_MC | 6054 | #ifdef CONFIG_SCHED_MC |
5606 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6055 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
5607 | static struct sched_group sched_group_core[NR_CPUS]; | 6056 | static struct sched_group *sched_group_core_bycpu[NR_CPUS]; |
5608 | #endif | 6057 | #endif |
5609 | 6058 | ||
5610 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6059 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
@@ -5620,10 +6069,11 @@ static int cpu_to_core_group(int cpu) | |||
5620 | #endif | 6069 | #endif |
5621 | 6070 | ||
5622 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6071 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
5623 | static struct sched_group sched_group_phys[NR_CPUS]; | 6072 | static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; |
6073 | |||
5624 | static int cpu_to_phys_group(int cpu) | 6074 | static int cpu_to_phys_group(int cpu) |
5625 | { | 6075 | { |
5626 | #if defined(CONFIG_SCHED_MC) | 6076 | #ifdef CONFIG_SCHED_MC |
5627 | cpumask_t mask = cpu_coregroup_map(cpu); | 6077 | cpumask_t mask = cpu_coregroup_map(cpu); |
5628 | return first_cpu(mask); | 6078 | return first_cpu(mask); |
5629 | #elif defined(CONFIG_SCHED_SMT) | 6079 | #elif defined(CONFIG_SCHED_SMT) |
@@ -5677,13 +6127,74 @@ next_sg: | |||
5677 | } | 6127 | } |
5678 | #endif | 6128 | #endif |
5679 | 6129 | ||
6130 | /* Free memory allocated for various sched_group structures */ | ||
6131 | static void free_sched_groups(const cpumask_t *cpu_map) | ||
6132 | { | ||
6133 | int cpu; | ||
6134 | #ifdef CONFIG_NUMA | ||
6135 | int i; | ||
6136 | |||
6137 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6138 | struct sched_group *sched_group_allnodes | ||
6139 | = sched_group_allnodes_bycpu[cpu]; | ||
6140 | struct sched_group **sched_group_nodes | ||
6141 | = sched_group_nodes_bycpu[cpu]; | ||
6142 | |||
6143 | if (sched_group_allnodes) { | ||
6144 | kfree(sched_group_allnodes); | ||
6145 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
6146 | } | ||
6147 | |||
6148 | if (!sched_group_nodes) | ||
6149 | continue; | ||
6150 | |||
6151 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
6152 | cpumask_t nodemask = node_to_cpumask(i); | ||
6153 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
6154 | |||
6155 | cpus_and(nodemask, nodemask, *cpu_map); | ||
6156 | if (cpus_empty(nodemask)) | ||
6157 | continue; | ||
6158 | |||
6159 | if (sg == NULL) | ||
6160 | continue; | ||
6161 | sg = sg->next; | ||
6162 | next_sg: | ||
6163 | oldsg = sg; | ||
6164 | sg = sg->next; | ||
6165 | kfree(oldsg); | ||
6166 | if (oldsg != sched_group_nodes[i]) | ||
6167 | goto next_sg; | ||
6168 | } | ||
6169 | kfree(sched_group_nodes); | ||
6170 | sched_group_nodes_bycpu[cpu] = NULL; | ||
6171 | } | ||
6172 | #endif | ||
6173 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6174 | if (sched_group_phys_bycpu[cpu]) { | ||
6175 | kfree(sched_group_phys_bycpu[cpu]); | ||
6176 | sched_group_phys_bycpu[cpu] = NULL; | ||
6177 | } | ||
6178 | #ifdef CONFIG_SCHED_MC | ||
6179 | if (sched_group_core_bycpu[cpu]) { | ||
6180 | kfree(sched_group_core_bycpu[cpu]); | ||
6181 | sched_group_core_bycpu[cpu] = NULL; | ||
6182 | } | ||
6183 | #endif | ||
6184 | } | ||
6185 | } | ||
6186 | |||
5680 | /* | 6187 | /* |
5681 | * Build sched domains for a given set of cpus and attach the sched domains | 6188 | * Build sched domains for a given set of cpus and attach the sched domains |
5682 | * to the individual cpus | 6189 | * to the individual cpus |
5683 | */ | 6190 | */ |
5684 | void build_sched_domains(const cpumask_t *cpu_map) | 6191 | static int build_sched_domains(const cpumask_t *cpu_map) |
5685 | { | 6192 | { |
5686 | int i; | 6193 | int i; |
6194 | struct sched_group *sched_group_phys = NULL; | ||
6195 | #ifdef CONFIG_SCHED_MC | ||
6196 | struct sched_group *sched_group_core = NULL; | ||
6197 | #endif | ||
5687 | #ifdef CONFIG_NUMA | 6198 | #ifdef CONFIG_NUMA |
5688 | struct sched_group **sched_group_nodes = NULL; | 6199 | struct sched_group **sched_group_nodes = NULL; |
5689 | struct sched_group *sched_group_allnodes = NULL; | 6200 | struct sched_group *sched_group_allnodes = NULL; |
@@ -5691,11 +6202,11 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5691 | /* | 6202 | /* |
5692 | * Allocate the per-node list of sched groups | 6203 | * Allocate the per-node list of sched groups |
5693 | */ | 6204 | */ |
5694 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 6205 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, |
5695 | GFP_ATOMIC); | 6206 | GFP_KERNEL); |
5696 | if (!sched_group_nodes) { | 6207 | if (!sched_group_nodes) { |
5697 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6208 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
5698 | return; | 6209 | return -ENOMEM; |
5699 | } | 6210 | } |
5700 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6211 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
5701 | #endif | 6212 | #endif |
@@ -5721,7 +6232,7 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5721 | if (!sched_group_allnodes) { | 6232 | if (!sched_group_allnodes) { |
5722 | printk(KERN_WARNING | 6233 | printk(KERN_WARNING |
5723 | "Can not alloc allnodes sched group\n"); | 6234 | "Can not alloc allnodes sched group\n"); |
5724 | break; | 6235 | goto error; |
5725 | } | 6236 | } |
5726 | sched_group_allnodes_bycpu[i] | 6237 | sched_group_allnodes_bycpu[i] |
5727 | = sched_group_allnodes; | 6238 | = sched_group_allnodes; |
@@ -5742,6 +6253,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5742 | cpus_and(sd->span, sd->span, *cpu_map); | 6253 | cpus_and(sd->span, sd->span, *cpu_map); |
5743 | #endif | 6254 | #endif |
5744 | 6255 | ||
6256 | if (!sched_group_phys) { | ||
6257 | sched_group_phys | ||
6258 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
6259 | GFP_KERNEL); | ||
6260 | if (!sched_group_phys) { | ||
6261 | printk (KERN_WARNING "Can not alloc phys sched " | ||
6262 | "group\n"); | ||
6263 | goto error; | ||
6264 | } | ||
6265 | sched_group_phys_bycpu[i] = sched_group_phys; | ||
6266 | } | ||
6267 | |||
5745 | p = sd; | 6268 | p = sd; |
5746 | sd = &per_cpu(phys_domains, i); | 6269 | sd = &per_cpu(phys_domains, i); |
5747 | group = cpu_to_phys_group(i); | 6270 | group = cpu_to_phys_group(i); |
@@ -5751,6 +6274,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5751 | sd->groups = &sched_group_phys[group]; | 6274 | sd->groups = &sched_group_phys[group]; |
5752 | 6275 | ||
5753 | #ifdef CONFIG_SCHED_MC | 6276 | #ifdef CONFIG_SCHED_MC |
6277 | if (!sched_group_core) { | ||
6278 | sched_group_core | ||
6279 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
6280 | GFP_KERNEL); | ||
6281 | if (!sched_group_core) { | ||
6282 | printk (KERN_WARNING "Can not alloc core sched " | ||
6283 | "group\n"); | ||
6284 | goto error; | ||
6285 | } | ||
6286 | sched_group_core_bycpu[i] = sched_group_core; | ||
6287 | } | ||
6288 | |||
5754 | p = sd; | 6289 | p = sd; |
5755 | sd = &per_cpu(core_domains, i); | 6290 | sd = &per_cpu(core_domains, i); |
5756 | group = cpu_to_core_group(i); | 6291 | group = cpu_to_core_group(i); |
@@ -5834,24 +6369,21 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5834 | domainspan = sched_domain_node_span(i); | 6369 | domainspan = sched_domain_node_span(i); |
5835 | cpus_and(domainspan, domainspan, *cpu_map); | 6370 | cpus_and(domainspan, domainspan, *cpu_map); |
5836 | 6371 | ||
5837 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6372 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
6373 | if (!sg) { | ||
6374 | printk(KERN_WARNING "Can not alloc domain group for " | ||
6375 | "node %d\n", i); | ||
6376 | goto error; | ||
6377 | } | ||
5838 | sched_group_nodes[i] = sg; | 6378 | sched_group_nodes[i] = sg; |
5839 | for_each_cpu_mask(j, nodemask) { | 6379 | for_each_cpu_mask(j, nodemask) { |
5840 | struct sched_domain *sd; | 6380 | struct sched_domain *sd; |
5841 | sd = &per_cpu(node_domains, j); | 6381 | sd = &per_cpu(node_domains, j); |
5842 | sd->groups = sg; | 6382 | sd->groups = sg; |
5843 | if (sd->groups == NULL) { | ||
5844 | /* Turn off balancing if we have no groups */ | ||
5845 | sd->flags = 0; | ||
5846 | } | ||
5847 | } | ||
5848 | if (!sg) { | ||
5849 | printk(KERN_WARNING | ||
5850 | "Can not alloc domain group for node %d\n", i); | ||
5851 | continue; | ||
5852 | } | 6383 | } |
5853 | sg->cpu_power = 0; | 6384 | sg->cpu_power = 0; |
5854 | sg->cpumask = nodemask; | 6385 | sg->cpumask = nodemask; |
6386 | sg->next = sg; | ||
5855 | cpus_or(covered, covered, nodemask); | 6387 | cpus_or(covered, covered, nodemask); |
5856 | prev = sg; | 6388 | prev = sg; |
5857 | 6389 | ||
@@ -5870,54 +6402,90 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5870 | if (cpus_empty(tmp)) | 6402 | if (cpus_empty(tmp)) |
5871 | continue; | 6403 | continue; |
5872 | 6404 | ||
5873 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6405 | sg = kmalloc_node(sizeof(struct sched_group), |
6406 | GFP_KERNEL, i); | ||
5874 | if (!sg) { | 6407 | if (!sg) { |
5875 | printk(KERN_WARNING | 6408 | printk(KERN_WARNING |
5876 | "Can not alloc domain group for node %d\n", j); | 6409 | "Can not alloc domain group for node %d\n", j); |
5877 | break; | 6410 | goto error; |
5878 | } | 6411 | } |
5879 | sg->cpu_power = 0; | 6412 | sg->cpu_power = 0; |
5880 | sg->cpumask = tmp; | 6413 | sg->cpumask = tmp; |
6414 | sg->next = prev->next; | ||
5881 | cpus_or(covered, covered, tmp); | 6415 | cpus_or(covered, covered, tmp); |
5882 | prev->next = sg; | 6416 | prev->next = sg; |
5883 | prev = sg; | 6417 | prev = sg; |
5884 | } | 6418 | } |
5885 | prev->next = sched_group_nodes[i]; | ||
5886 | } | 6419 | } |
5887 | #endif | 6420 | #endif |
5888 | 6421 | ||
5889 | /* Calculate CPU power for physical packages and nodes */ | 6422 | /* Calculate CPU power for physical packages and nodes */ |
6423 | #ifdef CONFIG_SCHED_SMT | ||
5890 | for_each_cpu_mask(i, *cpu_map) { | 6424 | for_each_cpu_mask(i, *cpu_map) { |
5891 | int power; | ||
5892 | struct sched_domain *sd; | 6425 | struct sched_domain *sd; |
5893 | #ifdef CONFIG_SCHED_SMT | ||
5894 | sd = &per_cpu(cpu_domains, i); | 6426 | sd = &per_cpu(cpu_domains, i); |
5895 | power = SCHED_LOAD_SCALE; | 6427 | sd->groups->cpu_power = SCHED_LOAD_SCALE; |
5896 | sd->groups->cpu_power = power; | 6428 | } |
5897 | #endif | 6429 | #endif |
5898 | #ifdef CONFIG_SCHED_MC | 6430 | #ifdef CONFIG_SCHED_MC |
6431 | for_each_cpu_mask(i, *cpu_map) { | ||
6432 | int power; | ||
6433 | struct sched_domain *sd; | ||
5899 | sd = &per_cpu(core_domains, i); | 6434 | sd = &per_cpu(core_domains, i); |
5900 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | 6435 | if (sched_smt_power_savings) |
6436 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); | ||
6437 | else | ||
6438 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | ||
5901 | * SCHED_LOAD_SCALE / 10; | 6439 | * SCHED_LOAD_SCALE / 10; |
5902 | sd->groups->cpu_power = power; | 6440 | sd->groups->cpu_power = power; |
6441 | } | ||
6442 | #endif | ||
5903 | 6443 | ||
6444 | for_each_cpu_mask(i, *cpu_map) { | ||
6445 | struct sched_domain *sd; | ||
6446 | #ifdef CONFIG_SCHED_MC | ||
5904 | sd = &per_cpu(phys_domains, i); | 6447 | sd = &per_cpu(phys_domains, i); |
6448 | if (i != first_cpu(sd->groups->cpumask)) | ||
6449 | continue; | ||
5905 | 6450 | ||
5906 | /* | 6451 | sd->groups->cpu_power = 0; |
5907 | * This has to be < 2 * SCHED_LOAD_SCALE | 6452 | if (sched_mc_power_savings || sched_smt_power_savings) { |
5908 | * Lets keep it SCHED_LOAD_SCALE, so that | 6453 | int j; |
5909 | * while calculating NUMA group's cpu_power | 6454 | |
5910 | * we can simply do | 6455 | for_each_cpu_mask(j, sd->groups->cpumask) { |
5911 | * numa_group->cpu_power += phys_group->cpu_power; | 6456 | struct sched_domain *sd1; |
5912 | * | 6457 | sd1 = &per_cpu(core_domains, j); |
5913 | * See "only add power once for each physical pkg" | 6458 | /* |
5914 | * comment below | 6459 | * for each core we will add once |
5915 | */ | 6460 | * to the group in physical domain |
5916 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | 6461 | */ |
6462 | if (j != first_cpu(sd1->groups->cpumask)) | ||
6463 | continue; | ||
6464 | |||
6465 | if (sched_smt_power_savings) | ||
6466 | sd->groups->cpu_power += sd1->groups->cpu_power; | ||
6467 | else | ||
6468 | sd->groups->cpu_power += SCHED_LOAD_SCALE; | ||
6469 | } | ||
6470 | } else | ||
6471 | /* | ||
6472 | * This has to be < 2 * SCHED_LOAD_SCALE | ||
6473 | * Lets keep it SCHED_LOAD_SCALE, so that | ||
6474 | * while calculating NUMA group's cpu_power | ||
6475 | * we can simply do | ||
6476 | * numa_group->cpu_power += phys_group->cpu_power; | ||
6477 | * | ||
6478 | * See "only add power once for each physical pkg" | ||
6479 | * comment below | ||
6480 | */ | ||
6481 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | ||
5917 | #else | 6482 | #else |
6483 | int power; | ||
5918 | sd = &per_cpu(phys_domains, i); | 6484 | sd = &per_cpu(phys_domains, i); |
5919 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 6485 | if (sched_smt_power_savings) |
5920 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 6486 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); |
6487 | else | ||
6488 | power = SCHED_LOAD_SCALE; | ||
5921 | sd->groups->cpu_power = power; | 6489 | sd->groups->cpu_power = power; |
5922 | #endif | 6490 | #endif |
5923 | } | 6491 | } |
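To make the cpu_power arithmetic above concrete: assuming SCHED_LOAD_SCALE is 128 (1 << 7, its usual value in this series) and a group spanning two CPUs, the default formula yields 128 + 1*128/10 = 140, while the power-savings variant yields 128*2 = 256, i.e. the group advertises enough capacity for two full tasks so the balancer packs load into it before spreading out. A throwaway check of those numbers:

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 128UL     /* assumed here: 1 << 7 */

    int main(void)
    {
        unsigned long weight = 2;      /* CPUs in the group */
        unsigned long dflt = SCHED_LOAD_SCALE +
                             (weight - 1) * SCHED_LOAD_SCALE / 10;
        unsigned long save = SCHED_LOAD_SCALE * weight;

        printf("default %lu, power-savings %lu\n", dflt, save);  /* 140, 256 */
        return 0;
    }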
@@ -5945,13 +6513,20 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5945 | * Tune cache-hot values: | 6513 | * Tune cache-hot values: |
5946 | */ | 6514 | */ |
5947 | calibrate_migration_costs(cpu_map); | 6515 | calibrate_migration_costs(cpu_map); |
6516 | |||
6517 | return 0; | ||
6518 | |||
6519 | error: | ||
6520 | free_sched_groups(cpu_map); | ||
6521 | return -ENOMEM; | ||
5948 | } | 6522 | } |
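With this hunk build_sched_domains() stops limping along after a failed allocation: every kmalloc/kzalloc failure now jumps to the single error: label, free_sched_groups() tears down whatever was already built, and -ENOMEM propagates out through arch_init_sched_domains() and partition_sched_domains(). The general shape of that single-exit cleanup idiom, reduced to plain C with made-up names:

    #include <stdlib.h>
    #include <errno.h>

    struct ctx { void *a, *b; };       /* hypothetical example, not kernel code */

    static int ctx_setup(struct ctx *c)
    {
        c->a = NULL;
        c->b = NULL;

        c->a = malloc(64);
        if (!c->a)
            goto error;

        c->b = malloc(64);
        if (!c->b)
            goto error;

        return 0;

    error:
        /* one cleanup path copes with every partially-built state */
        free(c->b);                    /* free(NULL) is a no-op */
        free(c->a);
        return -ENOMEM;
    }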
5949 | /* | 6523 | /* |
5950 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6524 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
5951 | */ | 6525 | */ |
5952 | static void arch_init_sched_domains(const cpumask_t *cpu_map) | 6526 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
5953 | { | 6527 | { |
5954 | cpumask_t cpu_default_map; | 6528 | cpumask_t cpu_default_map; |
6529 | int err; | ||
5955 | 6530 | ||
5956 | /* | 6531 | /* |
5957 | * Setup mask for cpus without special case scheduling requirements. | 6532 | * Setup mask for cpus without special case scheduling requirements. |
@@ -5960,51 +6535,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map) | |||
5960 | */ | 6535 | */ |
5961 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | 6536 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); |
5962 | 6537 | ||
5963 | build_sched_domains(&cpu_default_map); | 6538 | err = build_sched_domains(&cpu_default_map); |
6539 | |||
6540 | return err; | ||
5964 | } | 6541 | } |
5965 | 6542 | ||
5966 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6543 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
5967 | { | 6544 | { |
5968 | #ifdef CONFIG_NUMA | 6545 | free_sched_groups(cpu_map); |
5969 | int i; | ||
5970 | int cpu; | ||
5971 | |||
5972 | for_each_cpu_mask(cpu, *cpu_map) { | ||
5973 | struct sched_group *sched_group_allnodes | ||
5974 | = sched_group_allnodes_bycpu[cpu]; | ||
5975 | struct sched_group **sched_group_nodes | ||
5976 | = sched_group_nodes_bycpu[cpu]; | ||
5977 | |||
5978 | if (sched_group_allnodes) { | ||
5979 | kfree(sched_group_allnodes); | ||
5980 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
5981 | } | ||
5982 | |||
5983 | if (!sched_group_nodes) | ||
5984 | continue; | ||
5985 | |||
5986 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
5987 | cpumask_t nodemask = node_to_cpumask(i); | ||
5988 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
5989 | |||
5990 | cpus_and(nodemask, nodemask, *cpu_map); | ||
5991 | if (cpus_empty(nodemask)) | ||
5992 | continue; | ||
5993 | |||
5994 | if (sg == NULL) | ||
5995 | continue; | ||
5996 | sg = sg->next; | ||
5997 | next_sg: | ||
5998 | oldsg = sg; | ||
5999 | sg = sg->next; | ||
6000 | kfree(oldsg); | ||
6001 | if (oldsg != sched_group_nodes[i]) | ||
6002 | goto next_sg; | ||
6003 | } | ||
6004 | kfree(sched_group_nodes); | ||
6005 | sched_group_nodes_bycpu[cpu] = NULL; | ||
6006 | } | ||
6007 | #endif | ||
6008 | } | 6546 | } |
6009 | 6547 | ||
6010 | /* | 6548 | /* |
@@ -6029,9 +6567,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6029 | * correct sched domains | 6567 | * correct sched domains |
6030 | * Call with hotplug lock held | 6568 | * Call with hotplug lock held |
6031 | */ | 6569 | */ |
6032 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | 6570 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) |
6033 | { | 6571 | { |
6034 | cpumask_t change_map; | 6572 | cpumask_t change_map; |
6573 | int err = 0; | ||
6035 | 6574 | ||
6036 | cpus_and(*partition1, *partition1, cpu_online_map); | 6575 | cpus_and(*partition1, *partition1, cpu_online_map); |
6037 | cpus_and(*partition2, *partition2, cpu_online_map); | 6576 | cpus_and(*partition2, *partition2, cpu_online_map); |
@@ -6040,11 +6579,90 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
6040 | /* Detach sched domains from all of the affected cpus */ | 6579 | /* Detach sched domains from all of the affected cpus */ |
6041 | detach_destroy_domains(&change_map); | 6580 | detach_destroy_domains(&change_map); |
6042 | if (!cpus_empty(*partition1)) | 6581 | if (!cpus_empty(*partition1)) |
6043 | build_sched_domains(partition1); | 6582 | err = build_sched_domains(partition1); |
6044 | if (!cpus_empty(*partition2)) | 6583 | if (!err && !cpus_empty(*partition2)) |
6045 | build_sched_domains(partition2); | 6584 | err = build_sched_domains(partition2); |
6585 | |||
6586 | return err; | ||
6046 | } | 6587 | } |
6047 | 6588 | ||
6589 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6590 | int arch_reinit_sched_domains(void) | ||
6591 | { | ||
6592 | int err; | ||
6593 | |||
6594 | lock_cpu_hotplug(); | ||
6595 | detach_destroy_domains(&cpu_online_map); | ||
6596 | err = arch_init_sched_domains(&cpu_online_map); | ||
6597 | unlock_cpu_hotplug(); | ||
6598 | |||
6599 | return err; | ||
6600 | } | ||
6601 | |||
6602 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6603 | { | ||
6604 | int ret; | ||
6605 | |||
6606 | if (buf[0] != '0' && buf[0] != '1') | ||
6607 | return -EINVAL; | ||
6608 | |||
6609 | if (smt) | ||
6610 | sched_smt_power_savings = (buf[0] == '1'); | ||
6611 | else | ||
6612 | sched_mc_power_savings = (buf[0] == '1'); | ||
6613 | |||
6614 | ret = arch_reinit_sched_domains(); | ||
6615 | |||
6616 | return ret ? ret : count; | ||
6617 | } | ||
6618 | |||
6619 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | ||
6620 | { | ||
6621 | int err = 0; | ||
6622 | |||
6623 | #ifdef CONFIG_SCHED_SMT | ||
6624 | if (smt_capable()) | ||
6625 | err = sysfs_create_file(&cls->kset.kobj, | ||
6626 | &attr_sched_smt_power_savings.attr); | ||
6627 | #endif | ||
6628 | #ifdef CONFIG_SCHED_MC | ||
6629 | if (!err && mc_capable()) | ||
6630 | err = sysfs_create_file(&cls->kset.kobj, | ||
6631 | &attr_sched_mc_power_savings.attr); | ||
6632 | #endif | ||
6633 | return err; | ||
6634 | } | ||
6635 | #endif | ||
6636 | |||
6637 | #ifdef CONFIG_SCHED_MC | ||
6638 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | ||
6639 | { | ||
6640 | return sprintf(page, "%u\n", sched_mc_power_savings); | ||
6641 | } | ||
6642 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, | ||
6643 | const char *buf, size_t count) | ||
6644 | { | ||
6645 | return sched_power_savings_store(buf, count, 0); | ||
6646 | } | ||
6647 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | ||
6648 | sched_mc_power_savings_store); | ||
6649 | #endif | ||
6650 | |||
6651 | #ifdef CONFIG_SCHED_SMT | ||
6652 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | ||
6653 | { | ||
6654 | return sprintf(page, "%u\n", sched_smt_power_savings); | ||
6655 | } | ||
6656 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, | ||
6657 | const char *buf, size_t count) | ||
6658 | { | ||
6659 | return sched_power_savings_store(buf, count, 1); | ||
6660 | } | ||
6661 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | ||
6662 | sched_smt_power_savings_store); | ||
6663 | #endif | ||
6664 | |||
6665 | |||
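The two power-savings knobs above are plain sysdev attributes: a show() that prints the current value and a store() that accepts only '0' or '1' and then calls arch_reinit_sched_domains() to tear down and rebuild the domain tree under the hotplug lock; once sched_create_sysfs_power_savings_entries() wires them up they normally appear under /sys/devices/system/cpu/. A stripped-down sketch of the same attribute pattern for a hypothetical tunable (my_tunable and its handlers are invented; SYSDEV_ATTR and the handler signatures are the ones used in this hunk):

    #include <linux/kernel.h>
    #include <linux/errno.h>
    #include <linux/sysdev.h>

    static unsigned int my_tunable;

    static ssize_t my_tunable_show(struct sys_device *dev, char *page)
    {
        return sprintf(page, "%u\n", my_tunable);
    }

    static ssize_t my_tunable_store(struct sys_device *dev,
                                    const char *buf, size_t count)
    {
        if (buf[0] != '0' && buf[0] != '1')
            return -EINVAL;
        my_tunable = (buf[0] == '1');
        /* a real handler would apply the new value here */
        return count;
    }
    SYSDEV_ATTR(my_tunable, 0644, my_tunable_show, my_tunable_store);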
6048 | #ifdef CONFIG_HOTPLUG_CPU | 6666 | #ifdef CONFIG_HOTPLUG_CPU |
6049 | /* | 6667 | /* |
6050 | * Force a reinitialization of the sched domains hierarchy. The domains | 6668 | * Force a reinitialization of the sched domains hierarchy. The domains |
@@ -6098,6 +6716,7 @@ int in_sched_functions(unsigned long addr) | |||
6098 | { | 6716 | { |
6099 | /* Linker adds these: start and end of __sched functions */ | 6717 | /* Linker adds these: start and end of __sched functions */ |
6100 | extern char __sched_text_start[], __sched_text_end[]; | 6718 | extern char __sched_text_start[], __sched_text_end[]; |
6719 | |||
6101 | return in_lock_functions(addr) || | 6720 | return in_lock_functions(addr) || |
6102 | (addr >= (unsigned long)__sched_text_start | 6721 | (addr >= (unsigned long)__sched_text_start |
6103 | && addr < (unsigned long)__sched_text_end); | 6722 | && addr < (unsigned long)__sched_text_end); |
@@ -6105,14 +6724,15 @@ int in_sched_functions(unsigned long addr) | |||
6105 | 6724 | ||
6106 | void __init sched_init(void) | 6725 | void __init sched_init(void) |
6107 | { | 6726 | { |
6108 | runqueue_t *rq; | ||
6109 | int i, j, k; | 6727 | int i, j, k; |
6110 | 6728 | ||
6111 | for_each_possible_cpu(i) { | 6729 | for_each_possible_cpu(i) { |
6112 | prio_array_t *array; | 6730 | struct prio_array *array; |
6731 | struct rq *rq; | ||
6113 | 6732 | ||
6114 | rq = cpu_rq(i); | 6733 | rq = cpu_rq(i); |
6115 | spin_lock_init(&rq->lock); | 6734 | spin_lock_init(&rq->lock); |
6735 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | ||
6116 | rq->nr_running = 0; | 6736 | rq->nr_running = 0; |
6117 | rq->active = rq->arrays; | 6737 | rq->active = rq->arrays; |
6118 | rq->expired = rq->arrays + 1; | 6738 | rq->expired = rq->arrays + 1; |
@@ -6126,7 +6746,6 @@ void __init sched_init(void) | |||
6126 | rq->push_cpu = 0; | 6746 | rq->push_cpu = 0; |
6127 | rq->migration_thread = NULL; | 6747 | rq->migration_thread = NULL; |
6128 | INIT_LIST_HEAD(&rq->migration_queue); | 6748 | INIT_LIST_HEAD(&rq->migration_queue); |
6129 | rq->cpu = i; | ||
6130 | #endif | 6749 | #endif |
6131 | atomic_set(&rq->nr_iowait, 0); | 6750 | atomic_set(&rq->nr_iowait, 0); |
6132 | 6751 | ||
@@ -6141,6 +6760,7 @@ void __init sched_init(void) | |||
6141 | } | 6760 | } |
6142 | } | 6761 | } |
6143 | 6762 | ||
6763 | set_load_weight(&init_task); | ||
6144 | /* | 6764 | /* |
6145 | * The boot idle thread does lazy MMU switching as well: | 6765 | * The boot idle thread does lazy MMU switching as well: |
6146 | */ | 6766 | */ |
@@ -6159,7 +6779,7 @@ void __init sched_init(void) | |||
6159 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6779 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
6160 | void __might_sleep(char *file, int line) | 6780 | void __might_sleep(char *file, int line) |
6161 | { | 6781 | { |
6162 | #if defined(in_atomic) | 6782 | #ifdef in_atomic |
6163 | static unsigned long prev_jiffy; /* ratelimiting */ | 6783 | static unsigned long prev_jiffy; /* ratelimiting */ |
6164 | 6784 | ||
6165 | if ((in_atomic() || irqs_disabled()) && | 6785 | if ((in_atomic() || irqs_disabled()) && |
@@ -6181,17 +6801,18 @@ EXPORT_SYMBOL(__might_sleep); | |||
6181 | #ifdef CONFIG_MAGIC_SYSRQ | 6801 | #ifdef CONFIG_MAGIC_SYSRQ |
6182 | void normalize_rt_tasks(void) | 6802 | void normalize_rt_tasks(void) |
6183 | { | 6803 | { |
6804 | struct prio_array *array; | ||
6184 | struct task_struct *p; | 6805 | struct task_struct *p; |
6185 | prio_array_t *array; | ||
6186 | unsigned long flags; | 6806 | unsigned long flags; |
6187 | runqueue_t *rq; | 6807 | struct rq *rq; |
6188 | 6808 | ||
6189 | read_lock_irq(&tasklist_lock); | 6809 | read_lock_irq(&tasklist_lock); |
6190 | for_each_process (p) { | 6810 | for_each_process(p) { |
6191 | if (!rt_task(p)) | 6811 | if (!rt_task(p)) |
6192 | continue; | 6812 | continue; |
6193 | 6813 | ||
6194 | rq = task_rq_lock(p, &flags); | 6814 | spin_lock_irqsave(&p->pi_lock, flags); |
6815 | rq = __task_rq_lock(p); | ||
6195 | 6816 | ||
6196 | array = p->array; | 6817 | array = p->array; |
6197 | if (array) | 6818 | if (array) |
@@ -6202,7 +6823,8 @@ void normalize_rt_tasks(void) | |||
6202 | resched_task(rq->curr); | 6823 | resched_task(rq->curr); |
6203 | } | 6824 | } |
6204 | 6825 | ||
6205 | task_rq_unlock(rq, &flags); | 6826 | __task_rq_unlock(rq); |
6827 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
6206 | } | 6828 | } |
6207 | read_unlock_irq(&tasklist_lock); | 6829 | read_unlock_irq(&tasklist_lock); |
6208 | } | 6830 | } |
@@ -6226,7 +6848,7 @@ void normalize_rt_tasks(void) | |||
6226 | * | 6848 | * |
6227 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6849 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
6228 | */ | 6850 | */ |
6229 | task_t *curr_task(int cpu) | 6851 | struct task_struct *curr_task(int cpu) |
6230 | { | 6852 | { |
6231 | return cpu_curr(cpu); | 6853 | return cpu_curr(cpu); |
6232 | } | 6854 | } |
@@ -6246,7 +6868,7 @@ task_t *curr_task(int cpu) | |||
6246 | * | 6868 | * |
6247 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6869 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
6248 | */ | 6870 | */ |
6249 | void set_curr_task(int cpu, task_t *p) | 6871 | void set_curr_task(int cpu, struct task_struct *p) |
6250 | { | 6872 | { |
6251 | cpu_curr(cpu) = p; | 6873 | cpu_curr(cpu) = p; |
6252 | } | 6874 | } |
diff --git a/kernel/signal.c b/kernel/signal.c index e5f8aea78ffe..7fe874d12fae 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -10,7 +10,6 @@ | |||
10 | * to allow signals to be sent reliably. | 10 | * to allow signals to be sent reliably. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/config.h> | ||
14 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
15 | #include <linux/module.h> | 14 | #include <linux/module.h> |
16 | #include <linux/smp_lock.h> | 15 | #include <linux/smp_lock.h> |
@@ -23,12 +22,12 @@ | |||
23 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
24 | #include <linux/ptrace.h> | 23 | #include <linux/ptrace.h> |
25 | #include <linux/signal.h> | 24 | #include <linux/signal.h> |
26 | #include <linux/audit.h> | ||
27 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
28 | #include <asm/param.h> | 26 | #include <asm/param.h> |
29 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
30 | #include <asm/unistd.h> | 28 | #include <asm/unistd.h> |
31 | #include <asm/siginfo.h> | 29 | #include <asm/siginfo.h> |
30 | #include "audit.h" /* audit_signal_info() */ | ||
32 | 31 | ||
33 | /* | 32 | /* |
34 | * SLAB caches for signal bits. | 33 | * SLAB caches for signal bits. |
@@ -584,7 +583,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
584 | && !capable(CAP_KILL)) | 583 | && !capable(CAP_KILL)) |
585 | return error; | 584 | return error; |
586 | 585 | ||
587 | error = security_task_kill(t, info, sig); | 586 | error = security_task_kill(t, info, sig, 0); |
588 | if (!error) | 587 | if (!error) |
589 | audit_signal_info(sig, t); /* Let audit system see the signal */ | 588 | audit_signal_info(sig, t); /* Let audit system see the signal */ |
590 | return error; | 589 | return error; |
@@ -1107,7 +1106,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) | |||
1107 | 1106 | ||
1108 | /* like kill_proc_info(), but doesn't use uid/euid of "current" */ | 1107 | /* like kill_proc_info(), but doesn't use uid/euid of "current" */ |
1109 | int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | 1108 | int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, |
1110 | uid_t uid, uid_t euid) | 1109 | uid_t uid, uid_t euid, u32 secid) |
1111 | { | 1110 | { |
1112 | int ret = -EINVAL; | 1111 | int ret = -EINVAL; |
1113 | struct task_struct *p; | 1112 | struct task_struct *p; |
@@ -1127,6 +1126,9 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | |||
1127 | ret = -EPERM; | 1126 | ret = -EPERM; |
1128 | goto out_unlock; | 1127 | goto out_unlock; |
1129 | } | 1128 | } |
1129 | ret = security_task_kill(p, info, sig, secid); | ||
1130 | if (ret) | ||
1131 | goto out_unlock; | ||
1130 | if (sig && p->sighand) { | 1132 | if (sig && p->sighand) { |
1131 | unsigned long flags; | 1133 | unsigned long flags; |
1132 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1134 | spin_lock_irqsave(&p->sighand->siglock, flags); |
@@ -1531,6 +1533,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) | |||
1531 | spin_unlock_irqrestore(&sighand->siglock, flags); | 1533 | spin_unlock_irqrestore(&sighand->siglock, flags); |
1532 | } | 1534 | } |
1533 | 1535 | ||
1536 | static inline int may_ptrace_stop(void) | ||
1537 | { | ||
1538 | if (!likely(current->ptrace & PT_PTRACED)) | ||
1539 | return 0; | ||
1540 | |||
1541 | if (unlikely(current->parent == current->real_parent && | ||
1542 | (current->ptrace & PT_ATTACHED))) | ||
1543 | return 0; | ||
1544 | |||
1545 | if (unlikely(current->signal == current->parent->signal) && | ||
1546 | unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) | ||
1547 | return 0; | ||
1548 | |||
1549 | /* | ||
1550 | * Are we in the middle of do_coredump? | ||
1551 | * If so and our tracer is also part of the coredump stopping | ||
1552 | * is a deadlock situation, and pointless because our tracer | ||
1553 | * is dead so don't allow us to stop. | ||
1554 | * If SIGKILL was already sent before the caller unlocked | ||
1555 | * ->siglock we must see ->core_waiters != 0. Otherwise it | ||
1556 | * is safe to enter schedule(). | ||
1557 | */ | ||
1558 | if (unlikely(current->mm->core_waiters) && | ||
1559 | unlikely(current->mm == current->parent->mm)) | ||
1560 | return 0; | ||
1561 | |||
1562 | return 1; | ||
1563 | } | ||
1564 | |||
1534 | /* | 1565 | /* |
1535 | * This must be called with current->sighand->siglock held. | 1566 | * This must be called with current->sighand->siglock held. |
1536 | * | 1567 | * |
@@ -1559,11 +1590,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) | |||
1559 | spin_unlock_irq(¤t->sighand->siglock); | 1590 | spin_unlock_irq(¤t->sighand->siglock); |
1560 | try_to_freeze(); | 1591 | try_to_freeze(); |
1561 | read_lock(&tasklist_lock); | 1592 | read_lock(&tasklist_lock); |
1562 | if (likely(current->ptrace & PT_PTRACED) && | 1593 | if (may_ptrace_stop()) { |
1563 | likely(current->parent != current->real_parent || | ||
1564 | !(current->ptrace & PT_ATTACHED)) && | ||
1565 | (likely(current->parent->signal != current->signal) || | ||
1566 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { | ||
1567 | do_notify_parent_cldstop(current, CLD_TRAPPED); | 1594 | do_notify_parent_cldstop(current, CLD_TRAPPED); |
1568 | read_unlock(&tasklist_lock); | 1595 | read_unlock(&tasklist_lock); |
1569 | schedule(); | 1596 | schedule(); |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 336f92d64e2e..0f08a84ae307 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -62,6 +62,119 @@ static inline void wakeup_softirqd(void) | |||
62 | } | 62 | } |
63 | 63 | ||
64 | /* | 64 | /* |
65 | * This one is for softirq.c-internal use, | ||
66 | * where hardirqs are disabled legitimately: | ||
67 | */ | ||
68 | static void __local_bh_disable(unsigned long ip) | ||
69 | { | ||
70 | unsigned long flags; | ||
71 | |||
72 | WARN_ON_ONCE(in_irq()); | ||
73 | |||
74 | raw_local_irq_save(flags); | ||
75 | add_preempt_count(SOFTIRQ_OFFSET); | ||
76 | /* | ||
77 | * Were softirqs turned off above: | ||
78 | */ | ||
79 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
80 | trace_softirqs_off(ip); | ||
81 | raw_local_irq_restore(flags); | ||
82 | } | ||
83 | |||
84 | void local_bh_disable(void) | ||
85 | { | ||
86 | __local_bh_disable((unsigned long)__builtin_return_address(0)); | ||
87 | } | ||
88 | |||
89 | EXPORT_SYMBOL(local_bh_disable); | ||
90 | |||
91 | void __local_bh_enable(void) | ||
92 | { | ||
93 | WARN_ON_ONCE(in_irq()); | ||
94 | |||
95 | /* | ||
96 | * softirqs should never be enabled by __local_bh_enable(), | ||
97 | * it always nests inside local_bh_enable() sections: | ||
98 | */ | ||
99 | WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); | ||
100 | |||
101 | sub_preempt_count(SOFTIRQ_OFFSET); | ||
102 | } | ||
103 | EXPORT_SYMBOL_GPL(__local_bh_enable); | ||
104 | |||
105 | /* | ||
106 | * Special-case - softirqs can safely be enabled in | ||
107 | * cond_resched_softirq(), or by __do_softirq(), | ||
108 | * without processing still-pending softirqs: | ||
109 | */ | ||
110 | void _local_bh_enable(void) | ||
111 | { | ||
112 | WARN_ON_ONCE(in_irq()); | ||
113 | WARN_ON_ONCE(!irqs_disabled()); | ||
114 | |||
115 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
116 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | ||
117 | sub_preempt_count(SOFTIRQ_OFFSET); | ||
118 | } | ||
119 | |||
120 | EXPORT_SYMBOL(_local_bh_enable); | ||
121 | |||
122 | void local_bh_enable(void) | ||
123 | { | ||
124 | unsigned long flags; | ||
125 | |||
126 | WARN_ON_ONCE(in_irq()); | ||
127 | WARN_ON_ONCE(irqs_disabled()); | ||
128 | |||
129 | local_irq_save(flags); | ||
130 | /* | ||
131 | * Are softirqs going to be turned on now: | ||
132 | */ | ||
133 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
134 | trace_softirqs_on((unsigned long)__builtin_return_address(0)); | ||
135 | /* | ||
136 | * Keep preemption disabled until we are done with | ||
137 | * softirq processing: | ||
138 | */ | ||
139 | sub_preempt_count(SOFTIRQ_OFFSET - 1); | ||
140 | |||
141 | if (unlikely(!in_interrupt() && local_softirq_pending())) | ||
142 | do_softirq(); | ||
143 | |||
144 | dec_preempt_count(); | ||
145 | local_irq_restore(flags); | ||
146 | preempt_check_resched(); | ||
147 | } | ||
148 | EXPORT_SYMBOL(local_bh_enable); | ||
149 | |||
150 | void local_bh_enable_ip(unsigned long ip) | ||
151 | { | ||
152 | unsigned long flags; | ||
153 | |||
154 | WARN_ON_ONCE(in_irq()); | ||
155 | |||
156 | local_irq_save(flags); | ||
157 | /* | ||
158 | * Are softirqs going to be turned on now: | ||
159 | */ | ||
160 | if (softirq_count() == SOFTIRQ_OFFSET) | ||
161 | trace_softirqs_on(ip); | ||
162 | /* | ||
163 | * Keep preemption disabled until we are done with | ||
164 | * softirq processing: | ||
165 | */ | ||
166 | sub_preempt_count(SOFTIRQ_OFFSET - 1); | ||
167 | |||
168 | if (unlikely(!in_interrupt() && local_softirq_pending())) | ||
169 | do_softirq(); | ||
170 | |||
171 | dec_preempt_count(); | ||
172 | local_irq_restore(flags); | ||
173 | preempt_check_resched(); | ||
174 | } | ||
175 | EXPORT_SYMBOL(local_bh_enable_ip); | ||
176 | |||
177 | /* | ||
65 | * We restart softirq processing MAX_SOFTIRQ_RESTART times, | 178 | * We restart softirq processing MAX_SOFTIRQ_RESTART times, |
66 | * and we fall back to softirqd after that. | 179 | * and we fall back to softirqd after that. |
67 | * | 180 | * |
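The reworked local_bh_enable()/local_bh_enable_ip() above turn re-enabling bottom halves into two steps: sub_preempt_count(SOFTIRQ_OFFSET - 1) drops the softirq-disable part of the preempt count but deliberately leaves one preemption count, any pending softirqs are then run with preemption still off, and only the final dec_preempt_count() releases that last count before preempt_check_resched(). A toy model of the bookkeeping with plain integers (the constants are assumptions for illustration; in the kernel SOFTIRQ_OFFSET is 1 << SOFTIRQ_SHIFT):

    #include <assert.h>

    #define PREEMPT_ONE    1      /* one unit of the preemption field (assumed) */
    #define SOFTIRQ_OFFSET 256    /* 1 << 8, softirq-disable field (assumed)    */

    int main(void)
    {
        int preempt_count = 0;

        preempt_count += SOFTIRQ_OFFSET;              /* local_bh_disable()  */

        /* local_bh_enable(), step 1: clear the softirq-disable part but
         * keep one preemption count so pending softirqs run atomically */
        preempt_count -= SOFTIRQ_OFFSET - PREEMPT_ONE;
        assert(preempt_count == PREEMPT_ONE);

        /* ... do_softirq() would run here, preemption still disabled ... */

        preempt_count -= PREEMPT_ONE;                 /* dec_preempt_count() */
        assert(preempt_count == 0);
        return 0;
    }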
@@ -80,8 +193,11 @@ asmlinkage void __do_softirq(void) | |||
80 | int cpu; | 193 | int cpu; |
81 | 194 | ||
82 | pending = local_softirq_pending(); | 195 | pending = local_softirq_pending(); |
196 | account_system_vtime(current); | ||
197 | |||
198 | __local_bh_disable((unsigned long)__builtin_return_address(0)); | ||
199 | trace_softirq_enter(); | ||
83 | 200 | ||
84 | local_bh_disable(); | ||
85 | cpu = smp_processor_id(); | 201 | cpu = smp_processor_id(); |
86 | restart: | 202 | restart: |
87 | /* Reset the pending bitmask before enabling irqs */ | 203 | /* Reset the pending bitmask before enabling irqs */ |
@@ -109,7 +225,10 @@ restart: | |||
109 | if (pending) | 225 | if (pending) |
110 | wakeup_softirqd(); | 226 | wakeup_softirqd(); |
111 | 227 | ||
112 | __local_bh_enable(); | 228 | trace_softirq_exit(); |
229 | |||
230 | account_system_vtime(current); | ||
231 | _local_bh_enable(); | ||
113 | } | 232 | } |
114 | 233 | ||
115 | #ifndef __ARCH_HAS_DO_SOFTIRQ | 234 | #ifndef __ARCH_HAS_DO_SOFTIRQ |
@@ -136,23 +255,6 @@ EXPORT_SYMBOL(do_softirq); | |||
136 | 255 | ||
137 | #endif | 256 | #endif |
138 | 257 | ||
139 | void local_bh_enable(void) | ||
140 | { | ||
141 | WARN_ON(irqs_disabled()); | ||
142 | /* | ||
143 | * Keep preemption disabled until we are done with | ||
144 | * softirq processing: | ||
145 | */ | ||
146 | sub_preempt_count(SOFTIRQ_OFFSET - 1); | ||
147 | |||
148 | if (unlikely(!in_interrupt() && local_softirq_pending())) | ||
149 | do_softirq(); | ||
150 | |||
151 | dec_preempt_count(); | ||
152 | preempt_check_resched(); | ||
153 | } | ||
154 | EXPORT_SYMBOL(local_bh_enable); | ||
155 | |||
156 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED | 258 | #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED |
157 | # define invoke_softirq() __do_softirq() | 259 | # define invoke_softirq() __do_softirq() |
158 | #else | 260 | #else |
@@ -165,6 +267,7 @@ EXPORT_SYMBOL(local_bh_enable); | |||
165 | void irq_exit(void) | 267 | void irq_exit(void) |
166 | { | 268 | { |
167 | account_system_vtime(current); | 269 | account_system_vtime(current); |
270 | trace_hardirq_exit(); | ||
168 | sub_preempt_count(IRQ_EXIT_OFFSET); | 271 | sub_preempt_count(IRQ_EXIT_OFFSET); |
169 | if (!in_interrupt() && local_softirq_pending()) | 272 | if (!in_interrupt() && local_softirq_pending()) |
170 | invoke_softirq(); | 273 | invoke_softirq(); |
@@ -208,8 +311,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) | |||
208 | softirq_vec[nr].action = action; | 311 | softirq_vec[nr].action = action; |
209 | } | 312 | } |
210 | 313 | ||
211 | EXPORT_SYMBOL(open_softirq); | ||
212 | |||
213 | /* Tasklets */ | 314 | /* Tasklets */ |
214 | struct tasklet_head | 315 | struct tasklet_head |
215 | { | 316 | { |
@@ -446,7 +547,7 @@ static void takeover_tasklets(unsigned int cpu) | |||
446 | } | 547 | } |
447 | #endif /* CONFIG_HOTPLUG_CPU */ | 548 | #endif /* CONFIG_HOTPLUG_CPU */ |
448 | 549 | ||
449 | static int cpu_callback(struct notifier_block *nfb, | 550 | static int __devinit cpu_callback(struct notifier_block *nfb, |
450 | unsigned long action, | 551 | unsigned long action, |
451 | void *hcpu) | 552 | void *hcpu) |
452 | { | 553 | { |
@@ -470,6 +571,8 @@ static int cpu_callback(struct notifier_block *nfb, | |||
470 | break; | 571 | break; |
471 | #ifdef CONFIG_HOTPLUG_CPU | 572 | #ifdef CONFIG_HOTPLUG_CPU |
472 | case CPU_UP_CANCELED: | 573 | case CPU_UP_CANCELED: |
574 | if (!per_cpu(ksoftirqd, hotcpu)) | ||
575 | break; | ||
473 | /* Unbind so it can run. Fall thru. */ | 576 | /* Unbind so it can run. Fall thru. */ |
474 | kthread_bind(per_cpu(ksoftirqd, hotcpu), | 577 | kthread_bind(per_cpu(ksoftirqd, hotcpu), |
475 | any_online_cpu(cpu_online_map)); | 578 | any_online_cpu(cpu_online_map)); |
@@ -484,7 +587,7 @@ static int cpu_callback(struct notifier_block *nfb, | |||
484 | return NOTIFY_OK; | 587 | return NOTIFY_OK; |
485 | } | 588 | } |
486 | 589 | ||
487 | static struct notifier_block cpu_nfb = { | 590 | static struct notifier_block __devinitdata cpu_nfb = { |
488 | .notifier_call = cpu_callback | 591 | .notifier_call = cpu_callback |
489 | }; | 592 | }; |
490 | 593 | ||
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 14c7faf02909..6b76caa22981 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = { | |||
36 | 36 | ||
37 | void touch_softlockup_watchdog(void) | 37 | void touch_softlockup_watchdog(void) |
38 | { | 38 | { |
39 | per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; | 39 | __raw_get_cpu_var(touch_timestamp) = jiffies; |
40 | } | 40 | } |
41 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 41 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
42 | 42 | ||
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) | |||
104 | /* | 104 | /* |
105 | * Create/destroy watchdog threads as CPUs come and go: | 105 | * Create/destroy watchdog threads as CPUs come and go: |
106 | */ | 106 | */ |
107 | static int | 107 | static int __devinit |
108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
109 | { | 109 | { |
110 | int hotcpu = (unsigned long)hcpu; | 110 | int hotcpu = (unsigned long)hcpu; |
@@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
127 | break; | 127 | break; |
128 | #ifdef CONFIG_HOTPLUG_CPU | 128 | #ifdef CONFIG_HOTPLUG_CPU |
129 | case CPU_UP_CANCELED: | 129 | case CPU_UP_CANCELED: |
130 | if (!per_cpu(watchdog_task, hotcpu)) | ||
131 | break; | ||
130 | /* Unbind so it can run. Fall thru. */ | 132 | /* Unbind so it can run. Fall thru. */ |
131 | kthread_bind(per_cpu(watchdog_task, hotcpu), | 133 | kthread_bind(per_cpu(watchdog_task, hotcpu), |
132 | any_online_cpu(cpu_online_map)); | 134 | any_online_cpu(cpu_online_map)); |
@@ -140,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
140 | return NOTIFY_OK; | 142 | return NOTIFY_OK; |
141 | } | 143 | } |
142 | 144 | ||
143 | static struct notifier_block cpu_nfb = { | 145 | static struct notifier_block __devinitdata cpu_nfb = { |
144 | .notifier_call = cpu_callback | 146 | .notifier_call = cpu_callback |
145 | }; | 147 | }; |
146 | 148 | ||
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index d1b810782bc4..bfd6ad9c0330 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -9,11 +9,11 @@ | |||
9 | * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) | 9 | * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/config.h> | ||
13 | #include <linux/linkage.h> | 12 | #include <linux/linkage.h> |
14 | #include <linux/preempt.h> | 13 | #include <linux/preempt.h> |
15 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
16 | #include <linux/interrupt.h> | 15 | #include <linux/interrupt.h> |
16 | #include <linux/debug_locks.h> | ||
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | 18 | ||
19 | /* | 19 | /* |
@@ -30,8 +30,10 @@ EXPORT_SYMBOL(generic__raw_read_trylock); | |||
30 | int __lockfunc _spin_trylock(spinlock_t *lock) | 30 | int __lockfunc _spin_trylock(spinlock_t *lock) |
31 | { | 31 | { |
32 | preempt_disable(); | 32 | preempt_disable(); |
33 | if (_raw_spin_trylock(lock)) | 33 | if (_raw_spin_trylock(lock)) { |
34 | spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); | ||
34 | return 1; | 35 | return 1; |
36 | } | ||
35 | 37 | ||
36 | preempt_enable(); | 38 | preempt_enable(); |
37 | return 0; | 39 | return 0; |
@@ -41,8 +43,10 @@ EXPORT_SYMBOL(_spin_trylock); | |||
41 | int __lockfunc _read_trylock(rwlock_t *lock) | 43 | int __lockfunc _read_trylock(rwlock_t *lock) |
42 | { | 44 | { |
43 | preempt_disable(); | 45 | preempt_disable(); |
44 | if (_raw_read_trylock(lock)) | 46 | if (_raw_read_trylock(lock)) { |
47 | rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_); | ||
45 | return 1; | 48 | return 1; |
49 | } | ||
46 | 50 | ||
47 | preempt_enable(); | 51 | preempt_enable(); |
48 | return 0; | 52 | return 0; |
@@ -52,19 +56,28 @@ EXPORT_SYMBOL(_read_trylock); | |||
52 | int __lockfunc _write_trylock(rwlock_t *lock) | 56 | int __lockfunc _write_trylock(rwlock_t *lock) |
53 | { | 57 | { |
54 | preempt_disable(); | 58 | preempt_disable(); |
55 | if (_raw_write_trylock(lock)) | 59 | if (_raw_write_trylock(lock)) { |
60 | rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_); | ||
56 | return 1; | 61 | return 1; |
62 | } | ||
57 | 63 | ||
58 | preempt_enable(); | 64 | preempt_enable(); |
59 | return 0; | 65 | return 0; |
60 | } | 66 | } |
61 | EXPORT_SYMBOL(_write_trylock); | 67 | EXPORT_SYMBOL(_write_trylock); |
62 | 68 | ||
63 | #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) | 69 | /* |
70 | * If lockdep is enabled then we use the non-preemption spin-ops | ||
71 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are | ||
72 | * not re-enabled during lock-acquire (which the preempt-spin-ops do): | ||
73 | */ | ||
74 | #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ | ||
75 | defined(CONFIG_PROVE_LOCKING) | ||
64 | 76 | ||
65 | void __lockfunc _read_lock(rwlock_t *lock) | 77 | void __lockfunc _read_lock(rwlock_t *lock) |
66 | { | 78 | { |
67 | preempt_disable(); | 79 | preempt_disable(); |
80 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | ||
68 | _raw_read_lock(lock); | 81 | _raw_read_lock(lock); |
69 | } | 82 | } |
70 | EXPORT_SYMBOL(_read_lock); | 83 | EXPORT_SYMBOL(_read_lock); |
@@ -75,7 +88,17 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) | |||
75 | 88 | ||
76 | local_irq_save(flags); | 89 | local_irq_save(flags); |
77 | preempt_disable(); | 90 | preempt_disable(); |
91 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
92 | /* | ||
93 | * On lockdep we don't want the hand-coded irq-enable of | ||
94 | * _raw_spin_lock_flags() code, because lockdep assumes | ||
95 | * that interrupts are not re-enabled during lock-acquire: | ||
96 | */ | ||
97 | #ifdef CONFIG_PROVE_LOCKING | ||
98 | _raw_spin_lock(lock); | ||
99 | #else | ||
78 | _raw_spin_lock_flags(lock, &flags); | 100 | _raw_spin_lock_flags(lock, &flags); |
101 | #endif | ||
79 | return flags; | 102 | return flags; |
80 | } | 103 | } |
81 | EXPORT_SYMBOL(_spin_lock_irqsave); | 104 | EXPORT_SYMBOL(_spin_lock_irqsave); |
@@ -84,6 +107,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock) | |||
84 | { | 107 | { |
85 | local_irq_disable(); | 108 | local_irq_disable(); |
86 | preempt_disable(); | 109 | preempt_disable(); |
110 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
87 | _raw_spin_lock(lock); | 111 | _raw_spin_lock(lock); |
88 | } | 112 | } |
89 | EXPORT_SYMBOL(_spin_lock_irq); | 113 | EXPORT_SYMBOL(_spin_lock_irq); |
@@ -92,6 +116,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock) | |||
92 | { | 116 | { |
93 | local_bh_disable(); | 117 | local_bh_disable(); |
94 | preempt_disable(); | 118 | preempt_disable(); |
119 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
95 | _raw_spin_lock(lock); | 120 | _raw_spin_lock(lock); |
96 | } | 121 | } |
97 | EXPORT_SYMBOL(_spin_lock_bh); | 122 | EXPORT_SYMBOL(_spin_lock_bh); |
@@ -102,6 +127,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) | |||
102 | 127 | ||
103 | local_irq_save(flags); | 128 | local_irq_save(flags); |
104 | preempt_disable(); | 129 | preempt_disable(); |
130 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | ||
105 | _raw_read_lock(lock); | 131 | _raw_read_lock(lock); |
106 | return flags; | 132 | return flags; |
107 | } | 133 | } |
@@ -111,6 +137,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock) | |||
111 | { | 137 | { |
112 | local_irq_disable(); | 138 | local_irq_disable(); |
113 | preempt_disable(); | 139 | preempt_disable(); |
140 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | ||
114 | _raw_read_lock(lock); | 141 | _raw_read_lock(lock); |
115 | } | 142 | } |
116 | EXPORT_SYMBOL(_read_lock_irq); | 143 | EXPORT_SYMBOL(_read_lock_irq); |
@@ -119,6 +146,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock) | |||
119 | { | 146 | { |
120 | local_bh_disable(); | 147 | local_bh_disable(); |
121 | preempt_disable(); | 148 | preempt_disable(); |
149 | rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); | ||
122 | _raw_read_lock(lock); | 150 | _raw_read_lock(lock); |
123 | } | 151 | } |
124 | EXPORT_SYMBOL(_read_lock_bh); | 152 | EXPORT_SYMBOL(_read_lock_bh); |
@@ -129,6 +157,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) | |||
129 | 157 | ||
130 | local_irq_save(flags); | 158 | local_irq_save(flags); |
131 | preempt_disable(); | 159 | preempt_disable(); |
160 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
132 | _raw_write_lock(lock); | 161 | _raw_write_lock(lock); |
133 | return flags; | 162 | return flags; |
134 | } | 163 | } |
@@ -138,6 +167,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock) | |||
138 | { | 167 | { |
139 | local_irq_disable(); | 168 | local_irq_disable(); |
140 | preempt_disable(); | 169 | preempt_disable(); |
170 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
141 | _raw_write_lock(lock); | 171 | _raw_write_lock(lock); |
142 | } | 172 | } |
143 | EXPORT_SYMBOL(_write_lock_irq); | 173 | EXPORT_SYMBOL(_write_lock_irq); |
@@ -146,6 +176,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock) | |||
146 | { | 176 | { |
147 | local_bh_disable(); | 177 | local_bh_disable(); |
148 | preempt_disable(); | 178 | preempt_disable(); |
179 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
149 | _raw_write_lock(lock); | 180 | _raw_write_lock(lock); |
150 | } | 181 | } |
151 | EXPORT_SYMBOL(_write_lock_bh); | 182 | EXPORT_SYMBOL(_write_lock_bh); |
@@ -153,6 +184,7 @@ EXPORT_SYMBOL(_write_lock_bh); | |||
153 | void __lockfunc _spin_lock(spinlock_t *lock) | 184 | void __lockfunc _spin_lock(spinlock_t *lock) |
154 | { | 185 | { |
155 | preempt_disable(); | 186 | preempt_disable(); |
187 | spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
156 | _raw_spin_lock(lock); | 188 | _raw_spin_lock(lock); |
157 | } | 189 | } |
158 | 190 | ||
@@ -161,6 +193,7 @@ EXPORT_SYMBOL(_spin_lock); | |||
161 | void __lockfunc _write_lock(rwlock_t *lock) | 193 | void __lockfunc _write_lock(rwlock_t *lock) |
162 | { | 194 | { |
163 | preempt_disable(); | 195 | preempt_disable(); |
196 | rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); | ||
164 | _raw_write_lock(lock); | 197 | _raw_write_lock(lock); |
165 | } | 198 | } |
166 | 199 | ||
@@ -256,8 +289,22 @@ BUILD_LOCK_OPS(write, rwlock); | |||
256 | 289 | ||
257 | #endif /* CONFIG_PREEMPT */ | 290 | #endif /* CONFIG_PREEMPT */ |
258 | 291 | ||
292 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
293 | |||
294 | void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) | ||
295 | { | ||
296 | preempt_disable(); | ||
297 | spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | ||
298 | _raw_spin_lock(lock); | ||
299 | } | ||
300 | |||
301 | EXPORT_SYMBOL(_spin_lock_nested); | ||
302 | |||
303 | #endif | ||
304 | |||
259 | void __lockfunc _spin_unlock(spinlock_t *lock) | 305 | void __lockfunc _spin_unlock(spinlock_t *lock) |
260 | { | 306 | { |
307 | spin_release(&lock->dep_map, 1, _RET_IP_); | ||
261 | _raw_spin_unlock(lock); | 308 | _raw_spin_unlock(lock); |
262 | preempt_enable(); | 309 | preempt_enable(); |
263 | } | 310 | } |
@@ -265,6 +312,7 @@ EXPORT_SYMBOL(_spin_unlock); | |||
265 | 312 | ||
266 | void __lockfunc _write_unlock(rwlock_t *lock) | 313 | void __lockfunc _write_unlock(rwlock_t *lock) |
267 | { | 314 | { |
315 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
268 | _raw_write_unlock(lock); | 316 | _raw_write_unlock(lock); |
269 | preempt_enable(); | 317 | preempt_enable(); |
270 | } | 318 | } |
@@ -272,6 +320,7 @@ EXPORT_SYMBOL(_write_unlock); | |||
272 | 320 | ||
273 | void __lockfunc _read_unlock(rwlock_t *lock) | 321 | void __lockfunc _read_unlock(rwlock_t *lock) |
274 | { | 322 | { |
323 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
275 | _raw_read_unlock(lock); | 324 | _raw_read_unlock(lock); |
276 | preempt_enable(); | 325 | preempt_enable(); |
277 | } | 326 | } |
@@ -279,6 +328,7 @@ EXPORT_SYMBOL(_read_unlock); | |||
279 | 328 | ||
280 | void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) | 329 | void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) |
281 | { | 330 | { |
331 | spin_release(&lock->dep_map, 1, _RET_IP_); | ||
282 | _raw_spin_unlock(lock); | 332 | _raw_spin_unlock(lock); |
283 | local_irq_restore(flags); | 333 | local_irq_restore(flags); |
284 | preempt_enable(); | 334 | preempt_enable(); |
@@ -287,6 +337,7 @@ EXPORT_SYMBOL(_spin_unlock_irqrestore); | |||
287 | 337 | ||
288 | void __lockfunc _spin_unlock_irq(spinlock_t *lock) | 338 | void __lockfunc _spin_unlock_irq(spinlock_t *lock) |
289 | { | 339 | { |
340 | spin_release(&lock->dep_map, 1, _RET_IP_); | ||
290 | _raw_spin_unlock(lock); | 341 | _raw_spin_unlock(lock); |
291 | local_irq_enable(); | 342 | local_irq_enable(); |
292 | preempt_enable(); | 343 | preempt_enable(); |
@@ -295,14 +346,16 @@ EXPORT_SYMBOL(_spin_unlock_irq); | |||
295 | 346 | ||
296 | void __lockfunc _spin_unlock_bh(spinlock_t *lock) | 347 | void __lockfunc _spin_unlock_bh(spinlock_t *lock) |
297 | { | 348 | { |
349 | spin_release(&lock->dep_map, 1, _RET_IP_); | ||
298 | _raw_spin_unlock(lock); | 350 | _raw_spin_unlock(lock); |
299 | preempt_enable_no_resched(); | 351 | preempt_enable_no_resched(); |
300 | local_bh_enable(); | 352 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
301 | } | 353 | } |
302 | EXPORT_SYMBOL(_spin_unlock_bh); | 354 | EXPORT_SYMBOL(_spin_unlock_bh); |
303 | 355 | ||
304 | void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) | 356 | void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) |
305 | { | 357 | { |
358 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
306 | _raw_read_unlock(lock); | 359 | _raw_read_unlock(lock); |
307 | local_irq_restore(flags); | 360 | local_irq_restore(flags); |
308 | preempt_enable(); | 361 | preempt_enable(); |
@@ -311,6 +364,7 @@ EXPORT_SYMBOL(_read_unlock_irqrestore); | |||
311 | 364 | ||
312 | void __lockfunc _read_unlock_irq(rwlock_t *lock) | 365 | void __lockfunc _read_unlock_irq(rwlock_t *lock) |
313 | { | 366 | { |
367 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
314 | _raw_read_unlock(lock); | 368 | _raw_read_unlock(lock); |
315 | local_irq_enable(); | 369 | local_irq_enable(); |
316 | preempt_enable(); | 370 | preempt_enable(); |
@@ -319,14 +373,16 @@ EXPORT_SYMBOL(_read_unlock_irq); | |||
319 | 373 | ||
320 | void __lockfunc _read_unlock_bh(rwlock_t *lock) | 374 | void __lockfunc _read_unlock_bh(rwlock_t *lock) |
321 | { | 375 | { |
376 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
322 | _raw_read_unlock(lock); | 377 | _raw_read_unlock(lock); |
323 | preempt_enable_no_resched(); | 378 | preempt_enable_no_resched(); |
324 | local_bh_enable(); | 379 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
325 | } | 380 | } |
326 | EXPORT_SYMBOL(_read_unlock_bh); | 381 | EXPORT_SYMBOL(_read_unlock_bh); |
327 | 382 | ||
328 | void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) | 383 | void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) |
329 | { | 384 | { |
385 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
330 | _raw_write_unlock(lock); | 386 | _raw_write_unlock(lock); |
331 | local_irq_restore(flags); | 387 | local_irq_restore(flags); |
332 | preempt_enable(); | 388 | preempt_enable(); |
@@ -335,6 +391,7 @@ EXPORT_SYMBOL(_write_unlock_irqrestore); | |||
335 | 391 | ||
336 | void __lockfunc _write_unlock_irq(rwlock_t *lock) | 392 | void __lockfunc _write_unlock_irq(rwlock_t *lock) |
337 | { | 393 | { |
394 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
338 | _raw_write_unlock(lock); | 395 | _raw_write_unlock(lock); |
339 | local_irq_enable(); | 396 | local_irq_enable(); |
340 | preempt_enable(); | 397 | preempt_enable(); |
@@ -343,9 +400,10 @@ EXPORT_SYMBOL(_write_unlock_irq); | |||
343 | 400 | ||
344 | void __lockfunc _write_unlock_bh(rwlock_t *lock) | 401 | void __lockfunc _write_unlock_bh(rwlock_t *lock) |
345 | { | 402 | { |
403 | rwlock_release(&lock->dep_map, 1, _RET_IP_); | ||
346 | _raw_write_unlock(lock); | 404 | _raw_write_unlock(lock); |
347 | preempt_enable_no_resched(); | 405 | preempt_enable_no_resched(); |
348 | local_bh_enable(); | 406 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
349 | } | 407 | } |
350 | EXPORT_SYMBOL(_write_unlock_bh); | 408 | EXPORT_SYMBOL(_write_unlock_bh); |
351 | 409 | ||
@@ -353,11 +411,13 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock) | |||
353 | { | 411 | { |
354 | local_bh_disable(); | 412 | local_bh_disable(); |
355 | preempt_disable(); | 413 | preempt_disable(); |
356 | if (_raw_spin_trylock(lock)) | 414 | if (_raw_spin_trylock(lock)) { |
415 | spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); | ||
357 | return 1; | 416 | return 1; |
417 | } | ||
358 | 418 | ||
359 | preempt_enable_no_resched(); | 419 | preempt_enable_no_resched(); |
360 | local_bh_enable(); | 420 | local_bh_enable_ip((unsigned long)__builtin_return_address(0)); |
361 | return 0; | 421 | return 0; |
362 | } | 422 | } |
363 | EXPORT_SYMBOL(_spin_trylock_bh); | 423 | EXPORT_SYMBOL(_spin_trylock_bh); |
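
The spinlock.c hunks above all apply one mechanical pattern: every out-of-line lock wrapper reports the acquire to lockdep before calling the raw lock primitive, and reports the release before the raw unlock, while the _bh variants switch to local_bh_enable_ip() so the softirq code can attribute the re-enable to the real caller. A condensed sketch of that pattern (illustrative only, not the in-tree wrappers):

#include <linux/spinlock.h>

void __lockfunc example_lock(spinlock_t *lock)
{
	preempt_disable();
	/* args: dep_map, subclass 0, not a trylock, caller IP for reports */
	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
	_raw_spin_lock(lock);
}

void __lockfunc example_unlock(spinlock_t *lock)
{
	/* the release is recorded before the lock is actually dropped */
	spin_release(&lock->dep_map, 1, _RET_IP_);
	_raw_spin_unlock(lock);
	preempt_enable();
}
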
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c new file mode 100644 index 000000000000..b71816e47a30 --- /dev/null +++ b/kernel/stacktrace.c | |||
@@ -0,0 +1,24 @@ | |||
1 | /* | ||
2 | * kernel/stacktrace.c | ||
3 | * | ||
4 | * Stack trace management functions | ||
5 | * | ||
6 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | */ | ||
8 | #include <linux/sched.h> | ||
9 | #include <linux/kallsyms.h> | ||
10 | #include <linux/stacktrace.h> | ||
11 | |||
12 | void print_stack_trace(struct stack_trace *trace, int spaces) | ||
13 | { | ||
14 | int i, j; | ||
15 | |||
16 | for (i = 0; i < trace->nr_entries; i++) { | ||
17 | unsigned long ip = trace->entries[i]; | ||
18 | |||
19 | for (j = 0; j < spaces + 1; j++) | ||
20 | printk(" "); | ||
21 | print_ip_sym(ip); | ||
22 | } | ||
23 | } | ||
24 | |||
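
print_stack_trace() only consumes the two fields an arch-specific saver fills in, nr_entries and entries[]. A hedged usage sketch, assuming nothing beyond those fields of struct stack_trace and the __builtin_return_address() idiom already used elsewhere in this series:

#include <linux/kernel.h>
#include <linux/stacktrace.h>

static void dump_one_frame(void)
{
	unsigned long entries[4];
	struct stack_trace trace = {
		.entries    = entries,
		.nr_entries = 0,
	};

	/* an arch saver would normally fill entries[]; fake a single frame */
	entries[trace.nr_entries++] = (unsigned long)__builtin_return_address(0);

	print_stack_trace(&trace, 0);	/* prints each entry indented by one space */
}
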
diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936f..e236f98f7ec5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -4,7 +4,6 @@ | |||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/config.h> | ||
8 | #include <linux/module.h> | 7 | #include <linux/module.h> |
9 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
10 | #include <linux/utsname.h> | 9 | #include <linux/utsname.h> |
@@ -13,7 +12,6 @@ | |||
13 | #include <linux/notifier.h> | 12 | #include <linux/notifier.h> |
14 | #include <linux/reboot.h> | 13 | #include <linux/reboot.h> |
15 | #include <linux/prctl.h> | 14 | #include <linux/prctl.h> |
16 | #include <linux/init.h> | ||
17 | #include <linux/highuid.h> | 15 | #include <linux/highuid.h> |
18 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
19 | #include <linux/kernel.h> | 17 | #include <linux/kernel.h> |
@@ -57,6 +55,12 @@ | |||
57 | #ifndef GET_FPEXC_CTL | 55 | #ifndef GET_FPEXC_CTL |
58 | # define GET_FPEXC_CTL(a,b) (-EINVAL) | 56 | # define GET_FPEXC_CTL(a,b) (-EINVAL) |
59 | #endif | 57 | #endif |
58 | #ifndef GET_ENDIAN | ||
59 | # define GET_ENDIAN(a,b) (-EINVAL) | ||
60 | #endif | ||
61 | #ifndef SET_ENDIAN | ||
62 | # define SET_ENDIAN(a,b) (-EINVAL) | ||
63 | #endif | ||
60 | 64 | ||
61 | /* | 65 | /* |
62 | * this is where the system-wide overflow UID and GID are defined, for | 66 | * this is where the system-wide overflow UID and GID are defined, for |
@@ -132,14 +136,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, | |||
132 | unsigned long val, void *v) | 136 | unsigned long val, void *v) |
133 | { | 137 | { |
134 | int ret = NOTIFY_DONE; | 138 | int ret = NOTIFY_DONE; |
135 | struct notifier_block *nb; | 139 | struct notifier_block *nb, *next_nb; |
136 | 140 | ||
137 | nb = rcu_dereference(*nl); | 141 | nb = rcu_dereference(*nl); |
138 | while (nb) { | 142 | while (nb) { |
143 | next_nb = rcu_dereference(nb->next); | ||
139 | ret = nb->notifier_call(nb, val, v); | 144 | ret = nb->notifier_call(nb, val, v); |
140 | if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) | 145 | if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) |
141 | break; | 146 | break; |
142 | nb = rcu_dereference(nb->next); | 147 | nb = next_nb; |
143 | } | 148 | } |
144 | return ret; | 149 | return ret; |
145 | } | 150 | } |
@@ -583,7 +588,7 @@ void emergency_restart(void) | |||
583 | } | 588 | } |
584 | EXPORT_SYMBOL_GPL(emergency_restart); | 589 | EXPORT_SYMBOL_GPL(emergency_restart); |
585 | 590 | ||
586 | void kernel_restart_prepare(char *cmd) | 591 | static void kernel_restart_prepare(char *cmd) |
587 | { | 592 | { |
588 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 593 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
589 | system_state = SYSTEM_RESTART; | 594 | system_state = SYSTEM_RESTART; |
@@ -617,7 +622,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); | |||
617 | * Move into place and start executing a preloaded standalone | 622 | * Move into place and start executing a preloaded standalone |
618 | * executable. If nothing was preloaded return an error. | 623 | * executable. If nothing was preloaded return an error. |
619 | */ | 624 | */ |
620 | void kernel_kexec(void) | 625 | static void kernel_kexec(void) |
621 | { | 626 | { |
622 | #ifdef CONFIG_KEXEC | 627 | #ifdef CONFIG_KEXEC |
623 | struct kimage *image; | 628 | struct kimage *image; |
@@ -631,7 +636,6 @@ void kernel_kexec(void) | |||
631 | machine_kexec(image); | 636 | machine_kexec(image); |
632 | #endif | 637 | #endif |
633 | } | 638 | } |
634 | EXPORT_SYMBOL_GPL(kernel_kexec); | ||
635 | 639 | ||
636 | void kernel_shutdown_prepare(enum system_states state) | 640 | void kernel_shutdown_prepare(enum system_states state) |
637 | { | 641 | { |
@@ -1860,23 +1864,20 @@ out: | |||
1860 | * fields when reaping, so a sample either gets all the additions of a | 1864 | * fields when reaping, so a sample either gets all the additions of a |
1861 | * given child after it's reaped, or none so this sample is before reaping. | 1865 | * given child after it's reaped, or none so this sample is before reaping. |
1862 | * | 1866 | * |
1863 | * tasklist_lock locking optimisation: | 1867 | * Locking: |
1864 | * If we are current and single threaded, we do not need to take the tasklist | 1868 | * We need to take the siglock for CHILDREN, SELF and BOTH |
1865 | * lock or the siglock. No one else can take our signal_struct away, | 1869 | * for the cases current multithreaded, non-current single threaded, |
1866 | * no one else can reap the children to update signal->c* counters, and | 1870 | * non-current multithreaded. Thread traversal is now safe with |
1867 | * no one else can race with the signal-> fields. | 1871 | * the siglock held. |
1868 | * If we do not take the tasklist_lock, the signal-> fields could be read | 1872 | * Strictly speaking, we do not need to take the siglock if we are current and |
1869 | * out of order while another thread was just exiting. So we place a | 1873 | * single threaded, as no one else can take our signal_struct away, no one |
1870 | * read memory barrier when we avoid the lock. On the writer side, | 1874 | * else can reap the children to update signal->c* counters, and no one else |
1871 | * write memory barrier is implied in __exit_signal as __exit_signal releases | 1875 | * can race with the signal-> fields. If we do not take any lock, the |
1872 | * the siglock spinlock after updating the signal-> fields. | 1876 | * signal-> fields could be read out of order while another thread was just |
1873 | * | 1877 | * exiting. So we should place a read memory barrier when we avoid the lock. |
1874 | * We don't really need the siglock when we access the non c* fields | 1878 | * On the writer side, write memory barrier is implied in __exit_signal |
1875 | * of the signal_struct (for RUSAGE_SELF) even in multithreaded | 1879 | * as __exit_signal releases the siglock spinlock after updating the signal-> |
1876 | * case, since we take the tasklist lock for read and the non c* signal-> | 1880 | * fields. But we don't do this yet to keep things simple. |
1877 | * fields are updated only in __exit_signal, which is called with | ||
1878 | * tasklist_lock taken for write, hence these two threads cannot execute | ||
1879 | * concurrently. | ||
1880 | * | 1881 | * |
1881 | */ | 1882 | */ |
1882 | 1883 | ||
@@ -1885,35 +1886,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1885 | struct task_struct *t; | 1886 | struct task_struct *t; |
1886 | unsigned long flags; | 1887 | unsigned long flags; |
1887 | cputime_t utime, stime; | 1888 | cputime_t utime, stime; |
1888 | int need_lock = 0; | ||
1889 | 1889 | ||
1890 | memset((char *) r, 0, sizeof *r); | 1890 | memset((char *) r, 0, sizeof *r); |
1891 | utime = stime = cputime_zero; | 1891 | utime = stime = cputime_zero; |
1892 | 1892 | ||
1893 | if (p != current || !thread_group_empty(p)) | 1893 | rcu_read_lock(); |
1894 | need_lock = 1; | 1894 | if (!lock_task_sighand(p, &flags)) { |
1895 | 1895 | rcu_read_unlock(); | |
1896 | if (need_lock) { | 1896 | return; |
1897 | read_lock(&tasklist_lock); | 1897 | } |
1898 | if (unlikely(!p->signal)) { | ||
1899 | read_unlock(&tasklist_lock); | ||
1900 | return; | ||
1901 | } | ||
1902 | } else | ||
1903 | /* See locking comments above */ | ||
1904 | smp_rmb(); | ||
1905 | 1898 | ||
1906 | switch (who) { | 1899 | switch (who) { |
1907 | case RUSAGE_BOTH: | 1900 | case RUSAGE_BOTH: |
1908 | case RUSAGE_CHILDREN: | 1901 | case RUSAGE_CHILDREN: |
1909 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1910 | utime = p->signal->cutime; | 1902 | utime = p->signal->cutime; |
1911 | stime = p->signal->cstime; | 1903 | stime = p->signal->cstime; |
1912 | r->ru_nvcsw = p->signal->cnvcsw; | 1904 | r->ru_nvcsw = p->signal->cnvcsw; |
1913 | r->ru_nivcsw = p->signal->cnivcsw; | 1905 | r->ru_nivcsw = p->signal->cnivcsw; |
1914 | r->ru_minflt = p->signal->cmin_flt; | 1906 | r->ru_minflt = p->signal->cmin_flt; |
1915 | r->ru_majflt = p->signal->cmaj_flt; | 1907 | r->ru_majflt = p->signal->cmaj_flt; |
1916 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1917 | 1908 | ||
1918 | if (who == RUSAGE_CHILDREN) | 1909 | if (who == RUSAGE_CHILDREN) |
1919 | break; | 1910 | break; |
@@ -1941,8 +1932,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1941 | BUG(); | 1932 | BUG(); |
1942 | } | 1933 | } |
1943 | 1934 | ||
1944 | if (need_lock) | 1935 | unlock_task_sighand(p, &flags); |
1945 | read_unlock(&tasklist_lock); | 1936 | rcu_read_unlock(); |
1937 | |||
1946 | cputime_to_timeval(utime, &r->ru_utime); | 1938 | cputime_to_timeval(utime, &r->ru_utime); |
1947 | cputime_to_timeval(stime, &r->ru_stime); | 1939 | cputime_to_timeval(stime, &r->ru_stime); |
1948 | } | 1940 | } |
@@ -1991,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
1991 | error = current->mm->dumpable; | 1983 | error = current->mm->dumpable; |
1992 | break; | 1984 | break; |
1993 | case PR_SET_DUMPABLE: | 1985 | case PR_SET_DUMPABLE: |
1994 | if (arg2 < 0 || arg2 > 2) { | 1986 | if (arg2 < 0 || arg2 > 1) { |
1995 | error = -EINVAL; | 1987 | error = -EINVAL; |
1996 | break; | 1988 | break; |
1997 | } | 1989 | } |
@@ -2057,6 +2049,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
2057 | return -EFAULT; | 2049 | return -EFAULT; |
2058 | return 0; | 2050 | return 0; |
2059 | } | 2051 | } |
2052 | case PR_GET_ENDIAN: | ||
2053 | error = GET_ENDIAN(current, arg2); | ||
2054 | break; | ||
2055 | case PR_SET_ENDIAN: | ||
2056 | error = SET_ENDIAN(current, arg2); | ||
2057 | break; | ||
2058 | |||
2060 | default: | 2059 | default: |
2061 | error = -EINVAL; | 2060 | error = -EINVAL; |
2062 | break; | 2061 | break; |
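
The k_getrusage() rework above drops the tasklist_lock/need_lock dance in favour of lock_task_sighand() under rcu_read_lock(), which pins the sighand structure and serialises all the signal-> fields being sampled. A minimal sketch of that pattern, using only the helpers the hunk itself uses (the -ESRCH return is illustrative):

#include <linux/sched.h>
#include <linux/errno.h>

static int sample_signal_counters(struct task_struct *p)
{
	unsigned long flags;

	rcu_read_lock();			/* keeps p->sighand from vanishing */
	if (!lock_task_sighand(p, &flags)) {
		rcu_read_unlock();
		return -ESRCH;			/* task has already exited */
	}

	/* p->signal->c* and the other signal-> fields are stable here */

	unlock_task_sighand(p, &flags);
	rcu_read_unlock();
	return 0;
}
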
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f1..6991bece67e8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); | |||
87 | cond_syscall(sys_inotify_add_watch); | 87 | cond_syscall(sys_inotify_add_watch); |
88 | cond_syscall(sys_inotify_rm_watch); | 88 | cond_syscall(sys_inotify_rm_watch); |
89 | cond_syscall(sys_migrate_pages); | 89 | cond_syscall(sys_migrate_pages); |
90 | cond_syscall(sys_move_pages); | ||
90 | cond_syscall(sys_chown16); | 91 | cond_syscall(sys_chown16); |
91 | cond_syscall(sys_fchown16); | 92 | cond_syscall(sys_fchown16); |
92 | cond_syscall(sys_getegid16); | 93 | cond_syscall(sys_getegid16); |
@@ -132,3 +133,4 @@ cond_syscall(sys_mincore); | |||
132 | cond_syscall(sys_madvise); | 133 | cond_syscall(sys_madvise); |
133 | cond_syscall(sys_mremap); | 134 | cond_syscall(sys_mremap); |
134 | cond_syscall(sys_remap_file_pages); | 135 | cond_syscall(sys_remap_file_pages); |
136 | cond_syscall(compat_sys_move_pages); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e82726faeeff..362a0cc37138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -18,7 +18,6 @@ | |||
18 | * Removed it and replaced it with older style, 03/23/00, Bill Wendling | 18 | * Removed it and replaced it with older style, 03/23/00, Bill Wendling |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/config.h> | ||
22 | #include <linux/module.h> | 21 | #include <linux/module.h> |
23 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
24 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
@@ -59,6 +58,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, | |||
59 | extern int C_A_D; | 58 | extern int C_A_D; |
60 | extern int sysctl_overcommit_memory; | 59 | extern int sysctl_overcommit_memory; |
61 | extern int sysctl_overcommit_ratio; | 60 | extern int sysctl_overcommit_ratio; |
61 | extern int sysctl_panic_on_oom; | ||
62 | extern int max_threads; | 62 | extern int max_threads; |
63 | extern int sysrq_enabled; | 63 | extern int sysrq_enabled; |
64 | extern int core_uses_pid; | 64 | extern int core_uses_pid; |
@@ -72,6 +72,7 @@ extern int printk_ratelimit_burst; | |||
72 | extern int pid_max_min, pid_max_max; | 72 | extern int pid_max_min, pid_max_max; |
73 | extern int sysctl_drop_caches; | 73 | extern int sysctl_drop_caches; |
74 | extern int percpu_pagelist_fraction; | 74 | extern int percpu_pagelist_fraction; |
75 | extern int compat_log; | ||
75 | 76 | ||
76 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 77 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
77 | int unknown_nmi_panic; | 78 | int unknown_nmi_panic; |
@@ -131,6 +132,10 @@ extern int acct_parm[]; | |||
131 | extern int no_unaligned_warning; | 132 | extern int no_unaligned_warning; |
132 | #endif | 133 | #endif |
133 | 134 | ||
135 | #ifdef CONFIG_RT_MUTEXES | ||
136 | extern int max_lock_depth; | ||
137 | #endif | ||
138 | |||
134 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, | 139 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, |
135 | ctl_table *, void **); | 140 | ctl_table *, void **); |
136 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, | 141 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, |
@@ -142,7 +147,6 @@ static struct ctl_table_header root_table_header = | |||
142 | 147 | ||
143 | static ctl_table kern_table[]; | 148 | static ctl_table kern_table[]; |
144 | static ctl_table vm_table[]; | 149 | static ctl_table vm_table[]; |
145 | static ctl_table proc_table[]; | ||
146 | static ctl_table fs_table[]; | 150 | static ctl_table fs_table[]; |
147 | static ctl_table debug_table[]; | 151 | static ctl_table debug_table[]; |
148 | static ctl_table dev_table[]; | 152 | static ctl_table dev_table[]; |
@@ -150,7 +154,7 @@ extern ctl_table random_table[]; | |||
150 | #ifdef CONFIG_UNIX98_PTYS | 154 | #ifdef CONFIG_UNIX98_PTYS |
151 | extern ctl_table pty_table[]; | 155 | extern ctl_table pty_table[]; |
152 | #endif | 156 | #endif |
153 | #ifdef CONFIG_INOTIFY | 157 | #ifdef CONFIG_INOTIFY_USER |
154 | extern ctl_table inotify_table[]; | 158 | extern ctl_table inotify_table[]; |
155 | #endif | 159 | #endif |
156 | 160 | ||
@@ -202,12 +206,6 @@ static ctl_table root_table[] = { | |||
202 | }, | 206 | }, |
203 | #endif | 207 | #endif |
204 | { | 208 | { |
205 | .ctl_name = CTL_PROC, | ||
206 | .procname = "proc", | ||
207 | .mode = 0555, | ||
208 | .child = proc_table, | ||
209 | }, | ||
210 | { | ||
211 | .ctl_name = CTL_FS, | 209 | .ctl_name = CTL_FS, |
212 | .procname = "fs", | 210 | .procname = "fs", |
213 | .mode = 0555, | 211 | .mode = 0555, |
@@ -398,7 +396,7 @@ static ctl_table kern_table[] = { | |||
398 | .strategy = &sysctl_string, | 396 | .strategy = &sysctl_string, |
399 | }, | 397 | }, |
400 | #endif | 398 | #endif |
401 | #ifdef CONFIG_HOTPLUG | 399 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
402 | { | 400 | { |
403 | .ctl_name = KERN_HOTPLUG, | 401 | .ctl_name = KERN_HOTPLUG, |
404 | .procname = "hotplug", | 402 | .procname = "hotplug", |
@@ -683,6 +681,27 @@ static ctl_table kern_table[] = { | |||
683 | .proc_handler = &proc_dointvec, | 681 | .proc_handler = &proc_dointvec, |
684 | }, | 682 | }, |
685 | #endif | 683 | #endif |
684 | #ifdef CONFIG_COMPAT | ||
685 | { | ||
686 | .ctl_name = KERN_COMPAT_LOG, | ||
687 | .procname = "compat-log", | ||
688 | .data = &compat_log, | ||
689 | .maxlen = sizeof (int), | ||
690 | .mode = 0644, | ||
691 | .proc_handler = &proc_dointvec, | ||
692 | }, | ||
693 | #endif | ||
694 | #ifdef CONFIG_RT_MUTEXES | ||
695 | { | ||
696 | .ctl_name = KERN_MAX_LOCK_DEPTH, | ||
697 | .procname = "max_lock_depth", | ||
698 | .data = &max_lock_depth, | ||
699 | .maxlen = sizeof(int), | ||
700 | .mode = 0644, | ||
701 | .proc_handler = &proc_dointvec, | ||
702 | }, | ||
703 | #endif | ||
704 | |||
686 | { .ctl_name = 0 } | 705 | { .ctl_name = 0 } |
687 | }; | 706 | }; |
688 | 707 | ||
@@ -702,6 +721,14 @@ static ctl_table vm_table[] = { | |||
702 | .proc_handler = &proc_dointvec, | 721 | .proc_handler = &proc_dointvec, |
703 | }, | 722 | }, |
704 | { | 723 | { |
724 | .ctl_name = VM_PANIC_ON_OOM, | ||
725 | .procname = "panic_on_oom", | ||
726 | .data = &sysctl_panic_on_oom, | ||
727 | .maxlen = sizeof(sysctl_panic_on_oom), | ||
728 | .mode = 0644, | ||
729 | .proc_handler = &proc_dointvec, | ||
730 | }, | ||
731 | { | ||
705 | .ctl_name = VM_OVERCOMMIT_RATIO, | 732 | .ctl_name = VM_OVERCOMMIT_RATIO, |
706 | .procname = "overcommit_ratio", | 733 | .procname = "overcommit_ratio", |
707 | .data = &sysctl_overcommit_ratio, | 734 | .data = &sysctl_overcommit_ratio, |
@@ -906,19 +933,29 @@ static ctl_table vm_table[] = { | |||
906 | .extra1 = &zero, | 933 | .extra1 = &zero, |
907 | }, | 934 | }, |
908 | { | 935 | { |
909 | .ctl_name = VM_ZONE_RECLAIM_INTERVAL, | 936 | .ctl_name = VM_MIN_UNMAPPED, |
910 | .procname = "zone_reclaim_interval", | 937 | .procname = "min_unmapped_ratio", |
911 | .data = &zone_reclaim_interval, | 938 | .data = &sysctl_min_unmapped_ratio, |
912 | .maxlen = sizeof(zone_reclaim_interval), | 939 | .maxlen = sizeof(sysctl_min_unmapped_ratio), |
913 | .mode = 0644, | 940 | .mode = 0644, |
914 | .proc_handler = &proc_dointvec_jiffies, | 941 | .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler, |
915 | .strategy = &sysctl_jiffies, | 942 | .strategy = &sysctl_intvec, |
943 | .extra1 = &zero, | ||
944 | .extra2 = &one_hundred, | ||
945 | }, | ||
946 | #endif | ||
947 | #ifdef CONFIG_X86_32 | ||
948 | { | ||
949 | .ctl_name = VM_VDSO_ENABLED, | ||
950 | .procname = "vdso_enabled", | ||
951 | .data = &vdso_enabled, | ||
952 | .maxlen = sizeof(vdso_enabled), | ||
953 | .mode = 0644, | ||
954 | .proc_handler = &proc_dointvec, | ||
955 | .strategy = &sysctl_intvec, | ||
956 | .extra1 = &zero, | ||
916 | }, | 957 | }, |
917 | #endif | 958 | #endif |
918 | { .ctl_name = 0 } | ||
919 | }; | ||
920 | |||
921 | static ctl_table proc_table[] = { | ||
922 | { .ctl_name = 0 } | 959 | { .ctl_name = 0 } |
923 | }; | 960 | }; |
924 | 961 | ||
@@ -1028,7 +1065,7 @@ static ctl_table fs_table[] = { | |||
1028 | .mode = 0644, | 1065 | .mode = 0644, |
1029 | .proc_handler = &proc_doulongvec_minmax, | 1066 | .proc_handler = &proc_doulongvec_minmax, |
1030 | }, | 1067 | }, |
1031 | #ifdef CONFIG_INOTIFY | 1068 | #ifdef CONFIG_INOTIFY_USER |
1032 | { | 1069 | { |
1033 | .ctl_name = FS_INOTIFY, | 1070 | .ctl_name = FS_INOTIFY, |
1034 | .procname = "inotify", | 1071 | .procname = "inotify", |
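
The new sysctl knobs above (compat-log, max_lock_depth, panic_on_oom, min_unmapped_ratio, vdso_enabled) all follow the same ctl_table recipe: bind a kernel integer to a /proc/sys file through a proc handler and terminate the table with a zero ctl_name. A hedged sketch with placeholder names (example_knob is not an in-tree entry):

#include <linux/sysctl.h>

static int example_knob;

static ctl_table example_table[] = {
	{
		.ctl_name	= 1,		/* a real entry uses a KERN_ or VM_ constant */
		.procname	= "example_knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }			/* table terminator */
};
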
diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 000000000000..f45179ce028e --- /dev/null +++ b/kernel/taskstats.c | |||
@@ -0,0 +1,568 @@ | |||
1 | /* | ||
2 | * taskstats.c - Export per-task statistics to userland | ||
3 | * | ||
4 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | ||
5 | * (C) Balbir Singh, IBM Corp. 2006 | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License, or | ||
10 | * (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | */ | ||
18 | |||
19 | #include <linux/kernel.h> | ||
20 | #include <linux/taskstats_kern.h> | ||
21 | #include <linux/delayacct.h> | ||
22 | #include <linux/cpumask.h> | ||
23 | #include <linux/percpu.h> | ||
24 | #include <net/genetlink.h> | ||
25 | #include <asm/atomic.h> | ||
26 | |||
27 | /* | ||
28 | * Maximum length of a cpumask that can be specified in | ||
29 | * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute | ||
30 | */ | ||
31 | #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) | ||
32 | |||
33 | static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; | ||
34 | static int family_registered; | ||
35 | kmem_cache_t *taskstats_cache; | ||
36 | |||
37 | static struct genl_family family = { | ||
38 | .id = GENL_ID_GENERATE, | ||
39 | .name = TASKSTATS_GENL_NAME, | ||
40 | .version = TASKSTATS_GENL_VERSION, | ||
41 | .maxattr = TASKSTATS_CMD_ATTR_MAX, | ||
42 | }; | ||
43 | |||
44 | static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] | ||
45 | __read_mostly = { | ||
46 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, | ||
47 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, | ||
48 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, | ||
49 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; | ||
50 | |||
51 | struct listener { | ||
52 | struct list_head list; | ||
53 | pid_t pid; | ||
54 | char valid; | ||
55 | }; | ||
56 | |||
57 | struct listener_list { | ||
58 | struct rw_semaphore sem; | ||
59 | struct list_head list; | ||
60 | }; | ||
61 | static DEFINE_PER_CPU(struct listener_list, listener_array); | ||
62 | |||
63 | enum actions { | ||
64 | REGISTER, | ||
65 | DEREGISTER, | ||
66 | CPU_DONT_CARE | ||
67 | }; | ||
68 | |||
69 | static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | ||
70 | void **replyp, size_t size) | ||
71 | { | ||
72 | struct sk_buff *skb; | ||
73 | void *reply; | ||
74 | |||
75 | /* | ||
76 | * If new attributes are added, please revisit this allocation | ||
77 | */ | ||
78 | skb = nlmsg_new(size); | ||
79 | if (!skb) | ||
80 | return -ENOMEM; | ||
81 | |||
82 | if (!info) { | ||
83 | int seq = get_cpu_var(taskstats_seqnum)++; | ||
84 | put_cpu_var(taskstats_seqnum); | ||
85 | |||
86 | reply = genlmsg_put(skb, 0, seq, | ||
87 | family.id, 0, 0, | ||
88 | cmd, family.version); | ||
89 | } else | ||
90 | reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, | ||
91 | family.id, 0, 0, | ||
92 | cmd, family.version); | ||
93 | if (reply == NULL) { | ||
94 | nlmsg_free(skb); | ||
95 | return -EINVAL; | ||
96 | } | ||
97 | |||
98 | *skbp = skb; | ||
99 | *replyp = reply; | ||
100 | return 0; | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Send taskstats data in @skb to listener with nl_pid @pid | ||
105 | */ | ||
106 | static int send_reply(struct sk_buff *skb, pid_t pid) | ||
107 | { | ||
108 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | ||
109 | void *reply = genlmsg_data(genlhdr); | ||
110 | int rc; | ||
111 | |||
112 | rc = genlmsg_end(skb, reply); | ||
113 | if (rc < 0) { | ||
114 | nlmsg_free(skb); | ||
115 | return rc; | ||
116 | } | ||
117 | |||
118 | return genlmsg_unicast(skb, pid); | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * Send taskstats data in @skb to listeners registered for @cpu's exit data | ||
123 | */ | ||
124 | static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) | ||
125 | { | ||
126 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | ||
127 | struct listener_list *listeners; | ||
128 | struct listener *s, *tmp; | ||
129 | struct sk_buff *skb_next, *skb_cur = skb; | ||
130 | void *reply = genlmsg_data(genlhdr); | ||
131 | int rc, ret, delcount = 0; | ||
132 | |||
133 | rc = genlmsg_end(skb, reply); | ||
134 | if (rc < 0) { | ||
135 | nlmsg_free(skb); | ||
136 | return rc; | ||
137 | } | ||
138 | |||
139 | rc = 0; | ||
140 | listeners = &per_cpu(listener_array, cpu); | ||
141 | down_read(&listeners->sem); | ||
142 | list_for_each_entry_safe(s, tmp, &listeners->list, list) { | ||
143 | skb_next = NULL; | ||
144 | if (!list_is_last(&s->list, &listeners->list)) { | ||
145 | skb_next = skb_clone(skb_cur, GFP_KERNEL); | ||
146 | if (!skb_next) { | ||
147 | nlmsg_free(skb_cur); | ||
148 | rc = -ENOMEM; | ||
149 | break; | ||
150 | } | ||
151 | } | ||
152 | ret = genlmsg_unicast(skb_cur, s->pid); | ||
153 | if (ret == -ECONNREFUSED) { | ||
154 | s->valid = 0; | ||
155 | delcount++; | ||
156 | rc = ret; | ||
157 | } | ||
158 | skb_cur = skb_next; | ||
159 | } | ||
160 | up_read(&listeners->sem); | ||
161 | |||
162 | if (!delcount) | ||
163 | return rc; | ||
164 | |||
165 | /* Delete invalidated entries */ | ||
166 | down_write(&listeners->sem); | ||
167 | list_for_each_entry_safe(s, tmp, &listeners->list, list) { | ||
168 | if (!s->valid) { | ||
169 | list_del(&s->list); | ||
170 | kfree(s); | ||
171 | } | ||
172 | } | ||
173 | up_write(&listeners->sem); | ||
174 | return rc; | ||
175 | } | ||
176 | |||
177 | static int fill_pid(pid_t pid, struct task_struct *pidtsk, | ||
178 | struct taskstats *stats) | ||
179 | { | ||
180 | int rc; | ||
181 | struct task_struct *tsk = pidtsk; | ||
182 | |||
183 | if (!pidtsk) { | ||
184 | read_lock(&tasklist_lock); | ||
185 | tsk = find_task_by_pid(pid); | ||
186 | if (!tsk) { | ||
187 | read_unlock(&tasklist_lock); | ||
188 | return -ESRCH; | ||
189 | } | ||
190 | get_task_struct(tsk); | ||
191 | read_unlock(&tasklist_lock); | ||
192 | } else | ||
193 | get_task_struct(tsk); | ||
194 | |||
195 | /* | ||
196 | * Each accounting subsystem adds calls to its functions to | ||
197 | * fill in relevant parts of struct taskstats as follows | ||
198 | * | ||
199 | * rc = per-task-foo(stats, tsk); | ||
200 | * if (rc) | ||
201 | * goto err; | ||
202 | */ | ||
203 | |||
204 | rc = delayacct_add_tsk(stats, tsk); | ||
205 | stats->version = TASKSTATS_VERSION; | ||
206 | |||
207 | /* Define err: label here if needed */ | ||
208 | put_task_struct(tsk); | ||
209 | return rc; | ||
210 | |||
211 | } | ||
212 | |||
213 | static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, | ||
214 | struct taskstats *stats) | ||
215 | { | ||
216 | struct task_struct *tsk, *first; | ||
217 | unsigned long flags; | ||
218 | |||
219 | /* | ||
220 | * Add additional stats from live tasks except zombie thread group | ||
221 | * leaders who are already counted with the dead tasks | ||
222 | */ | ||
223 | first = tgidtsk; | ||
224 | if (!first) { | ||
225 | read_lock(&tasklist_lock); | ||
226 | first = find_task_by_pid(tgid); | ||
227 | if (!first) { | ||
228 | read_unlock(&tasklist_lock); | ||
229 | return -ESRCH; | ||
230 | } | ||
231 | get_task_struct(first); | ||
232 | read_unlock(&tasklist_lock); | ||
233 | } else | ||
234 | get_task_struct(first); | ||
235 | |||
236 | /* Start with stats from dead tasks */ | ||
237 | spin_lock_irqsave(&first->signal->stats_lock, flags); | ||
238 | if (first->signal->stats) | ||
239 | memcpy(stats, first->signal->stats, sizeof(*stats)); | ||
240 | spin_unlock_irqrestore(&first->signal->stats_lock, flags); | ||
241 | |||
242 | tsk = first; | ||
243 | read_lock(&tasklist_lock); | ||
244 | do { | ||
245 | if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) | ||
246 | continue; | ||
247 | /* | ||
248 | * Accounting subsystem can call its functions here to | ||
249 | * fill in relevant parts of struct taskstats as follows | ||
250 | * | ||
251 | * per-task-foo(stats, tsk); | ||
252 | */ | ||
253 | delayacct_add_tsk(stats, tsk); | ||
254 | |||
255 | } while_each_thread(first, tsk); | ||
256 | read_unlock(&tasklist_lock); | ||
257 | stats->version = TASKSTATS_VERSION; | ||
258 | |||
259 | /* | ||
260 | * Accounting subsystems can also add calls here to modify | ||
261 | * fields of taskstats. | ||
262 | */ | ||
263 | |||
264 | return 0; | ||
265 | } | ||
266 | |||
267 | |||
268 | static void fill_tgid_exit(struct task_struct *tsk) | ||
269 | { | ||
270 | unsigned long flags; | ||
271 | |||
272 | spin_lock_irqsave(&tsk->signal->stats_lock, flags); | ||
273 | if (!tsk->signal->stats) | ||
274 | goto ret; | ||
275 | |||
276 | /* | ||
277 | * Each accounting subsystem calls its functions here to | ||
278 | * accumulate its per-task stats for tsk into the per-tgid structure | ||
279 | * | ||
280 | * per-task-foo(tsk->signal->stats, tsk); | ||
281 | */ | ||
282 | delayacct_add_tsk(tsk->signal->stats, tsk); | ||
283 | ret: | ||
284 | spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); | ||
285 | return; | ||
286 | } | ||
287 | |||
288 | static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) | ||
289 | { | ||
290 | struct listener_list *listeners; | ||
291 | struct listener *s, *tmp; | ||
292 | unsigned int cpu; | ||
293 | cpumask_t mask = *maskp; | ||
294 | |||
295 | if (!cpus_subset(mask, cpu_possible_map)) | ||
296 | return -EINVAL; | ||
297 | |||
298 | if (isadd == REGISTER) { | ||
299 | for_each_cpu_mask(cpu, mask) { | ||
300 | s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, | ||
301 | cpu_to_node(cpu)); | ||
302 | if (!s) | ||
303 | goto cleanup; | ||
304 | s->pid = pid; | ||
305 | INIT_LIST_HEAD(&s->list); | ||
306 | s->valid = 1; | ||
307 | |||
308 | listeners = &per_cpu(listener_array, cpu); | ||
309 | down_write(&listeners->sem); | ||
310 | list_add(&s->list, &listeners->list); | ||
311 | up_write(&listeners->sem); | ||
312 | } | ||
313 | return 0; | ||
314 | } | ||
315 | |||
316 | /* Deregister or cleanup */ | ||
317 | cleanup: | ||
318 | for_each_cpu_mask(cpu, mask) { | ||
319 | listeners = &per_cpu(listener_array, cpu); | ||
320 | down_write(&listeners->sem); | ||
321 | list_for_each_entry_safe(s, tmp, &listeners->list, list) { | ||
322 | if (s->pid == pid) { | ||
323 | list_del(&s->list); | ||
324 | kfree(s); | ||
325 | break; | ||
326 | } | ||
327 | } | ||
328 | up_write(&listeners->sem); | ||
329 | } | ||
330 | return 0; | ||
331 | } | ||
332 | |||
333 | static int parse(struct nlattr *na, cpumask_t *mask) | ||
334 | { | ||
335 | char *data; | ||
336 | int len; | ||
337 | int ret; | ||
338 | |||
339 | if (na == NULL) | ||
340 | return 1; | ||
341 | len = nla_len(na); | ||
342 | if (len > TASKSTATS_CPUMASK_MAXLEN) | ||
343 | return -E2BIG; | ||
344 | if (len < 1) | ||
345 | return -EINVAL; | ||
346 | data = kmalloc(len, GFP_KERNEL); | ||
347 | if (!data) | ||
348 | return -ENOMEM; | ||
349 | nla_strlcpy(data, na, len); | ||
350 | ret = cpulist_parse(data, *mask); | ||
351 | kfree(data); | ||
352 | return ret; | ||
353 | } | ||
354 | |||
355 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | ||
356 | { | ||
357 | int rc = 0; | ||
358 | struct sk_buff *rep_skb; | ||
359 | struct taskstats stats; | ||
360 | void *reply; | ||
361 | size_t size; | ||
362 | struct nlattr *na; | ||
363 | cpumask_t mask; | ||
364 | |||
365 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); | ||
366 | if (rc < 0) | ||
367 | return rc; | ||
368 | if (rc == 0) | ||
369 | return add_del_listener(info->snd_pid, &mask, REGISTER); | ||
370 | |||
371 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); | ||
372 | if (rc < 0) | ||
373 | return rc; | ||
374 | if (rc == 0) | ||
375 | return add_del_listener(info->snd_pid, &mask, DEREGISTER); | ||
376 | |||
377 | /* | ||
378 | * Size includes space for nested attributes | ||
379 | */ | ||
380 | size = nla_total_size(sizeof(u32)) + | ||
381 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
382 | |||
383 | memset(&stats, 0, sizeof(stats)); | ||
384 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | ||
385 | if (rc < 0) | ||
386 | return rc; | ||
387 | |||
388 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { | ||
389 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); | ||
390 | rc = fill_pid(pid, NULL, &stats); | ||
391 | if (rc < 0) | ||
392 | goto err; | ||
393 | |||
394 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | ||
395 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); | ||
396 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
397 | stats); | ||
398 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { | ||
399 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | ||
400 | rc = fill_tgid(tgid, NULL, &stats); | ||
401 | if (rc < 0) | ||
402 | goto err; | ||
403 | |||
404 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | ||
405 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); | ||
406 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
407 | stats); | ||
408 | } else { | ||
409 | rc = -EINVAL; | ||
410 | goto err; | ||
411 | } | ||
412 | |||
413 | nla_nest_end(rep_skb, na); | ||
414 | |||
415 | return send_reply(rep_skb, info->snd_pid); | ||
416 | |||
417 | nla_put_failure: | ||
418 | return genlmsg_cancel(rep_skb, reply); | ||
419 | err: | ||
420 | nlmsg_free(rep_skb); | ||
421 | return rc; | ||
422 | } | ||
423 | |||
424 | void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) | ||
425 | { | ||
426 | struct listener_list *listeners; | ||
427 | struct taskstats *tmp; | ||
428 | /* | ||
429 | * This is the cpu on which the task is exiting currently and will | ||
430 | * be the one for which the exit event is sent, even if the cpu | ||
431 | * on which this function is running changes later. | ||
432 | */ | ||
433 | *mycpu = raw_smp_processor_id(); | ||
434 | |||
435 | *ptidstats = NULL; | ||
436 | tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); | ||
437 | if (!tmp) | ||
438 | return; | ||
439 | |||
440 | listeners = &per_cpu(listener_array, *mycpu); | ||
441 | down_read(&listeners->sem); | ||
442 | if (!list_empty(&listeners->list)) { | ||
443 | *ptidstats = tmp; | ||
444 | tmp = NULL; | ||
445 | } | ||
446 | up_read(&listeners->sem); | ||
447 | kfree(tmp); | ||
448 | } | ||
449 | |||
450 | /* Send pid data out on exit */ | ||
451 | void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, | ||
452 | int group_dead, unsigned int mycpu) | ||
453 | { | ||
454 | int rc; | ||
455 | struct sk_buff *rep_skb; | ||
456 | void *reply; | ||
457 | size_t size; | ||
458 | int is_thread_group; | ||
459 | struct nlattr *na; | ||
460 | unsigned long flags; | ||
461 | |||
462 | if (!family_registered || !tidstats) | ||
463 | return; | ||
464 | |||
465 | spin_lock_irqsave(&tsk->signal->stats_lock, flags); | ||
466 | is_thread_group = tsk->signal->stats ? 1 : 0; | ||
467 | spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); | ||
468 | |||
469 | rc = 0; | ||
470 | /* | ||
471 | * Size includes space for nested attributes | ||
472 | */ | ||
473 | size = nla_total_size(sizeof(u32)) + | ||
474 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | ||
475 | |||
476 | if (is_thread_group) | ||
477 | size = 2 * size; /* PID + STATS + TGID + STATS */ | ||
478 | |||
479 | rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | ||
480 | if (rc < 0) | ||
481 | goto ret; | ||
482 | |||
483 | rc = fill_pid(tsk->pid, tsk, tidstats); | ||
484 | if (rc < 0) | ||
485 | goto err_skb; | ||
486 | |||
487 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | ||
488 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); | ||
489 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
490 | *tidstats); | ||
491 | nla_nest_end(rep_skb, na); | ||
492 | |||
493 | if (!is_thread_group) | ||
494 | goto send; | ||
495 | |||
496 | /* | ||
497 | * tsk has/had a thread group so fill the tsk->signal->stats structure | ||
498 | * Doesn't matter if tsk is the leader or the last group member leaving | ||
499 | */ | ||
500 | |||
501 | fill_tgid_exit(tsk); | ||
502 | if (!group_dead) | ||
503 | goto send; | ||
504 | |||
505 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | ||
506 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); | ||
507 | /* No locking needed for tsk->signal->stats since group is dead */ | ||
508 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | ||
509 | *tsk->signal->stats); | ||
510 | nla_nest_end(rep_skb, na); | ||
511 | |||
512 | send: | ||
513 | send_cpu_listeners(rep_skb, mycpu); | ||
514 | return; | ||
515 | |||
516 | nla_put_failure: | ||
517 | genlmsg_cancel(rep_skb, reply); | ||
518 | goto ret; | ||
519 | err_skb: | ||
520 | nlmsg_free(rep_skb); | ||
521 | ret: | ||
522 | return; | ||
523 | } | ||
524 | |||
525 | static struct genl_ops taskstats_ops = { | ||
526 | .cmd = TASKSTATS_CMD_GET, | ||
527 | .doit = taskstats_user_cmd, | ||
528 | .policy = taskstats_cmd_get_policy, | ||
529 | }; | ||
530 | |||
531 | /* Needed early in initialization */ | ||
532 | void __init taskstats_init_early(void) | ||
533 | { | ||
534 | unsigned int i; | ||
535 | |||
536 | taskstats_cache = kmem_cache_create("taskstats_cache", | ||
537 | sizeof(struct taskstats), | ||
538 | 0, SLAB_PANIC, NULL, NULL); | ||
539 | for_each_possible_cpu(i) { | ||
540 | INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); | ||
541 | init_rwsem(&(per_cpu(listener_array, i).sem)); | ||
542 | } | ||
543 | } | ||
544 | |||
545 | static int __init taskstats_init(void) | ||
546 | { | ||
547 | int rc; | ||
548 | |||
549 | rc = genl_register_family(&family); | ||
550 | if (rc) | ||
551 | return rc; | ||
552 | |||
553 | rc = genl_register_ops(&family, &taskstats_ops); | ||
554 | if (rc < 0) | ||
555 | goto err; | ||
556 | |||
557 | family_registered = 1; | ||
558 | return 0; | ||
559 | err: | ||
560 | genl_unregister_family(&family); | ||
561 | return rc; | ||
562 | } | ||
563 | |||
564 | /* | ||
565 | * late initcall ensures initialization of statistics collection | ||
566 | * mechanisms precedes initialization of the taskstats interface | ||
567 | */ | ||
568 | late_initcall(taskstats_init); | ||
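
taskstats_exit_alloc() and taskstats_exit_send() are written to be paired by the exit path: the first hands back a buffer only if a listener is registered on the exiting CPU, the second fills it and multicasts it to those listeners. A hedged sketch of how a caller is expected to string them together (the final free is an assumption about the caller, not code from this file):

#include <linux/taskstats_kern.h>
#include <linux/slab.h>

static void example_exit_accounting(struct task_struct *tsk, int group_dead)
{
	struct taskstats *tidstats;
	unsigned int cpu;

	taskstats_exit_alloc(&tidstats, &cpu);	/* NULL if nobody listens on this cpu */
	taskstats_exit_send(tsk, tidstats, group_dead, cpu);

	if (tidstats)				/* the caller owns the buffer */
		kmem_cache_free(taskstats_cache, tidstats);
}
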
diff --git a/kernel/time.c b/kernel/time.c index b00ddc71cedb..5bd489747643 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
523 | 523 | ||
524 | 524 | ||
525 | #else | 525 | #else |
526 | #ifndef CONFIG_GENERIC_TIME | ||
526 | /* | 527 | /* |
527 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval | 528 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval |
528 | * and therefore only yields usec accuracy | 529 | * and therefore only yields usec accuracy |
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv) | |||
537 | } | 538 | } |
538 | EXPORT_SYMBOL_GPL(getnstimeofday); | 539 | EXPORT_SYMBOL_GPL(getnstimeofday); |
539 | #endif | 540 | #endif |
541 | #endif | ||
540 | 542 | ||
541 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 543 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
542 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 544 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 000000000000..e1dfd8e86cce --- /dev/null +++ b/kernel/time/Makefile | |||
@@ -0,0 +1 @@ | |||
obj-y += clocksource.o jiffies.o | |||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 000000000000..74eca5939bd9 --- /dev/null +++ b/kernel/time/clocksource.c | |||
@@ -0,0 +1,349 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/clocksource.c | ||
3 | * | ||
4 | * This file contains the functions which manage clocksource drivers. | ||
5 | * | ||
6 | * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | * | ||
22 | * TODO WishList: | ||
23 | * o Allow clocksource drivers to be unregistered | ||
24 | * o get rid of clocksource_jiffies extern | ||
25 | */ | ||
26 | |||
27 | #include <linux/clocksource.h> | ||
28 | #include <linux/sysdev.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <linux/module.h> | ||
31 | |||
32 | /* XXX - Would like a better way for initializing curr_clocksource */ | ||
33 | extern struct clocksource clocksource_jiffies; | ||
34 | |||
35 | /*[Clocksource internal variables]--------- | ||
36 | * curr_clocksource: | ||
37 | * currently selected clocksource. Initialized to clocksource_jiffies. | ||
38 | * next_clocksource: | ||
39 | * pending next selected clocksource. | ||
40 | * clocksource_list: | ||
41 | * linked list with the registered clocksources | ||
42 | * clocksource_lock: | ||
43 | * protects manipulations to curr_clocksource and next_clocksource | ||
44 | * and the clocksource_list | ||
45 | * override_name: | ||
46 | * Name of the user-specified clocksource. | ||
47 | */ | ||
48 | static struct clocksource *curr_clocksource = &clocksource_jiffies; | ||
49 | static struct clocksource *next_clocksource; | ||
50 | static LIST_HEAD(clocksource_list); | ||
51 | static DEFINE_SPINLOCK(clocksource_lock); | ||
52 | static char override_name[32]; | ||
53 | static int finished_booting; | ||
54 | |||
55 | /* clocksource_done_booting - Called near the end of bootup | ||
56 | * | ||
57 | * Hack to avoid lots of clocksource churn at boot time | ||
58 | */ | ||
59 | static int __init clocksource_done_booting(void) | ||
60 | { | ||
61 | finished_booting = 1; | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | late_initcall(clocksource_done_booting); | ||
66 | |||
67 | /** | ||
68 | * clocksource_get_next - Returns the selected clocksource | ||
69 | * | ||
70 | */ | ||
71 | struct clocksource *clocksource_get_next(void) | ||
72 | { | ||
73 | unsigned long flags; | ||
74 | |||
75 | spin_lock_irqsave(&clocksource_lock, flags); | ||
76 | if (next_clocksource && finished_booting) { | ||
77 | curr_clocksource = next_clocksource; | ||
78 | next_clocksource = NULL; | ||
79 | } | ||
80 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
81 | |||
82 | return curr_clocksource; | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * select_clocksource - Finds the best registered clocksource. | ||
87 | * | ||
88 | * Private function. Must hold clocksource_lock when called. | ||
89 | * | ||
90 | * Looks through the list of registered clocksources, returning | ||
91 | * the one with the highest rating value. If there is a clocksource | ||
92 | * name that matches the override string, it returns that clocksource. | ||
93 | */ | ||
94 | static struct clocksource *select_clocksource(void) | ||
95 | { | ||
96 | struct clocksource *best = NULL; | ||
97 | struct list_head *tmp; | ||
98 | |||
99 | list_for_each(tmp, &clocksource_list) { | ||
100 | struct clocksource *src; | ||
101 | |||
102 | src = list_entry(tmp, struct clocksource, list); | ||
103 | if (!best) | ||
104 | best = src; | ||
105 | |||
106 | /* check for override: */ | ||
107 | if (strlen(src->name) == strlen(override_name) && | ||
108 | !strcmp(src->name, override_name)) { | ||
109 | best = src; | ||
110 | break; | ||
111 | } | ||
112 | /* pick the highest rating: */ | ||
113 | if (src->rating > best->rating) | ||
114 | best = src; | ||
115 | } | ||
116 | |||
117 | return best; | ||
118 | } | ||
119 | |||
120 | /** | ||
121 | * is_registered_source - Checks if clocksource is registered | ||
122 | * @c: pointer to a clocksource | ||
123 | * | ||
124 | * Private helper function. Must hold clocksource_lock when called. | ||
125 | * | ||
126 | * Returns one if the clocksource is already registered, zero otherwise. | ||
127 | */ | ||
128 | static int is_registered_source(struct clocksource *c) | ||
129 | { | ||
130 | int len = strlen(c->name); | ||
131 | struct list_head *tmp; | ||
132 | |||
133 | list_for_each(tmp, &clocksource_list) { | ||
134 | struct clocksource *src; | ||
135 | |||
136 | src = list_entry(tmp, struct clocksource, list); | ||
137 | if (strlen(src->name) == len && !strcmp(src->name, c->name)) | ||
138 | return 1; | ||
139 | } | ||
140 | |||
141 | return 0; | ||
142 | } | ||
143 | |||
144 | /** | ||
145 | * clocksource_register - Used to install new clocksources | ||
146 | * @t: clocksource to be registered | ||
147 | * | ||
148 | * Returns -EBUSY if registration fails, zero otherwise. | ||
149 | */ | ||
150 | int clocksource_register(struct clocksource *c) | ||
151 | { | ||
152 | int ret = 0; | ||
153 | unsigned long flags; | ||
154 | |||
155 | spin_lock_irqsave(&clocksource_lock, flags); | ||
156 | /* check if clocksource is already registered */ | ||
157 | if (is_registered_source(c)) { | ||
158 | printk("register_clocksource: Cannot register %s. " | ||
159 | "Already registered!", c->name); | ||
160 | ret = -EBUSY; | ||
161 | } else { | ||
162 | /* register it */ | ||
163 | list_add(&c->list, &clocksource_list); | ||
164 | /* scan the registered clocksources, and pick the best one */ | ||
165 | next_clocksource = select_clocksource(); | ||
166 | } | ||
167 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
168 | return ret; | ||
169 | } | ||
170 | EXPORT_SYMBOL(clocksource_register); | ||
171 | |||
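For illustration, a minimal clocksource built against this API might look like the sketch below. It is not part of this patch: read_example_counter() and EXAMPLE_COUNTER_HZ are hypothetical stand-ins for a real hardware counter, the field layout mirrors the jiffies clocksource added later in this series, and clocksource_hz2mult() is assumed to be the mult-calculation helper from <linux/clocksource.h>.

/* Hypothetical clocksource registration sketch -- not part of this patch. */
#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/module.h>

#define EXAMPLE_COUNTER_HZ	1000000		/* assumed 1 MHz free-running counter */

extern u32 read_example_counter(void);		/* hypothetical hardware accessor */

static cycle_t example_read(void)
{
	return (cycle_t)read_example_counter();
}

static struct clocksource clocksource_example = {
	.name		= "example",
	.rating		= 200,			/* mid-range: better than jiffies (0) */
	.read		= example_read,
	.mask		= 0xffffffff,		/* 32-bit counter */
	.shift		= 20,
	.is_continuous	= 1,			/* free running */
};

static int __init example_clocksource_init(void)
{
	/* mult is chosen so that cyc2ns() == (cycles * mult) >> shift */
	clocksource_example.mult =
		clocksource_hz2mult(EXAMPLE_COUNTER_HZ, clocksource_example.shift);
	return clocksource_register(&clocksource_example);
}
module_init(example_clocksource_init);
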
172 | /** | ||
173 | * clocksource_reselect - Rescan list for next clocksource | ||
174 | * | ||
175 | * A quick helper function to be used if a clocksource changes its | ||
176 | * rating. Forces the clocksource list to be re-scanned for the best | ||
177 | * clocksource. | ||
178 | */ | ||
179 | void clocksource_reselect(void) | ||
180 | { | ||
181 | unsigned long flags; | ||
182 | |||
183 | spin_lock_irqsave(&clocksource_lock, flags); | ||
184 | next_clocksource = select_clocksource(); | ||
185 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
186 | } | ||
187 | EXPORT_SYMBOL(clocksource_reselect); | ||
188 | |||
189 | /** | ||
190 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | ||
191 | * @dev: unused | ||
192 | * @buf: char buffer to be filled with clocksource list | ||
193 | * | ||
194 | * Provides sysfs interface for listing current clocksource. | ||
195 | */ | ||
196 | static ssize_t | ||
197 | sysfs_show_current_clocksources(struct sys_device *dev, char *buf) | ||
198 | { | ||
199 | char *curr = buf; | ||
200 | |||
201 | spin_lock_irq(&clocksource_lock); | ||
202 | curr += sprintf(curr, "%s ", curr_clocksource->name); | ||
203 | spin_unlock_irq(&clocksource_lock); | ||
204 | |||
205 | curr += sprintf(curr, "\n"); | ||
206 | |||
207 | return curr - buf; | ||
208 | } | ||
209 | |||
210 | /** | ||
211 | * sysfs_override_clocksource - interface for manually overriding clocksource | ||
212 | * @dev: unused | ||
213 | * @buf: name of override clocksource | ||
214 | * @count: length of buffer | ||
215 | * | ||
216 | * Takes input from the sysfs interface for manually overriding the default | ||
217 | * clocksource selection. | ||
218 | */ | ||
219 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, | ||
220 | const char *buf, size_t count) | ||
221 | { | ||
222 | size_t ret = count; | ||
223 | /* strings from sysfs write are not 0 terminated! */ | ||
224 | if (count >= sizeof(override_name)) | ||
225 | return -EINVAL; | ||
226 | |||
227 | /* strip off \n: */ | ||
228 | if (buf[count-1] == '\n') | ||
229 | count--; | ||
230 | if (count < 1) | ||
231 | return -EINVAL; | ||
232 | |||
233 | spin_lock_irq(&clocksource_lock); | ||
234 | |||
235 | /* copy the name given: */ | ||
236 | memcpy(override_name, buf, count); | ||
237 | override_name[count] = 0; | ||
238 | |||
239 | /* try to select it: */ | ||
240 | next_clocksource = select_clocksource(); | ||
241 | |||
242 | spin_unlock_irq(&clocksource_lock); | ||
243 | |||
244 | return ret; | ||
245 | } | ||
246 | |||
247 | /** | ||
248 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource | ||
249 | * @dev: unused | ||
250 | * @buf: char buffer to be filled with clocksource list | ||
251 | * | ||
252 | * Provides sysfs interface for listing registered clocksources | ||
253 | */ | ||
254 | static ssize_t | ||
255 | sysfs_show_available_clocksources(struct sys_device *dev, char *buf) | ||
256 | { | ||
257 | struct list_head *tmp; | ||
258 | char *curr = buf; | ||
259 | |||
260 | spin_lock_irq(&clocksource_lock); | ||
261 | list_for_each(tmp, &clocksource_list) { | ||
262 | struct clocksource *src; | ||
263 | |||
264 | src = list_entry(tmp, struct clocksource, list); | ||
265 | curr += sprintf(curr, "%s ", src->name); | ||
266 | } | ||
267 | spin_unlock_irq(&clocksource_lock); | ||
268 | |||
269 | curr += sprintf(curr, "\n"); | ||
270 | |||
271 | return curr - buf; | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * Sysfs setup bits: | ||
276 | */ | ||
277 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, | ||
278 | sysfs_override_clocksource); | ||
279 | |||
280 | static SYSDEV_ATTR(available_clocksource, 0600, | ||
281 | sysfs_show_available_clocksources, NULL); | ||
282 | |||
283 | static struct sysdev_class clocksource_sysclass = { | ||
284 | set_kset_name("clocksource"), | ||
285 | }; | ||
286 | |||
287 | static struct sys_device device_clocksource = { | ||
288 | .id = 0, | ||
289 | .cls = &clocksource_sysclass, | ||
290 | }; | ||
291 | |||
292 | static int __init init_clocksource_sysfs(void) | ||
293 | { | ||
294 | int error = sysdev_class_register(&clocksource_sysclass); | ||
295 | |||
296 | if (!error) | ||
297 | error = sysdev_register(&device_clocksource); | ||
298 | if (!error) | ||
299 | error = sysdev_create_file( | ||
300 | &device_clocksource, | ||
301 | &attr_current_clocksource); | ||
302 | if (!error) | ||
303 | error = sysdev_create_file( | ||
304 | &device_clocksource, | ||
305 | &attr_available_clocksource); | ||
306 | return error; | ||
307 | } | ||
308 | |||
309 | device_initcall(init_clocksource_sysfs); | ||
310 | |||
311 | /** | ||
312 | * boot_override_clocksource - boot clock override | ||
313 | * @str: override name | ||
314 | * | ||
315 | * Takes a clocksource= boot argument and uses it | ||
316 | * as the clocksource override name. | ||
317 | */ | ||
318 | static int __init boot_override_clocksource(char* str) | ||
319 | { | ||
320 | unsigned long flags; | ||
321 | spin_lock_irqsave(&clocksource_lock, flags); | ||
322 | if (str) | ||
323 | strlcpy(override_name, str, sizeof(override_name)); | ||
324 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
325 | return 1; | ||
326 | } | ||
327 | |||
328 | __setup("clocksource=", boot_override_clocksource); | ||
329 | |||
330 | /** | ||
331 | * boot_override_clock - Compatibility layer for deprecated boot option | ||
332 | * @str: override name | ||
333 | * | ||
334 | * DEPRECATED! Takes a clock= boot argument and uses it | ||
335 | * as the clocksource override name | ||
336 | */ | ||
337 | static int __init boot_override_clock(char* str) | ||
338 | { | ||
339 | if (!strcmp(str, "pmtmr")) { | ||
340 | printk("Warning: clock=pmtmr is deprecated. " | ||
341 | "Use clocksource=acpi_pm.\n"); | ||
342 | return boot_override_clocksource("acpi_pm"); | ||
343 | } | ||
344 | printk("Warning! clock= boot option is deprecated. " | ||
345 | "Use clocksource=xyz\n"); | ||
346 | return boot_override_clocksource(str); | ||
347 | } | ||
348 | |||
349 | __setup("clock=", boot_override_clock); | ||
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 000000000000..126bb30c4afe --- /dev/null +++ b/kernel/time/jiffies.c | |||
@@ -0,0 +1,73 @@ | |||
1 | /*********************************************************************** | ||
2 | * linux/kernel/time/jiffies.c | ||
3 | * | ||
4 | * This file contains the jiffies based clocksource. | ||
5 | * | ||
6 | * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | * | ||
22 | ************************************************************************/ | ||
23 | #include <linux/clocksource.h> | ||
24 | #include <linux/jiffies.h> | ||
25 | #include <linux/init.h> | ||
26 | |||
27 | /* The Jiffies based clocksource is the lowest common | ||
28 | * denominator clock source which should function on | ||
29 | * all systems. It has the same coarse resolution as | ||
30 | * the timer interrupt frequency HZ and it suffers | ||
31 | * inaccuracies caused by missed or lost timer | ||
32 | * interrupts and the inability of the timer | ||
33 | * interrupt hardware to accurately tick at the | ||
34 | * requested HZ value. It is also not recommended | ||
35 | * for "tick-less" systems. | ||
36 | */ | ||
37 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) | ||
38 | |||
39 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier | ||
40 | * conversion, the .shift value could be zero. However | ||
41 | * this would make NTP adjustments impossible as they are | ||
42 | * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to | ||
43 | * shift both the numerator and denominator the same | ||
44 | * amount, and give ntp adjustments in units of 1/2^8 | ||
45 | * | ||
46 | * The value 8 is somewhat carefully chosen, as anything | ||
47 | * larger can result in overflows. NSEC_PER_JIFFY grows as | ||
48 | * HZ shrinks, so values greater than 8 overflow 32 bits when | ||
49 | * HZ=100. | ||
50 | */ | ||
51 | #define JIFFIES_SHIFT 8 | ||
52 | |||
53 | static cycle_t jiffies_read(void) | ||
54 | { | ||
55 | return (cycle_t) jiffies; | ||
56 | } | ||
57 | |||
58 | struct clocksource clocksource_jiffies = { | ||
59 | .name = "jiffies", | ||
60 | .rating = 0, /* lowest rating */ | ||
61 | .read = jiffies_read, | ||
62 | .mask = 0xffffffff, /* 32 bits */ | ||
63 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | ||
64 | .shift = JIFFIES_SHIFT, | ||
65 | .is_continuous = 0, /* tick based, not free running */ | ||
66 | }; | ||
67 | |||
68 | static int __init init_jiffies_clocksource(void) | ||
69 | { | ||
70 | return clocksource_register(&clocksource_jiffies); | ||
71 | } | ||
72 | |||
73 | module_init(init_jiffies_clocksource); | ||
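The mult/shift pair above is a plain fixed-point scale factor: nanoseconds = (cycles * mult) >> shift. A small user-space sketch of that arithmetic for the jiffies case (the HZ value and the ACTHZ ~= HZ << 8 approximation are assumptions made only for this illustration):

/* Worked example of the clocksource fixed-point conversion used above:
 * ns = (cycles * mult) >> shift.  Values are illustrative only. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t NSEC_PER_SEC = 1000000000ULL;
	const unsigned int HZ = 1000;			/* assumed tick rate */
	const unsigned int JIFFIES_SHIFT = 8;

	/* NSEC_PER_JIFFY scaled as in the #define above
	 * (ACTHZ approximated by HZ << 8 for this sketch) */
	uint32_t nsec_per_jiffy = (uint32_t)((NSEC_PER_SEC << 8) / (HZ << 8));
	uint32_t mult = nsec_per_jiffy << JIFFIES_SHIFT;

	uint64_t cycles = 5;				/* five jiffies elapsed */
	uint64_t ns = (cycles * mult) >> JIFFIES_SHIFT;

	printf("mult=%u shift=%u -> %llu ns for %llu jiffies\n",
	       mult, JIFFIES_SHIFT, (unsigned long long)ns,
	       (unsigned long long)cycles);
	return 0;
}
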
diff --git a/kernel/timer.c b/kernel/timer.c index 9e49deed468c..05809c2e2fd6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | |||
146 | void fastcall init_timer(struct timer_list *timer) | 146 | void fastcall init_timer(struct timer_list *timer) |
147 | { | 147 | { |
148 | timer->entry.next = NULL; | 148 | timer->entry.next = NULL; |
149 | timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); | 149 | timer->base = __raw_get_cpu_var(tvec_bases); |
150 | } | 150 | } |
151 | EXPORT_SYMBOL(init_timer); | 151 | EXPORT_SYMBOL(init_timer); |
152 | 152 | ||
@@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer) | |||
374 | int ret = try_to_del_timer_sync(timer); | 374 | int ret = try_to_del_timer_sync(timer); |
375 | if (ret >= 0) | 375 | if (ret >= 0) |
376 | return ret; | 376 | return ret; |
377 | cpu_relax(); | ||
377 | } | 378 | } |
378 | } | 379 | } |
379 | 380 | ||
@@ -383,23 +384,19 @@ EXPORT_SYMBOL(del_timer_sync); | |||
383 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) | 384 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) |
384 | { | 385 | { |
385 | /* cascade all the timers from tv up one level */ | 386 | /* cascade all the timers from tv up one level */ |
386 | struct list_head *head, *curr; | 387 | struct timer_list *timer, *tmp; |
388 | struct list_head tv_list; | ||
389 | |||
390 | list_replace_init(tv->vec + index, &tv_list); | ||
387 | 391 | ||
388 | head = tv->vec + index; | ||
389 | curr = head->next; | ||
390 | /* | 392 | /* |
391 | * We are removing _all_ timers from the list, so we don't have to | 393 | * We are removing _all_ timers from the list, so we |
392 | * detach them individually, just clear the list afterwards. | 394 | * don't have to detach them individually. |
393 | */ | 395 | */ |
394 | while (curr != head) { | 396 | list_for_each_entry_safe(timer, tmp, &tv_list, entry) { |
395 | struct timer_list *tmp; | 397 | BUG_ON(timer->base != base); |
396 | 398 | internal_add_timer(base, timer); | |
397 | tmp = list_entry(curr, struct timer_list, entry); | ||
398 | BUG_ON(tmp->base != base); | ||
399 | curr = curr->next; | ||
400 | internal_add_timer(base, tmp); | ||
401 | } | 399 | } |
402 | INIT_LIST_HEAD(head); | ||
403 | 400 | ||
404 | return index; | 401 | return index; |
405 | } | 402 | } |
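The list_replace_init() call above (a helper assumed to be introduced alongside this change) detaches the whole chain onto a local head and leaves the original list empty, so entries can be re-filed without detaching them one by one. A minimal sketch of the idiom, with a hypothetical item type:

/* Sketch of the list_replace_init() idiom used by cascade() above:
 * steal the whole chain onto a local head, leaving the source list empty. */
#include <linux/list.h>

struct example_item {
	struct list_head entry;
	int value;
};

static void example_drain(struct list_head *queue)
{
	struct example_item *item, *tmp;
	LIST_HEAD(local);

	list_replace_init(queue, &local);	/* queue is now empty and reusable */

	list_for_each_entry_safe(item, tmp, &local, entry) {
		list_del(&item->entry);		/* re-file or process each entry */
	}
}
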
@@ -419,10 +416,10 @@ static inline void __run_timers(tvec_base_t *base) | |||
419 | 416 | ||
420 | spin_lock_irq(&base->lock); | 417 | spin_lock_irq(&base->lock); |
421 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 418 | while (time_after_eq(jiffies, base->timer_jiffies)) { |
422 | struct list_head work_list = LIST_HEAD_INIT(work_list); | 419 | struct list_head work_list; |
423 | struct list_head *head = &work_list; | 420 | struct list_head *head = &work_list; |
424 | int index = base->timer_jiffies & TVR_MASK; | 421 | int index = base->timer_jiffies & TVR_MASK; |
425 | 422 | ||
426 | /* | 423 | /* |
427 | * Cascade timers: | 424 | * Cascade timers: |
428 | */ | 425 | */ |
@@ -431,8 +428,8 @@ static inline void __run_timers(tvec_base_t *base) | |||
431 | (!cascade(base, &base->tv3, INDEX(1))) && | 428 | (!cascade(base, &base->tv3, INDEX(1))) && |
432 | !cascade(base, &base->tv4, INDEX(2))) | 429 | !cascade(base, &base->tv4, INDEX(2))) |
433 | cascade(base, &base->tv5, INDEX(3)); | 430 | cascade(base, &base->tv5, INDEX(3)); |
434 | ++base->timer_jiffies; | 431 | ++base->timer_jiffies; |
435 | list_splice_init(base->tv1.vec + index, &work_list); | 432 | list_replace_init(base->tv1.vec + index, &work_list); |
436 | while (!list_empty(head)) { | 433 | while (!list_empty(head)) { |
437 | void (*fn)(unsigned long); | 434 | void (*fn)(unsigned long); |
438 | unsigned long data; | 435 | unsigned long data; |
@@ -601,7 +598,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ | |||
601 | long time_precision = 1; /* clock precision (us) */ | 598 | long time_precision = 1; /* clock precision (us) */ |
602 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ | 599 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ |
603 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ | 600 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ |
604 | static long time_phase; /* phase offset (scaled us) */ | ||
605 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; | 601 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; |
606 | /* frequency offset (scaled ppm)*/ | 602 | /* frequency offset (scaled ppm)*/ |
607 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ | 603 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ |
@@ -751,27 +747,14 @@ static long adjtime_adjustment(void) | |||
751 | } | 747 | } |
752 | 748 | ||
753 | /* in the NTP reference this is called "hardclock()" */ | 749 | /* in the NTP reference this is called "hardclock()" */ |
754 | static void update_wall_time_one_tick(void) | 750 | static void update_ntp_one_tick(void) |
755 | { | 751 | { |
756 | long time_adjust_step, delta_nsec; | 752 | long time_adjust_step; |
757 | 753 | ||
758 | time_adjust_step = adjtime_adjustment(); | 754 | time_adjust_step = adjtime_adjustment(); |
759 | if (time_adjust_step) | 755 | if (time_adjust_step) |
760 | /* Reduce by this step the amount of time left */ | 756 | /* Reduce by this step the amount of time left */ |
761 | time_adjust -= time_adjust_step; | 757 | time_adjust -= time_adjust_step; |
762 | delta_nsec = tick_nsec + time_adjust_step * 1000; | ||
763 | /* | ||
764 | * Advance the phase, once it gets to one microsecond, then | ||
765 | * advance the tick more. | ||
766 | */ | ||
767 | time_phase += time_adj; | ||
768 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { | ||
769 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); | ||
770 | time_phase -= ltemp << (SHIFT_SCALE - 10); | ||
771 | delta_nsec += ltemp; | ||
772 | } | ||
773 | xtime.tv_nsec += delta_nsec; | ||
774 | time_interpolator_update(delta_nsec); | ||
775 | 758 | ||
776 | /* Changes by adjtime() do not take effect till next tick. */ | 759 | /* Changes by adjtime() do not take effect till next tick. */ |
777 | if (time_next_adjust != 0) { | 760 | if (time_next_adjust != 0) { |
@@ -784,36 +767,404 @@ static void update_wall_time_one_tick(void) | |||
784 | * Return how long ticks are at the moment, that is, how much time | 767 | * Return how long ticks are at the moment, that is, how much time |
785 | * update_wall_time_one_tick will add to xtime next time we call it | 768 | * update_wall_time_one_tick will add to xtime next time we call it |
786 | * (assuming no calls to do_adjtimex in the meantime). | 769 | * (assuming no calls to do_adjtimex in the meantime). |
787 | * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 | 770 | * The return value is in fixed-point nanoseconds shifted by the |
788 | * bits to the right of the binary point. | 771 | * specified number of bits to the right of the binary point. |
789 | * This function has no side-effects. | 772 | * This function has no side-effects. |
790 | */ | 773 | */ |
791 | u64 current_tick_length(void) | 774 | u64 current_tick_length(void) |
792 | { | 775 | { |
793 | long delta_nsec; | 776 | long delta_nsec; |
777 | u64 ret; | ||
794 | 778 | ||
779 | /* calculate the finest interval NTP will allow. | ||
780 | * ie: nanosecond value shifted by (SHIFT_SCALE - 10) | ||
781 | */ | ||
795 | delta_nsec = tick_nsec + adjtime_adjustment() * 1000; | 782 | delta_nsec = tick_nsec + adjtime_adjustment() * 1000; |
796 | return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; | 783 | ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; |
784 | ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); | ||
785 | |||
786 | return ret; | ||
797 | } | 787 | } |
798 | 788 | ||
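Put differently, the reworked current_tick_length() returns the tick length with TICK_LENGTH_SHIFT fractional bits, folding in the NTP adjustment that time_adj carries at SHIFT_SCALE - 10 bits. A small user-space sketch of that arithmetic (TICK_LENGTH_SHIFT = 32 and SHIFT_SCALE = 22 are assumed here; the numbers are purely illustrative):

/* Sketch of the fixed-point tick length computation above.
 * TICK_LENGTH_SHIFT and SHIFT_SCALE values are assumptions for this demo. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned TICK_LENGTH_SHIFT = 32;	/* assumed */
	const unsigned SHIFT_SCALE = 22;	/* assumed NTP constant */

	long tick_nsec = 1000000;		/* 1 ms tick (HZ=1000) */
	long adjtime_step = 0;			/* no adjtime() in progress */
	long time_adj = 500;			/* small NTP correction, already
						   scaled by 2^(SHIFT_SCALE-10) */

	long delta_nsec = tick_nsec + adjtime_step * 1000;
	uint64_t ret = (uint64_t)delta_nsec << TICK_LENGTH_SHIFT;
	ret += (int64_t)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));

	printf("tick length = %llu (ns << %u), i.e. %.9f ns\n",
	       (unsigned long long)ret, TICK_LENGTH_SHIFT,
	       (double)ret / (double)(1ULL << TICK_LENGTH_SHIFT));
	return 0;
}
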
799 | /* | 789 | /* XXX - all of this timekeeping code should be later moved to time.c */ |
800 | * Using a loop looks inefficient, but "ticks" is | 790 | #include <linux/clocksource.h> |
801 | * usually just one (we shouldn't be losing ticks, | 791 | static struct clocksource *clock; /* pointer to current clocksource */ |
802 | * we're doing this this way mainly for interrupt | 792 | |
803 | * latency reasons, not because we think we'll | 793 | #ifdef CONFIG_GENERIC_TIME |
804 | * have lots of lost timer ticks | 794 | /** |
795 | * __get_nsec_offset - Returns nanoseconds since the last update_wall_time() | ||
796 | * | ||
797 | * Private function; must hold xtime_lock when | ||
798 | * called. Returns the number of nanoseconds since the | ||
799 | * last call to update_wall_time() (adjusted by NTP scaling) | ||
800 | */ | ||
801 | static inline s64 __get_nsec_offset(void) | ||
802 | { | ||
803 | cycle_t cycle_now, cycle_delta; | ||
804 | s64 ns_offset; | ||
805 | |||
806 | /* read clocksource: */ | ||
807 | cycle_now = clocksource_read(clock); | ||
808 | |||
809 | /* calculate the delta since the last update_wall_time: */ | ||
810 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | ||
811 | |||
812 | /* convert to nanoseconds: */ | ||
813 | ns_offset = cyc2ns(clock, cycle_delta); | ||
814 | |||
815 | return ns_offset; | ||
816 | } | ||
817 | |||
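The mask applied to the cycle delta above is what makes counter wrap-around harmless: as long as at most one wrap happens between updates, (now - last) & mask recovers the elapsed cycles. A user-space sketch, with cyc2ns() modeled as (cycles * mult) >> shift and a hypothetical 24-bit, 1 MHz counter:

/* Sketch of the masked cycle delta + cyc2ns conversion used above.
 * The 24-bit mask and the mult/shift values are illustrative only. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t mask = 0xffffff;		/* 24-bit free-running counter */
	const uint32_t shift = 20;
	const uint32_t mult = 1000u << 20;	/* 1000 ns per cycle: a 1 MHz counter */

	uint64_t cycle_last = 0xfffff0;		/* counter just before wrapping */
	uint64_t cycle_now  = 0x000010;		/* counter after the wrap */

	uint64_t cycle_delta = (cycle_now - cycle_last) & mask;	/* 32 cycles */
	uint64_t ns = (cycle_delta * mult) >> shift;

	printf("delta = %llu cycles -> %llu ns\n",
	       (unsigned long long)cycle_delta, (unsigned long long)ns);
	return 0;
}
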
818 | /** | ||
819 | * __get_realtime_clock_ts - Returns the time of day in a timespec | ||
820 | * @ts: pointer to the timespec to be set | ||
821 | * | ||
822 | * Returns the time of day in a timespec. Used by | ||
823 | * do_gettimeofday() and get_realtime_clock_ts(). | ||
805 | */ | 824 | */ |
806 | static void update_wall_time(unsigned long ticks) | 825 | static inline void __get_realtime_clock_ts(struct timespec *ts) |
807 | { | 826 | { |
827 | unsigned long seq; | ||
828 | s64 nsecs; | ||
829 | |||
808 | do { | 830 | do { |
809 | ticks--; | 831 | seq = read_seqbegin(&xtime_lock); |
810 | update_wall_time_one_tick(); | 832 | |
811 | if (xtime.tv_nsec >= 1000000000) { | 833 | *ts = xtime; |
812 | xtime.tv_nsec -= 1000000000; | 834 | nsecs = __get_nsec_offset(); |
835 | |||
836 | } while (read_seqretry(&xtime_lock, seq)); | ||
837 | |||
838 | timespec_add_ns(ts, nsecs); | ||
839 | } | ||
840 | |||
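The read_seqbegin()/read_seqretry() pair used above is the usual lockless-reader idiom: sample the sequence count, copy the data, and retry if a writer ran in between. A minimal sketch of the same pattern, assuming only the <linux/seqlock.h> API; the example_lock/shared_ns names are hypothetical:

/* Sketch of the seqlock reader/writer idiom used by the timekeeping code. */
#include <linux/seqlock.h>
#include <linux/types.h>

static DEFINE_SEQLOCK(example_lock);
static u64 shared_ns;

static u64 example_read_ns(void)
{
	unsigned long seq;
	u64 val;

	do {
		seq = read_seqbegin(&example_lock);
		val = shared_ns;			/* copy out a snapshot */
	} while (read_seqretry(&example_lock, seq));	/* retry if a writer intervened */

	return val;
}

static void example_write_ns(u64 val)
{
	write_seqlock(&example_lock);			/* writers still exclude each other */
	shared_ns = val;
	write_sequnlock(&example_lock);
}
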
841 | /** | ||
842 | * getnstimeofday - Returns the time of day in a timespec | ||
843 | * @ts: pointer to the timespec to be set | ||
844 | * | ||
845 | * Returns the time of day in a timespec. | ||
846 | */ | ||
847 | void getnstimeofday(struct timespec *ts) | ||
848 | { | ||
849 | __get_realtime_clock_ts(ts); | ||
850 | } | ||
851 | |||
852 | EXPORT_SYMBOL(getnstimeofday); | ||
853 | |||
854 | /** | ||
855 | * do_gettimeofday - Returns the time of day in a timeval | ||
856 | * @tv: pointer to the timeval to be set | ||
857 | * | ||
858 | * NOTE: Users should be converted to using getnstimeofday() | ||
859 | */ | ||
860 | void do_gettimeofday(struct timeval *tv) | ||
861 | { | ||
862 | struct timespec now; | ||
863 | |||
864 | __get_realtime_clock_ts(&now); | ||
865 | tv->tv_sec = now.tv_sec; | ||
866 | tv->tv_usec = now.tv_nsec/1000; | ||
867 | } | ||
868 | |||
869 | EXPORT_SYMBOL(do_gettimeofday); | ||
870 | /** | ||
871 | * do_settimeofday - Sets the time of day | ||
872 | * @tv: pointer to the timespec variable containing the new time | ||
873 | * | ||
874 | * Sets the time of day to the new time, updates NTP and notifies hrtimers. | ||
875 | */ | ||
876 | int do_settimeofday(struct timespec *tv) | ||
877 | { | ||
878 | unsigned long flags; | ||
879 | time_t wtm_sec, sec = tv->tv_sec; | ||
880 | long wtm_nsec, nsec = tv->tv_nsec; | ||
881 | |||
882 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | ||
883 | return -EINVAL; | ||
884 | |||
885 | write_seqlock_irqsave(&xtime_lock, flags); | ||
886 | |||
887 | nsec -= __get_nsec_offset(); | ||
888 | |||
889 | wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); | ||
890 | wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); | ||
891 | |||
892 | set_normalized_timespec(&xtime, sec, nsec); | ||
893 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | ||
894 | |||
895 | clock->error = 0; | ||
896 | ntp_clear(); | ||
897 | |||
898 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
899 | |||
900 | /* signal hrtimers about time change */ | ||
901 | clock_was_set(); | ||
902 | |||
903 | return 0; | ||
904 | } | ||
905 | |||
906 | EXPORT_SYMBOL(do_settimeofday); | ||
907 | |||
908 | /** | ||
909 | * change_clocksource - Swaps clocksources if a new one is available | ||
910 | * | ||
911 | * Accumulates current time interval and initializes new clocksource | ||
912 | */ | ||
913 | static int change_clocksource(void) | ||
914 | { | ||
915 | struct clocksource *new; | ||
916 | cycle_t now; | ||
917 | u64 nsec; | ||
918 | new = clocksource_get_next(); | ||
919 | if (clock != new) { | ||
920 | now = clocksource_read(new); | ||
921 | nsec = __get_nsec_offset(); | ||
922 | timespec_add_ns(&xtime, nsec); | ||
923 | |||
924 | clock = new; | ||
925 | clock->cycle_last = now; | ||
926 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | ||
927 | clock->name); | ||
928 | return 1; | ||
929 | } else if (clock->update_callback) { | ||
930 | return clock->update_callback(); | ||
931 | } | ||
932 | return 0; | ||
933 | } | ||
934 | #else | ||
935 | #define change_clocksource() (0) | ||
936 | #endif | ||
937 | |||
938 | /** | ||
939 | * timekeeping_is_continuous - check to see if timekeeping is free running | ||
940 | */ | ||
941 | int timekeeping_is_continuous(void) | ||
942 | { | ||
943 | unsigned long seq; | ||
944 | int ret; | ||
945 | |||
946 | do { | ||
947 | seq = read_seqbegin(&xtime_lock); | ||
948 | |||
949 | ret = clock->is_continuous; | ||
950 | |||
951 | } while (read_seqretry(&xtime_lock, seq)); | ||
952 | |||
953 | return ret; | ||
954 | } | ||
955 | |||
956 | /* | ||
957 | * timekeeping_init - Initializes the clocksource and common timekeeping values | ||
958 | */ | ||
959 | void __init timekeeping_init(void) | ||
960 | { | ||
961 | unsigned long flags; | ||
962 | |||
963 | write_seqlock_irqsave(&xtime_lock, flags); | ||
964 | clock = clocksource_get_next(); | ||
965 | clocksource_calculate_interval(clock, tick_nsec); | ||
966 | clock->cycle_last = clocksource_read(clock); | ||
967 | ntp_clear(); | ||
968 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
969 | } | ||
970 | |||
971 | |||
972 | static int timekeeping_suspended; | ||
973 | /* | ||
974 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | ||
975 | * @dev: unused | ||
976 | * | ||
977 | * This is for the generic clocksource timekeeping. | ||
978 | * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are | ||
979 | * still managed by arch specific suspend/resume code. | ||
980 | */ | ||
981 | static int timekeeping_resume(struct sys_device *dev) | ||
982 | { | ||
983 | unsigned long flags; | ||
984 | |||
985 | write_seqlock_irqsave(&xtime_lock, flags); | ||
986 | /* restart the last cycle value */ | ||
987 | clock->cycle_last = clocksource_read(clock); | ||
988 | clock->error = 0; | ||
989 | timekeeping_suspended = 0; | ||
990 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
991 | return 0; | ||
992 | } | ||
993 | |||
994 | static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | ||
995 | { | ||
996 | unsigned long flags; | ||
997 | |||
998 | write_seqlock_irqsave(&xtime_lock, flags); | ||
999 | timekeeping_suspended = 1; | ||
1000 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
1001 | return 0; | ||
1002 | } | ||
1003 | |||
1004 | /* sysfs resume/suspend bits for timekeeping */ | ||
1005 | static struct sysdev_class timekeeping_sysclass = { | ||
1006 | .resume = timekeeping_resume, | ||
1007 | .suspend = timekeeping_suspend, | ||
1008 | set_kset_name("timekeeping"), | ||
1009 | }; | ||
1010 | |||
1011 | static struct sys_device device_timer = { | ||
1012 | .id = 0, | ||
1013 | .cls = &timekeeping_sysclass, | ||
1014 | }; | ||
1015 | |||
1016 | static int __init timekeeping_init_device(void) | ||
1017 | { | ||
1018 | int error = sysdev_class_register(&timekeeping_sysclass); | ||
1019 | if (!error) | ||
1020 | error = sysdev_register(&device_timer); | ||
1021 | return error; | ||
1022 | } | ||
1023 | |||
1024 | device_initcall(timekeeping_init_device); | ||
1025 | |||
1026 | /* | ||
1027 | * If the error is already larger, we look ahead even further | ||
1028 | * to compensate for late or lost adjustments. | ||
1029 | */ | ||
1030 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) | ||
1031 | { | ||
1032 | s64 tick_error, i; | ||
1033 | u32 look_ahead, adj; | ||
1034 | s32 error2, mult; | ||
1035 | |||
1036 | /* | ||
1037 | * Use the current error value to determine how much to look ahead. | ||
1038 | * The larger the error the slower we adjust for it to avoid problems | ||
1039 | * with losing too many ticks, otherwise we would overadjust and | ||
1040 | * produce an even larger error. The smaller the adjustment the | ||
1041 | * faster we try to adjust for it, as lost ticks can do less harm | ||
1042 | * here. This is tuned so that an error of about 1 msec is adjusted | ||
1043 | * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). | ||
1044 | */ | ||
1045 | error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); | ||
1046 | error2 = abs(error2); | ||
1047 | for (look_ahead = 0; error2 > 0; look_ahead++) | ||
1048 | error2 >>= 2; | ||
1049 | |||
1050 | /* | ||
1051 | * Now calculate the error in (1 << look_ahead) ticks, but first | ||
1052 | * remove the single look ahead already included in the error. | ||
1053 | */ | ||
1054 | tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
1055 | tick_error -= clock->xtime_interval >> 1; | ||
1056 | error = ((error - tick_error) >> look_ahead) + tick_error; | ||
1057 | |||
1058 | /* Finally calculate the adjustment shift value. */ | ||
1059 | i = *interval; | ||
1060 | mult = 1; | ||
1061 | if (error < 0) { | ||
1062 | error = -error; | ||
1063 | *interval = -*interval; | ||
1064 | *offset = -*offset; | ||
1065 | mult = -1; | ||
1066 | } | ||
1067 | for (adj = 0; error > i; adj++) | ||
1068 | error >>= 1; | ||
1069 | |||
1070 | *interval <<= adj; | ||
1071 | *offset <<= adj; | ||
1072 | return mult << adj; | ||
1073 | } | ||
1074 | |||
1075 | /* | ||
1076 | * Adjust the multiplier to reduce the error value, | ||
1077 | * this is optimized for the most common adjustments of -1,0,1, | ||
1078 | * for other values we can do a bit more work. | ||
1079 | */ | ||
1080 | static void clocksource_adjust(struct clocksource *clock, s64 offset) | ||
1081 | { | ||
1082 | s64 error, interval = clock->cycle_interval; | ||
1083 | int adj; | ||
1084 | |||
1085 | error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); | ||
1086 | if (error > interval) { | ||
1087 | error >>= 2; | ||
1088 | if (likely(error <= interval)) | ||
1089 | adj = 1; | ||
1090 | else | ||
1091 | adj = clocksource_bigadjust(error, &interval, &offset); | ||
1092 | } else if (error < -interval) { | ||
1093 | error >>= 2; | ||
1094 | if (likely(error >= -interval)) { | ||
1095 | adj = -1; | ||
1096 | interval = -interval; | ||
1097 | offset = -offset; | ||
1098 | } else | ||
1099 | adj = clocksource_bigadjust(error, &interval, &offset); | ||
1100 | } else | ||
1101 | return; | ||
1102 | |||
1103 | clock->mult += adj; | ||
1104 | clock->xtime_interval += interval; | ||
1105 | clock->xtime_nsec -= offset; | ||
1106 | clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); | ||
1107 | } | ||
1108 | |||
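The granularity this feedback loop works in follows from ns = (cycles * mult) >> shift: one step of mult changes each accumulated interval by cycle_interval / 2^shift nanoseconds, which can be well below a nanosecond. A quick user-space illustration with assumed numbers (3 GHz counter, HZ=1000, shift 22):

/* How much one step of clock->mult is worth for the adjustment loop above:
 * ns = (cycles * mult) >> shift, so d(ns)/d(mult) = cycles / 2^shift.
 * All values are illustrative. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned shift = 22;
	const uint64_t cycle_interval = 3000000;   /* cycles per tick at 3 GHz, HZ=1000 */

	double ns_per_mult_step = (double)cycle_interval / (double)(1u << shift);

	printf("mult += 1 changes each accumulated interval by %.3f ns\n",
	       ns_per_mult_step);
	/* This is why clock->error keeps (TICK_LENGTH_SHIFT - shift) extra
	 * fractional bits: corrections can be far smaller than 1 ns/interval. */
	return 0;
}
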
1109 | /* | ||
1110 | * update_wall_time - Uses the current clocksource to increment the wall time | ||
1111 | * | ||
1112 | * Called from the timer interrupt; the caller must hold xtime_lock for writing. | ||
1113 | */ | ||
1114 | static void update_wall_time(void) | ||
1115 | { | ||
1116 | cycle_t offset; | ||
1117 | |||
1118 | /* Make sure we're fully resumed: */ | ||
1119 | if (unlikely(timekeeping_suspended)) | ||
1120 | return; | ||
1121 | |||
1122 | #ifdef CONFIG_GENERIC_TIME | ||
1123 | offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; | ||
1124 | #else | ||
1125 | offset = clock->cycle_interval; | ||
1126 | #endif | ||
1127 | clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; | ||
1128 | |||
1129 | /* normally this loop will run just once, however in the | ||
1130 | * case of lost or late ticks, it will accumulate correctly. | ||
1131 | */ | ||
1132 | while (offset >= clock->cycle_interval) { | ||
1133 | /* accumulate one interval */ | ||
1134 | clock->xtime_nsec += clock->xtime_interval; | ||
1135 | clock->cycle_last += clock->cycle_interval; | ||
1136 | offset -= clock->cycle_interval; | ||
1137 | |||
1138 | if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { | ||
1139 | clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; | ||
813 | xtime.tv_sec++; | 1140 | xtime.tv_sec++; |
814 | second_overflow(); | 1141 | second_overflow(); |
815 | } | 1142 | } |
816 | } while (ticks); | 1143 | |
1144 | /* interpolator bits */ | ||
1145 | time_interpolator_update(clock->xtime_interval | ||
1146 | >> clock->shift); | ||
1147 | /* increment the NTP state machine */ | ||
1148 | update_ntp_one_tick(); | ||
1149 | |||
1150 | /* accumulate error between NTP and clock interval */ | ||
1151 | clock->error += current_tick_length(); | ||
1152 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); | ||
1153 | } | ||
1154 | |||
1155 | /* correct the clock when NTP error is too big */ | ||
1156 | clocksource_adjust(clock, offset); | ||
1157 | |||
1158 | /* store full nanoseconds into xtime */ | ||
1159 | xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; | ||
1160 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; | ||
1161 | |||
1162 | /* check to see if there is a new clocksource to use */ | ||
1163 | if (change_clocksource()) { | ||
1164 | clock->error = 0; | ||
1165 | clock->xtime_nsec = 0; | ||
1166 | clocksource_calculate_interval(clock, tick_nsec); | ||
1167 | } | ||
817 | } | 1168 | } |
818 | 1169 | ||
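The "store full nanoseconds into xtime" step at the end of update_wall_time() is a fixed-point split: whole nanoseconds go to xtime, the shift-scaled remainder stays in xtime_nsec for the next round. In isolation (shift = 8 chosen only for readability):

/* Sketch of the xtime_nsec fixed-point split done at the end of
 * update_wall_time(): whole ns go to xtime, the remainder stays scaled. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned shift = 8;
	int64_t xtime_nsec = (1234567LL << shift) + 37;	/* ns scaled by 2^shift, plus a fraction */

	int64_t whole_ns = xtime_nsec >> shift;		/* -> xtime.tv_nsec */
	xtime_nsec -= whole_ns << shift;		/* fractional part carried forward */

	printf("whole = %lld ns, carried fraction = %lld/256 ns\n",
	       (long long)whole_ns, (long long)xtime_nsec);
	return 0;
}
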
819 | /* | 1170 | /* |
@@ -884,7 +1235,7 @@ unsigned long wall_jiffies = INITIAL_JIFFIES; | |||
884 | * playing with xtime and avenrun. | 1235 | * playing with xtime and avenrun. |
885 | */ | 1236 | */ |
886 | #ifndef ARCH_HAVE_XTIME_LOCK | 1237 | #ifndef ARCH_HAVE_XTIME_LOCK |
887 | seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; | 1238 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); |
888 | 1239 | ||
889 | EXPORT_SYMBOL(xtime_lock); | 1240 | EXPORT_SYMBOL(xtime_lock); |
890 | #endif | 1241 | #endif |
@@ -919,10 +1270,8 @@ static inline void update_times(void) | |||
919 | unsigned long ticks; | 1270 | unsigned long ticks; |
920 | 1271 | ||
921 | ticks = jiffies - wall_jiffies; | 1272 | ticks = jiffies - wall_jiffies; |
922 | if (ticks) { | 1273 | wall_jiffies += ticks; |
923 | wall_jiffies += ticks; | 1274 | update_wall_time(); |
924 | update_wall_time(ticks); | ||
925 | } | ||
926 | calc_load(ticks); | 1275 | calc_load(ticks); |
927 | } | 1276 | } |
928 | 1277 | ||
@@ -1046,7 +1395,7 @@ asmlinkage long sys_getegid(void) | |||
1046 | 1395 | ||
1047 | static void process_timeout(unsigned long __data) | 1396 | static void process_timeout(unsigned long __data) |
1048 | { | 1397 | { |
1049 | wake_up_process((task_t *)__data); | 1398 | wake_up_process((struct task_struct *)__data); |
1050 | } | 1399 | } |
1051 | 1400 | ||
1052 | /** | 1401 | /** |
@@ -1237,6 +1586,13 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) | |||
1237 | return 0; | 1586 | return 0; |
1238 | } | 1587 | } |
1239 | 1588 | ||
1589 | /* | ||
1590 | * lockdep: we want to track each per-CPU base as a separate lock-class, | ||
1591 | * but timer-bases are kmalloc()-ed, so we need to attach separate | ||
1592 | * keys to them: | ||
1593 | */ | ||
1594 | static struct lock_class_key base_lock_keys[NR_CPUS]; | ||
1595 | |||
1240 | static int __devinit init_timers_cpu(int cpu) | 1596 | static int __devinit init_timers_cpu(int cpu) |
1241 | { | 1597 | { |
1242 | int j; | 1598 | int j; |
@@ -1272,6 +1628,8 @@ static int __devinit init_timers_cpu(int cpu) | |||
1272 | } | 1628 | } |
1273 | 1629 | ||
1274 | spin_lock_init(&base->lock); | 1630 | spin_lock_init(&base->lock); |
1631 | lockdep_set_class(&base->lock, base_lock_keys + cpu); | ||
1632 | |||
1275 | for (j = 0; j < TVN_SIZE; j++) { | 1633 | for (j = 0; j < TVN_SIZE; j++) { |
1276 | INIT_LIST_HEAD(base->tv5.vec + j); | 1634 | INIT_LIST_HEAD(base->tv5.vec + j); |
1277 | INIT_LIST_HEAD(base->tv4.vec + j); | 1635 | INIT_LIST_HEAD(base->tv4.vec + j); |
@@ -1330,7 +1688,7 @@ static void __devinit migrate_timers(int cpu) | |||
1330 | } | 1688 | } |
1331 | #endif /* CONFIG_HOTPLUG_CPU */ | 1689 | #endif /* CONFIG_HOTPLUG_CPU */ |
1332 | 1690 | ||
1333 | static int timer_cpu_notify(struct notifier_block *self, | 1691 | static int __devinit timer_cpu_notify(struct notifier_block *self, |
1334 | unsigned long action, void *hcpu) | 1692 | unsigned long action, void *hcpu) |
1335 | { | 1693 | { |
1336 | long cpu = (long)hcpu; | 1694 | long cpu = (long)hcpu; |
@@ -1350,7 +1708,7 @@ static int timer_cpu_notify(struct notifier_block *self, | |||
1350 | return NOTIFY_OK; | 1708 | return NOTIFY_OK; |
1351 | } | 1709 | } |
1352 | 1710 | ||
1353 | static struct notifier_block timers_nb = { | 1711 | static struct notifier_block __devinitdata timers_nb = { |
1354 | .notifier_call = timer_cpu_notify, | 1712 | .notifier_call = timer_cpu_notify, |
1355 | }; | 1713 | }; |
1356 | 1714 | ||
diff --git a/kernel/unwind.c b/kernel/unwind.c new file mode 100644 index 000000000000..f69c804c8e62 --- /dev/null +++ b/kernel/unwind.c | |||
@@ -0,0 +1,918 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2002-2006 Novell, Inc. | ||
3 | * Jan Beulich <jbeulich@novell.com> | ||
4 | * This code is released under version 2 of the GNU GPL. | ||
5 | * | ||
6 | * A simple API for unwinding kernel stacks. This is used for | ||
7 | * debugging and error reporting purposes. The kernel doesn't need | ||
8 | * full-blown stack unwinding with all the bells and whistles, so there | ||
9 | * is not much point in implementing the full Dwarf2 unwind API. | ||
10 | */ | ||
11 | |||
12 | #include <linux/unwind.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/delay.h> | ||
15 | #include <linux/stop_machine.h> | ||
16 | #include <asm/sections.h> | ||
17 | #include <asm/uaccess.h> | ||
18 | #include <asm/unaligned.h> | ||
19 | |||
20 | extern char __start_unwind[], __end_unwind[]; | ||
21 | |||
22 | #define MAX_STACK_DEPTH 8 | ||
23 | |||
24 | #define EXTRA_INFO(f) { \ | ||
25 | BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \ | ||
26 | % FIELD_SIZEOF(struct unwind_frame_info, f)) \ | ||
27 | + offsetof(struct unwind_frame_info, f) \ | ||
28 | / FIELD_SIZEOF(struct unwind_frame_info, f), \ | ||
29 | FIELD_SIZEOF(struct unwind_frame_info, f) \ | ||
30 | } | ||
31 | #define PTREGS_INFO(f) EXTRA_INFO(regs.f) | ||
32 | |||
33 | static const struct { | ||
34 | unsigned offs:BITS_PER_LONG / 2; | ||
35 | unsigned width:BITS_PER_LONG / 2; | ||
36 | } reg_info[] = { | ||
37 | UNW_REGISTER_INFO | ||
38 | }; | ||
39 | |||
40 | #undef PTREGS_INFO | ||
41 | #undef EXTRA_INFO | ||
42 | |||
43 | #ifndef REG_INVALID | ||
44 | #define REG_INVALID(r) (reg_info[r].width == 0) | ||
45 | #endif | ||
46 | |||
47 | #define DW_CFA_nop 0x00 | ||
48 | #define DW_CFA_set_loc 0x01 | ||
49 | #define DW_CFA_advance_loc1 0x02 | ||
50 | #define DW_CFA_advance_loc2 0x03 | ||
51 | #define DW_CFA_advance_loc4 0x04 | ||
52 | #define DW_CFA_offset_extended 0x05 | ||
53 | #define DW_CFA_restore_extended 0x06 | ||
54 | #define DW_CFA_undefined 0x07 | ||
55 | #define DW_CFA_same_value 0x08 | ||
56 | #define DW_CFA_register 0x09 | ||
57 | #define DW_CFA_remember_state 0x0a | ||
58 | #define DW_CFA_restore_state 0x0b | ||
59 | #define DW_CFA_def_cfa 0x0c | ||
60 | #define DW_CFA_def_cfa_register 0x0d | ||
61 | #define DW_CFA_def_cfa_offset 0x0e | ||
62 | #define DW_CFA_def_cfa_expression 0x0f | ||
63 | #define DW_CFA_expression 0x10 | ||
64 | #define DW_CFA_offset_extended_sf 0x11 | ||
65 | #define DW_CFA_def_cfa_sf 0x12 | ||
66 | #define DW_CFA_def_cfa_offset_sf 0x13 | ||
67 | #define DW_CFA_val_offset 0x14 | ||
68 | #define DW_CFA_val_offset_sf 0x15 | ||
69 | #define DW_CFA_val_expression 0x16 | ||
70 | #define DW_CFA_lo_user 0x1c | ||
71 | #define DW_CFA_GNU_window_save 0x2d | ||
72 | #define DW_CFA_GNU_args_size 0x2e | ||
73 | #define DW_CFA_GNU_negative_offset_extended 0x2f | ||
74 | #define DW_CFA_hi_user 0x3f | ||
75 | |||
76 | #define DW_EH_PE_FORM 0x07 | ||
77 | #define DW_EH_PE_native 0x00 | ||
78 | #define DW_EH_PE_leb128 0x01 | ||
79 | #define DW_EH_PE_data2 0x02 | ||
80 | #define DW_EH_PE_data4 0x03 | ||
81 | #define DW_EH_PE_data8 0x04 | ||
82 | #define DW_EH_PE_signed 0x08 | ||
83 | #define DW_EH_PE_ADJUST 0x70 | ||
84 | #define DW_EH_PE_abs 0x00 | ||
85 | #define DW_EH_PE_pcrel 0x10 | ||
86 | #define DW_EH_PE_textrel 0x20 | ||
87 | #define DW_EH_PE_datarel 0x30 | ||
88 | #define DW_EH_PE_funcrel 0x40 | ||
89 | #define DW_EH_PE_aligned 0x50 | ||
90 | #define DW_EH_PE_indirect 0x80 | ||
91 | #define DW_EH_PE_omit 0xff | ||
92 | |||
93 | typedef unsigned long uleb128_t; | ||
94 | typedef signed long sleb128_t; | ||
95 | |||
96 | static struct unwind_table { | ||
97 | struct { | ||
98 | unsigned long pc; | ||
99 | unsigned long range; | ||
100 | } core, init; | ||
101 | const void *address; | ||
102 | unsigned long size; | ||
103 | struct unwind_table *link; | ||
104 | const char *name; | ||
105 | } root_table, *last_table; | ||
106 | |||
107 | struct unwind_item { | ||
108 | enum item_location { | ||
109 | Nowhere, | ||
110 | Memory, | ||
111 | Register, | ||
112 | Value | ||
113 | } where; | ||
114 | uleb128_t value; | ||
115 | }; | ||
116 | |||
117 | struct unwind_state { | ||
118 | uleb128_t loc, org; | ||
119 | const u8 *cieStart, *cieEnd; | ||
120 | uleb128_t codeAlign; | ||
121 | sleb128_t dataAlign; | ||
122 | struct cfa { | ||
123 | uleb128_t reg, offs; | ||
124 | } cfa; | ||
125 | struct unwind_item regs[ARRAY_SIZE(reg_info)]; | ||
126 | unsigned stackDepth:8; | ||
127 | unsigned version:8; | ||
128 | const u8 *label; | ||
129 | const u8 *stack[MAX_STACK_DEPTH]; | ||
130 | }; | ||
131 | |||
132 | static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; | ||
133 | |||
134 | static struct unwind_table *find_table(unsigned long pc) | ||
135 | { | ||
136 | struct unwind_table *table; | ||
137 | |||
138 | for (table = &root_table; table; table = table->link) | ||
139 | if ((pc >= table->core.pc | ||
140 | && pc < table->core.pc + table->core.range) | ||
141 | || (pc >= table->init.pc | ||
142 | && pc < table->init.pc + table->init.range)) | ||
143 | break; | ||
144 | |||
145 | return table; | ||
146 | } | ||
147 | |||
148 | static void init_unwind_table(struct unwind_table *table, | ||
149 | const char *name, | ||
150 | const void *core_start, | ||
151 | unsigned long core_size, | ||
152 | const void *init_start, | ||
153 | unsigned long init_size, | ||
154 | const void *table_start, | ||
155 | unsigned long table_size) | ||
156 | { | ||
157 | table->core.pc = (unsigned long)core_start; | ||
158 | table->core.range = core_size; | ||
159 | table->init.pc = (unsigned long)init_start; | ||
160 | table->init.range = init_size; | ||
161 | table->address = table_start; | ||
162 | table->size = table_size; | ||
163 | table->link = NULL; | ||
164 | table->name = name; | ||
165 | } | ||
166 | |||
167 | void __init unwind_init(void) | ||
168 | { | ||
169 | init_unwind_table(&root_table, "kernel", | ||
170 | _text, _end - _text, | ||
171 | NULL, 0, | ||
172 | __start_unwind, __end_unwind - __start_unwind); | ||
173 | } | ||
174 | |||
175 | #ifdef CONFIG_MODULES | ||
176 | |||
177 | /* Must be called with module_mutex held. */ | ||
178 | void *unwind_add_table(struct module *module, | ||
179 | const void *table_start, | ||
180 | unsigned long table_size) | ||
181 | { | ||
182 | struct unwind_table *table; | ||
183 | |||
184 | if (table_size <= 0) | ||
185 | return NULL; | ||
186 | |||
187 | table = kmalloc(sizeof(*table), GFP_KERNEL); | ||
188 | if (!table) | ||
189 | return NULL; | ||
190 | |||
191 | init_unwind_table(table, module->name, | ||
192 | module->module_core, module->core_size, | ||
193 | module->module_init, module->init_size, | ||
194 | table_start, table_size); | ||
195 | |||
196 | if (last_table) | ||
197 | last_table->link = table; | ||
198 | else | ||
199 | root_table.link = table; | ||
200 | last_table = table; | ||
201 | |||
202 | return table; | ||
203 | } | ||
204 | |||
205 | struct unlink_table_info | ||
206 | { | ||
207 | struct unwind_table *table; | ||
208 | int init_only; | ||
209 | }; | ||
210 | |||
211 | static int unlink_table(void *arg) | ||
212 | { | ||
213 | struct unlink_table_info *info = arg; | ||
214 | struct unwind_table *table = info->table, *prev; | ||
215 | |||
216 | for (prev = &root_table; prev->link && prev->link != table; prev = prev->link) | ||
217 | ; | ||
218 | |||
219 | if (prev->link) { | ||
220 | if (info->init_only) { | ||
221 | table->init.pc = 0; | ||
222 | table->init.range = 0; | ||
223 | info->table = NULL; | ||
224 | } else { | ||
225 | prev->link = table->link; | ||
226 | if (!prev->link) | ||
227 | last_table = prev; | ||
228 | } | ||
229 | } else | ||
230 | info->table = NULL; | ||
231 | |||
232 | return 0; | ||
233 | } | ||
234 | |||
235 | /* Must be called with module_mutex held. */ | ||
236 | void unwind_remove_table(void *handle, int init_only) | ||
237 | { | ||
238 | struct unwind_table *table = handle; | ||
239 | struct unlink_table_info info; | ||
240 | |||
241 | if (!table || table == &root_table) | ||
242 | return; | ||
243 | |||
244 | if (init_only && table == last_table) { | ||
245 | table->init.pc = 0; | ||
246 | table->init.range = 0; | ||
247 | return; | ||
248 | } | ||
249 | |||
250 | info.table = table; | ||
251 | info.init_only = init_only; | ||
252 | stop_machine_run(unlink_table, &info, NR_CPUS); | ||
253 | |||
254 | if (info.table) | ||
255 | kfree(table); | ||
256 | } | ||
257 | |||
258 | #endif /* CONFIG_MODULES */ | ||
259 | |||
260 | static uleb128_t get_uleb128(const u8 **pcur, const u8 *end) | ||
261 | { | ||
262 | const u8 *cur = *pcur; | ||
263 | uleb128_t value; | ||
264 | unsigned shift; | ||
265 | |||
266 | for (shift = 0, value = 0; cur < end; shift += 7) { | ||
267 | if (shift + 7 > 8 * sizeof(value) | ||
268 | && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { | ||
269 | cur = end + 1; | ||
270 | break; | ||
271 | } | ||
272 | value |= (uleb128_t)(*cur & 0x7f) << shift; | ||
273 | if (!(*cur++ & 0x80)) | ||
274 | break; | ||
275 | } | ||
276 | *pcur = cur; | ||
277 | |||
278 | return value; | ||
279 | } | ||
280 | |||
281 | static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) | ||
282 | { | ||
283 | const u8 *cur = *pcur; | ||
284 | sleb128_t value; | ||
285 | unsigned shift; | ||
286 | |||
287 | for (shift = 0, value = 0; cur < end; shift += 7) { | ||
288 | if (shift + 7 > 8 * sizeof(value) | ||
289 | && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { | ||
290 | cur = end + 1; | ||
291 | break; | ||
292 | } | ||
293 | value |= (sleb128_t)(*cur & 0x7f) << shift; | ||
294 | if (!(*cur & 0x80)) { | ||
295 | value |= -(*cur++ & 0x40) << shift; | ||
296 | break; | ||
297 | } | ||
298 | } | ||
299 | *pcur = cur; | ||
300 | |||
301 | return value; | ||
302 | } | ||
303 | |||
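LEB128 is the variable-length integer encoding DWARF uses throughout the CFI: seven payload bits per byte, least-significant group first, with the top bit flagging continuation (plus sign extension for the signed variant). A stand-alone user-space decode of the DWARF specification's example bytes 0xE5 0x8E 0x26, which should yield 624485:

/* Stand-alone ULEB128 decode, mirroring get_uleb128() above. */
#include <stdio.h>
#include <stdint.h>

static unsigned long uleb128_decode(const uint8_t **p)
{
	unsigned long value = 0;
	unsigned shift = 0;
	uint8_t byte;

	do {
		byte = *(*p)++;
		value |= (unsigned long)(byte & 0x7f) << shift;
		shift += 7;
	} while (byte & 0x80);			/* high bit set: more bytes follow */

	return value;
}

int main(void)
{
	const uint8_t buf[] = { 0xe5, 0x8e, 0x26 };
	const uint8_t *p = buf;

	printf("decoded %lu (expect 624485)\n", uleb128_decode(&p));
	return 0;
}
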
304 | static unsigned long read_pointer(const u8 **pLoc, | ||
305 | const void *end, | ||
306 | signed ptrType) | ||
307 | { | ||
308 | unsigned long value = 0; | ||
309 | union { | ||
310 | const u8 *p8; | ||
311 | const u16 *p16u; | ||
312 | const s16 *p16s; | ||
313 | const u32 *p32u; | ||
314 | const s32 *p32s; | ||
315 | const unsigned long *pul; | ||
316 | } ptr; | ||
317 | |||
318 | if (ptrType < 0 || ptrType == DW_EH_PE_omit) | ||
319 | return 0; | ||
320 | ptr.p8 = *pLoc; | ||
321 | switch(ptrType & DW_EH_PE_FORM) { | ||
322 | case DW_EH_PE_data2: | ||
323 | if (end < (const void *)(ptr.p16u + 1)) | ||
324 | return 0; | ||
325 | if(ptrType & DW_EH_PE_signed) | ||
326 | value = get_unaligned(ptr.p16s++); | ||
327 | else | ||
328 | value = get_unaligned(ptr.p16u++); | ||
329 | break; | ||
330 | case DW_EH_PE_data4: | ||
331 | #ifdef CONFIG_64BIT | ||
332 | if (end < (const void *)(ptr.p32u + 1)) | ||
333 | return 0; | ||
334 | if(ptrType & DW_EH_PE_signed) | ||
335 | value = get_unaligned(ptr.p32s++); | ||
336 | else | ||
337 | value = get_unaligned(ptr.p32u++); | ||
338 | break; | ||
339 | case DW_EH_PE_data8: | ||
340 | BUILD_BUG_ON(sizeof(u64) != sizeof(value)); | ||
341 | #else | ||
342 | BUILD_BUG_ON(sizeof(u32) != sizeof(value)); | ||
343 | #endif | ||
344 | case DW_EH_PE_native: | ||
345 | if (end < (const void *)(ptr.pul + 1)) | ||
346 | return 0; | ||
347 | value = get_unaligned(ptr.pul++); | ||
348 | break; | ||
349 | case DW_EH_PE_leb128: | ||
350 | BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value)); | ||
351 | value = ptrType & DW_EH_PE_signed | ||
352 | ? get_sleb128(&ptr.p8, end) | ||
353 | : get_uleb128(&ptr.p8, end); | ||
354 | if ((const void *)ptr.p8 > end) | ||
355 | return 0; | ||
356 | break; | ||
357 | default: | ||
358 | return 0; | ||
359 | } | ||
360 | switch(ptrType & DW_EH_PE_ADJUST) { | ||
361 | case DW_EH_PE_abs: | ||
362 | break; | ||
363 | case DW_EH_PE_pcrel: | ||
364 | value += (unsigned long)*pLoc; | ||
365 | break; | ||
366 | default: | ||
367 | return 0; | ||
368 | } | ||
369 | if ((ptrType & DW_EH_PE_indirect) | ||
370 | && __get_user(value, (unsigned long *)value)) | ||
371 | return 0; | ||
372 | *pLoc = ptr.p8; | ||
373 | |||
374 | return value; | ||
375 | } | ||
376 | |||
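A common case read_pointer() has to handle is a pc-relative signed 32-bit value (DW_EH_PE_pcrel | DW_EH_PE_data4 | DW_EH_PE_signed), where the stored number is an offset from the field's own address. A small user-space sketch of just that decoding step; the buffer and offset are made up for illustration:

/* Decoding a pc-relative signed 32-bit pointer, as read_pointer() does
 * for the DW_EH_PE_pcrel|DW_EH_PE_data4|DW_EH_PE_signed case. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint8_t buf[16] = { 0 };
	int32_t rel = -4;			/* target lies 4 bytes before the field */

	memcpy(&buf[4], &rel, sizeof(rel));	/* the encoded field sits at buf+4 */

	const uint8_t *loc = &buf[4];
	int32_t value;
	memcpy(&value, loc, sizeof(value));	/* DW_EH_PE_data4 | DW_EH_PE_signed */

	unsigned long target = (unsigned long)loc + value;	/* DW_EH_PE_pcrel */

	printf("target %#lx == start of buf %#lx\n",
	       target, (unsigned long)buf);
	return 0;
}
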
377 | static signed fde_pointer_type(const u32 *cie) | ||
378 | { | ||
379 | const u8 *ptr = (const u8 *)(cie + 2); | ||
380 | unsigned version = *ptr; | ||
381 | |||
382 | if (version != 1) | ||
383 | return -1; /* unsupported */ | ||
384 | if (*++ptr) { | ||
385 | const char *aug; | ||
386 | const u8 *end = (const u8 *)(cie + 1) + *cie; | ||
387 | uleb128_t len; | ||
388 | |||
389 | /* check if augmentation size is first (and thus present) */ | ||
390 | if (*ptr != 'z') | ||
391 | return -1; | ||
392 | /* check if augmentation string is nul-terminated */ | ||
393 | if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL) | ||
394 | return -1; | ||
395 | ++ptr; /* skip terminator */ | ||
396 | get_uleb128(&ptr, end); /* skip code alignment */ | ||
397 | get_sleb128(&ptr, end); /* skip data alignment */ | ||
398 | /* skip return address column */ | ||
399 | version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end); | ||
400 | len = get_uleb128(&ptr, end); /* augmentation length */ | ||
401 | if (ptr + len < ptr || ptr + len > end) | ||
402 | return -1; | ||
403 | end = ptr + len; | ||
404 | while (*++aug) { | ||
405 | if (ptr >= end) | ||
406 | return -1; | ||
407 | switch(*aug) { | ||
408 | case 'L': | ||
409 | ++ptr; | ||
410 | break; | ||
411 | case 'P': { | ||
412 | signed ptrType = *ptr++; | ||
413 | |||
414 | if (!read_pointer(&ptr, end, ptrType) || ptr > end) | ||
415 | return -1; | ||
416 | } | ||
417 | break; | ||
418 | case 'R': | ||
419 | return *ptr; | ||
420 | default: | ||
421 | return -1; | ||
422 | } | ||
423 | } | ||
424 | } | ||
425 | return DW_EH_PE_native|DW_EH_PE_abs; | ||
426 | } | ||
427 | |||
428 | static int advance_loc(unsigned long delta, struct unwind_state *state) | ||
429 | { | ||
430 | state->loc += delta * state->codeAlign; | ||
431 | |||
432 | return delta > 0; | ||
433 | } | ||
434 | |||
435 | static void set_rule(uleb128_t reg, | ||
436 | enum item_location where, | ||
437 | uleb128_t value, | ||
438 | struct unwind_state *state) | ||
439 | { | ||
440 | if (reg < ARRAY_SIZE(state->regs)) { | ||
441 | state->regs[reg].where = where; | ||
442 | state->regs[reg].value = value; | ||
443 | } | ||
444 | } | ||
445 | |||
446 | static int processCFI(const u8 *start, | ||
447 | const u8 *end, | ||
448 | unsigned long targetLoc, | ||
449 | signed ptrType, | ||
450 | struct unwind_state *state) | ||
451 | { | ||
452 | union { | ||
453 | const u8 *p8; | ||
454 | const u16 *p16; | ||
455 | const u32 *p32; | ||
456 | } ptr; | ||
457 | int result = 1; | ||
458 | |||
459 | if (start != state->cieStart) { | ||
460 | state->loc = state->org; | ||
461 | result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state); | ||
462 | if (targetLoc == 0 && state->label == NULL) | ||
463 | return result; | ||
464 | } | ||
465 | for (ptr.p8 = start; result && ptr.p8 < end; ) { | ||
466 | switch(*ptr.p8 >> 6) { | ||
467 | uleb128_t value; | ||
468 | |||
469 | case 0: | ||
470 | switch(*ptr.p8++) { | ||
471 | case DW_CFA_nop: | ||
472 | break; | ||
473 | case DW_CFA_set_loc: | ||
474 | if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) | ||
475 | result = 0; | ||
476 | break; | ||
477 | case DW_CFA_advance_loc1: | ||
478 | result = ptr.p8 < end && advance_loc(*ptr.p8++, state); | ||
479 | break; | ||
480 | case DW_CFA_advance_loc2: | ||
481 | result = ptr.p8 <= end + 2 | ||
482 | && advance_loc(*ptr.p16++, state); | ||
483 | break; | ||
484 | case DW_CFA_advance_loc4: | ||
485 | result = ptr.p8 <= end + 4 | ||
486 | && advance_loc(*ptr.p32++, state); | ||
487 | break; | ||
488 | case DW_CFA_offset_extended: | ||
489 | value = get_uleb128(&ptr.p8, end); | ||
490 | set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); | ||
491 | break; | ||
492 | case DW_CFA_val_offset: | ||
493 | value = get_uleb128(&ptr.p8, end); | ||
494 | set_rule(value, Value, get_uleb128(&ptr.p8, end), state); | ||
495 | break; | ||
496 | case DW_CFA_offset_extended_sf: | ||
497 | value = get_uleb128(&ptr.p8, end); | ||
498 | set_rule(value, Memory, get_sleb128(&ptr.p8, end), state); | ||
499 | break; | ||
500 | case DW_CFA_val_offset_sf: | ||
501 | value = get_uleb128(&ptr.p8, end); | ||
502 | set_rule(value, Value, get_sleb128(&ptr.p8, end), state); | ||
503 | break; | ||
504 | case DW_CFA_restore_extended: | ||
505 | case DW_CFA_undefined: | ||
506 | case DW_CFA_same_value: | ||
507 | set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state); | ||
508 | break; | ||
509 | case DW_CFA_register: | ||
510 | value = get_uleb128(&ptr.p8, end); | ||
511 | set_rule(value, | ||
512 | Register, | ||
513 | get_uleb128(&ptr.p8, end), state); | ||
514 | break; | ||
515 | case DW_CFA_remember_state: | ||
516 | if (ptr.p8 == state->label) { | ||
517 | state->label = NULL; | ||
518 | return 1; | ||
519 | } | ||
520 | if (state->stackDepth >= MAX_STACK_DEPTH) | ||
521 | return 0; | ||
522 | state->stack[state->stackDepth++] = ptr.p8; | ||
523 | break; | ||
524 | case DW_CFA_restore_state: | ||
525 | if (state->stackDepth) { | ||
526 | const uleb128_t loc = state->loc; | ||
527 | const u8 *label = state->label; | ||
528 | |||
529 | state->label = state->stack[state->stackDepth - 1]; | ||
530 | memcpy(&state->cfa, &badCFA, sizeof(state->cfa)); | ||
531 | memset(state->regs, 0, sizeof(state->regs)); | ||
532 | state->stackDepth = 0; | ||
533 | result = processCFI(start, end, 0, ptrType, state); | ||
534 | state->loc = loc; | ||
535 | state->label = label; | ||
536 | } else | ||
537 | return 0; | ||
538 | break; | ||
539 | case DW_CFA_def_cfa: | ||
540 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
541 | /*nobreak*/ | ||
542 | case DW_CFA_def_cfa_offset: | ||
543 | state->cfa.offs = get_uleb128(&ptr.p8, end); | ||
544 | break; | ||
545 | case DW_CFA_def_cfa_sf: | ||
546 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
547 | /*nobreak*/ | ||
548 | case DW_CFA_def_cfa_offset_sf: | ||
549 | state->cfa.offs = get_sleb128(&ptr.p8, end) | ||
550 | * state->dataAlign; | ||
551 | break; | ||
552 | case DW_CFA_def_cfa_register: | ||
553 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
554 | break; | ||
555 | /*todo case DW_CFA_def_cfa_expression: */ | ||
556 | /*todo case DW_CFA_expression: */ | ||
557 | /*todo case DW_CFA_val_expression: */ | ||
558 | case DW_CFA_GNU_args_size: | ||
559 | get_uleb128(&ptr.p8, end); | ||
560 | break; | ||
561 | case DW_CFA_GNU_negative_offset_extended: | ||
562 | value = get_uleb128(&ptr.p8, end); | ||
563 | set_rule(value, | ||
564 | Memory, | ||
565 | (uleb128_t)0 - get_uleb128(&ptr.p8, end), state); | ||
566 | break; | ||
567 | case DW_CFA_GNU_window_save: | ||
568 | default: | ||
569 | result = 0; | ||
570 | break; | ||
571 | } | ||
572 | break; | ||
573 | case 1: | ||
574 | result = advance_loc(*ptr.p8++ & 0x3f, state); | ||
575 | break; | ||
576 | case 2: | ||
577 | value = *ptr.p8++ & 0x3f; | ||
578 | set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); | ||
579 | break; | ||
580 | case 3: | ||
581 | set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); | ||
582 | break; | ||
583 | } | ||
584 | if (ptr.p8 > end) | ||
585 | result = 0; | ||
586 | if (result && targetLoc != 0 && targetLoc < state->loc) | ||
587 | return 1; | ||
588 | } | ||
589 | |||
590 | return result | ||
591 | && ptr.p8 == end | ||
592 | && (targetLoc == 0 | ||
593 | || (/*todo While in theory this should apply, gcc in practice omits | ||
594 | everything past the function prolog, and hence the location | ||
595 | never reaches the end of the function. | ||
596 | targetLoc < state->loc &&*/ state->label == NULL)); | ||
597 | } | ||
598 | |||
599 | /* Unwind to the previous frame. Returns 0 if successful, a negative | ||
600 | * number in case of an error. */ | ||
601 | int unwind(struct unwind_frame_info *frame) | ||
602 | { | ||
603 | #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) | ||
604 | const u32 *fde = NULL, *cie = NULL; | ||
605 | const u8 *ptr = NULL, *end = NULL; | ||
606 | unsigned long startLoc = 0, endLoc = 0, cfa; | ||
607 | unsigned i; | ||
608 | signed ptrType = -1; | ||
609 | uleb128_t retAddrReg = 0; | ||
610 | struct unwind_table *table; | ||
611 | struct unwind_state state; | ||
612 | |||
613 | if (UNW_PC(frame) == 0) | ||
614 | return -EINVAL; | ||
615 | if ((table = find_table(UNW_PC(frame))) != NULL | ||
616 | && !(table->size & (sizeof(*fde) - 1))) { | ||
617 | unsigned long tableSize = table->size; | ||
618 | |||
619 | for (fde = table->address; | ||
620 | tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; | ||
621 | tableSize -= sizeof(*fde) + *fde, | ||
622 | fde += 1 + *fde / sizeof(*fde)) { | ||
623 | if (!*fde || (*fde & (sizeof(*fde) - 1))) | ||
624 | break; | ||
625 | if (!fde[1]) | ||
626 | continue; /* this is a CIE */ | ||
627 | if ((fde[1] & (sizeof(*fde) - 1)) | ||
628 | || fde[1] > (unsigned long)(fde + 1) | ||
629 | - (unsigned long)table->address) | ||
630 | continue; /* this is not a valid FDE */ | ||
631 | cie = fde + 1 - fde[1] / sizeof(*fde); | ||
632 | if (*cie <= sizeof(*cie) + 4 | ||
633 | || *cie >= fde[1] - sizeof(*fde) | ||
634 | || (*cie & (sizeof(*cie) - 1)) | ||
635 | || cie[1] | ||
636 | || (ptrType = fde_pointer_type(cie)) < 0) { | ||
637 | cie = NULL; /* this is not a (valid) CIE */ | ||
638 | continue; | ||
639 | } | ||
640 | ptr = (const u8 *)(fde + 2); | ||
641 | startLoc = read_pointer(&ptr, | ||
642 | (const u8 *)(fde + 1) + *fde, | ||
643 | ptrType); | ||
644 | endLoc = startLoc | ||
645 | + read_pointer(&ptr, | ||
646 | (const u8 *)(fde + 1) + *fde, | ||
647 | ptrType & DW_EH_PE_indirect | ||
648 | ? ptrType | ||
649 | : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); | ||
650 | if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc) | ||
651 | break; | ||
652 | cie = NULL; | ||
653 | } | ||
654 | } | ||
655 | if (cie != NULL) { | ||
656 | memset(&state, 0, sizeof(state)); | ||
657 | state.cieEnd = ptr; /* keep here temporarily */ | ||
658 | ptr = (const u8 *)(cie + 2); | ||
659 | end = (const u8 *)(cie + 1) + *cie; | ||
660 | if ((state.version = *ptr) != 1) | ||
661 | cie = NULL; /* unsupported version */ | ||
662 | else if (*++ptr) { | ||
663 | /* check if augmentation size is first (and thus present) */ | ||
664 | if (*ptr == 'z') { | ||
665 | /* check for ignorable (or already handled) | ||
666 | * nul-terminated augmentation string */ | ||
667 | while (++ptr < end && *ptr) | ||
668 | if (strchr("LPR", *ptr) == NULL) | ||
669 | break; | ||
670 | } | ||
671 | if (ptr >= end || *ptr) | ||
672 | cie = NULL; | ||
673 | } | ||
674 | ++ptr; | ||
675 | } | ||
676 | if (cie != NULL) { | ||
677 | /* get code alignment factor */ | ||
678 | state.codeAlign = get_uleb128(&ptr, end); | ||
679 | /* get data alignment factor */ | ||
680 | state.dataAlign = get_sleb128(&ptr, end); | ||
681 | if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) | ||
682 | cie = NULL; | ||
683 | else { | ||
684 | retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); | ||
685 | /* skip augmentation */ | ||
686 | if (((const char *)(cie + 2))[1] == 'z') | ||
687 | ptr += get_uleb128(&ptr, end); | ||
688 | if (ptr > end | ||
689 | || retAddrReg >= ARRAY_SIZE(reg_info) | ||
690 | || REG_INVALID(retAddrReg) | ||
691 | || reg_info[retAddrReg].width != sizeof(unsigned long)) | ||
692 | cie = NULL; | ||
693 | } | ||
694 | } | ||
695 | if (cie != NULL) { | ||
696 | state.cieStart = ptr; | ||
697 | ptr = state.cieEnd; | ||
698 | state.cieEnd = end; | ||
699 | end = (const u8 *)(fde + 1) + *fde; | ||
700 | /* skip augmentation */ | ||
701 | if (((const char *)(cie + 2))[1] == 'z') { | ||
702 | uleb128_t augSize = get_uleb128(&ptr, end); | ||
703 | |||
704 | if ((ptr += augSize) > end) | ||
705 | fde = NULL; | ||
706 | } | ||
707 | } | ||
708 | if (cie == NULL || fde == NULL) { | ||
709 | #ifdef CONFIG_FRAME_POINTER | ||
710 | unsigned long top, bottom; | ||
711 | #endif | ||
712 | |||
713 | #ifdef CONFIG_FRAME_POINTER | ||
714 | top = STACK_TOP(frame->task); | ||
715 | bottom = STACK_BOTTOM(frame->task); | ||
716 | # if FRAME_RETADDR_OFFSET < 0 | ||
717 | if (UNW_SP(frame) < top | ||
718 | && UNW_FP(frame) <= UNW_SP(frame) | ||
719 | && bottom < UNW_FP(frame) | ||
720 | # else | ||
721 | if (UNW_SP(frame) > top | ||
722 | && UNW_FP(frame) >= UNW_SP(frame) | ||
723 | && bottom > UNW_FP(frame) | ||
724 | # endif | ||
725 | && !((UNW_SP(frame) | UNW_FP(frame)) | ||
726 | & (sizeof(unsigned long) - 1))) { | ||
727 | unsigned long link; | ||
728 | |||
729 | if (!__get_user(link, | ||
730 | (unsigned long *)(UNW_FP(frame) | ||
731 | + FRAME_LINK_OFFSET)) | ||
732 | # if FRAME_RETADDR_OFFSET < 0 | ||
733 | && link > bottom && link < UNW_FP(frame) | ||
734 | # else | ||
735 | && link > UNW_FP(frame) && link < bottom | ||
736 | # endif | ||
737 | && !(link & (sizeof(link) - 1)) | ||
738 | && !__get_user(UNW_PC(frame), | ||
739 | (unsigned long *)(UNW_FP(frame) | ||
740 | + FRAME_RETADDR_OFFSET))) { | ||
741 | UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET | ||
742 | # if FRAME_RETADDR_OFFSET < 0 | ||
743 | - | ||
744 | # else | ||
745 | + | ||
746 | # endif | ||
747 | sizeof(UNW_PC(frame)); | ||
748 | UNW_FP(frame) = link; | ||
749 | return 0; | ||
750 | } | ||
751 | } | ||
752 | #endif | ||
753 | return -ENXIO; | ||
754 | } | ||
755 | state.org = startLoc; | ||
756 | memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); | ||
757 | /* process instructions */ | ||
758 | if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state) | ||
759 | || state.loc > endLoc | ||
760 | || state.regs[retAddrReg].where == Nowhere | ||
761 | || state.cfa.reg >= ARRAY_SIZE(reg_info) | ||
762 | || reg_info[state.cfa.reg].width != sizeof(unsigned long) | ||
763 | || state.cfa.offs % sizeof(unsigned long)) | ||
764 | return -EIO; | ||
765 | /* update frame */ | ||
766 | cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; | ||
767 | startLoc = min((unsigned long)UNW_SP(frame), cfa); | ||
768 | endLoc = max((unsigned long)UNW_SP(frame), cfa); | ||
769 | if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) { | ||
770 | startLoc = min(STACK_LIMIT(cfa), cfa); | ||
771 | endLoc = max(STACK_LIMIT(cfa), cfa); | ||
772 | } | ||
773 | #ifndef CONFIG_64BIT | ||
774 | # define CASES CASE(8); CASE(16); CASE(32) | ||
775 | #else | ||
776 | # define CASES CASE(8); CASE(16); CASE(32); CASE(64) | ||
777 | #endif | ||
778 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | ||
779 | if (REG_INVALID(i)) { | ||
780 | if (state.regs[i].where == Nowhere) | ||
781 | continue; | ||
782 | return -EIO; | ||
783 | } | ||
784 | switch(state.regs[i].where) { | ||
785 | default: | ||
786 | break; | ||
787 | case Register: | ||
788 | if (state.regs[i].value >= ARRAY_SIZE(reg_info) | ||
789 | || REG_INVALID(state.regs[i].value) | ||
790 | || reg_info[i].width > reg_info[state.regs[i].value].width) | ||
791 | return -EIO; | ||
792 | switch(reg_info[state.regs[i].value].width) { | ||
793 | #define CASE(n) \ | ||
794 | case sizeof(u##n): \ | ||
795 | state.regs[i].value = FRAME_REG(state.regs[i].value, \ | ||
796 | const u##n); \ | ||
797 | break | ||
798 | CASES; | ||
799 | #undef CASE | ||
800 | default: | ||
801 | return -EIO; | ||
802 | } | ||
803 | break; | ||
804 | } | ||
805 | } | ||
806 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | ||
807 | if (REG_INVALID(i)) | ||
808 | continue; | ||
809 | switch(state.regs[i].where) { | ||
810 | case Nowhere: | ||
811 | if (reg_info[i].width != sizeof(UNW_SP(frame)) | ||
812 | || &FRAME_REG(i, __typeof__(UNW_SP(frame))) | ||
813 | != &UNW_SP(frame)) | ||
814 | continue; | ||
815 | UNW_SP(frame) = cfa; | ||
816 | break; | ||
817 | case Register: | ||
818 | switch(reg_info[i].width) { | ||
819 | #define CASE(n) case sizeof(u##n): \ | ||
820 | FRAME_REG(i, u##n) = state.regs[i].value; \ | ||
821 | break | ||
822 | CASES; | ||
823 | #undef CASE | ||
824 | default: | ||
825 | return -EIO; | ||
826 | } | ||
827 | break; | ||
828 | case Value: | ||
829 | if (reg_info[i].width != sizeof(unsigned long)) | ||
830 | return -EIO; | ||
831 | FRAME_REG(i, unsigned long) = cfa + state.regs[i].value | ||
832 | * state.dataAlign; | ||
833 | break; | ||
834 | case Memory: { | ||
835 | unsigned long addr = cfa + state.regs[i].value | ||
836 | * state.dataAlign; | ||
837 | |||
838 | if ((state.regs[i].value * state.dataAlign) | ||
839 | % sizeof(unsigned long) | ||
840 | || addr < startLoc | ||
841 | || addr + sizeof(unsigned long) < addr | ||
842 | || addr + sizeof(unsigned long) > endLoc) | ||
843 | return -EIO; | ||
844 | switch(reg_info[i].width) { | ||
845 | #define CASE(n) case sizeof(u##n): \ | ||
846 | __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ | ||
847 | break | ||
848 | CASES; | ||
849 | #undef CASE | ||
850 | default: | ||
851 | return -EIO; | ||
852 | } | ||
853 | } | ||
854 | break; | ||
855 | } | ||
856 | } | ||
857 | |||
858 | return 0; | ||
859 | #undef CASES | ||
860 | #undef FRAME_REG | ||
861 | } | ||
862 | EXPORT_SYMBOL(unwind); | ||
863 | |||
864 | int unwind_init_frame_info(struct unwind_frame_info *info, | ||
865 | struct task_struct *tsk, | ||
866 | /*const*/ struct pt_regs *regs) | ||
867 | { | ||
868 | info->task = tsk; | ||
869 | arch_unw_init_frame_info(info, regs); | ||
870 | |||
871 | return 0; | ||
872 | } | ||
873 | EXPORT_SYMBOL(unwind_init_frame_info); | ||
874 | |||
875 | /* | ||
876 | * Prepare to unwind a blocked task. | ||
877 | */ | ||
878 | int unwind_init_blocked(struct unwind_frame_info *info, | ||
879 | struct task_struct *tsk) | ||
880 | { | ||
881 | info->task = tsk; | ||
882 | arch_unw_init_blocked(info); | ||
883 | |||
884 | return 0; | ||
885 | } | ||
886 | EXPORT_SYMBOL(unwind_init_blocked); | ||
887 | |||
888 | /* | ||
889 | * Prepare to unwind the currently running thread. | ||
890 | */ | ||
891 | int unwind_init_running(struct unwind_frame_info *info, | ||
892 | asmlinkage int (*callback)(struct unwind_frame_info *, | ||
893 | void *arg), | ||
894 | void *arg) | ||
895 | { | ||
896 | info->task = current; | ||
897 | |||
898 | return arch_unwind_init_running(info, callback, arg); | ||
899 | } | ||
900 | EXPORT_SYMBOL(unwind_init_running); | ||
901 | |||
902 | /* | ||
903 | * Unwind until the return pointer is in user-land (or until an error | ||
904 | * occurs). Returns 0 if successful, negative number in case of | ||
905 | * error. | ||
906 | */ | ||
907 | int unwind_to_user(struct unwind_frame_info *info) | ||
908 | { | ||
909 | while (!arch_unw_user_mode(info)) { | ||
910 | int err = unwind(info); | ||
911 | |||
912 | if (err < 0) | ||
913 | return err; | ||
914 | } | ||
915 | |||
916 | return 0; | ||
917 | } | ||
918 | EXPORT_SYMBOL(unwind_to_user); | ||
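Taken together, the exported entry points above form the unwinder's call-chain API: a caller initializes an unwind_frame_info for a task and then steps frame by frame with unwind(). A minimal backtrace sketch under that assumption (the helper name, printk format, and stop condition are illustrative, not part of this patch):

	#include <linux/kernel.h>
	#include <linux/sched.h>
	#include <linux/unwind.h>

	/* Hypothetical helper: walk the call chain of a blocked task and
	 * print each return address, using only the exported API above. */
	static void sketch_dump_trace(struct task_struct *tsk)
	{
		struct unwind_frame_info info;

		if (unwind_init_blocked(&info, tsk))
			return;
		while (!arch_unw_user_mode(&info)) {
			printk(KERN_DEBUG " [<%p>]\n", (void *)UNW_PC(&info));
			if (unwind(&info) < 0)	/* -EINVAL/-ENXIO/-EIO end the walk */
				break;
		}
	}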
diff --git a/kernel/user.c b/kernel/user.c index 2116642f42c6..6408c0424291 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid) | |||
140 | atomic_set(&new->processes, 0); | 140 | atomic_set(&new->processes, 0); |
141 | atomic_set(&new->files, 0); | 141 | atomic_set(&new->files, 0); |
142 | atomic_set(&new->sigpending, 0); | 142 | atomic_set(&new->sigpending, 0); |
143 | #ifdef CONFIG_INOTIFY | 143 | #ifdef CONFIG_INOTIFY_USER |
144 | atomic_set(&new->inotify_watches, 0); | 144 | atomic_set(&new->inotify_watches, 0); |
145 | atomic_set(&new->inotify_devs, 0); | 145 | atomic_set(&new->inotify_devs, 0); |
146 | #endif | 146 | #endif |
@@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid) | |||
148 | new->mq_bytes = 0; | 148 | new->mq_bytes = 0; |
149 | new->locked_shm = 0; | 149 | new->locked_shm = 0; |
150 | 150 | ||
151 | if (alloc_uid_keyring(new) < 0) { | 151 | if (alloc_uid_keyring(new, current) < 0) { |
152 | kmem_cache_free(uid_cachep, new); | 152 | kmem_cache_free(uid_cachep, new); |
153 | return NULL; | 153 | return NULL; |
154 | } | 154 | } |
diff --git a/kernel/wait.c b/kernel/wait.c index 791681cfea98..59a82f63275d 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -3,7 +3,6 @@ | |||
3 | * | 3 | * |
4 | * (C) 2004 William Irwin, Oracle | 4 | * (C) 2004 William Irwin, Oracle |
5 | */ | 5 | */ |
6 | #include <linux/config.h> | ||
7 | #include <linux/init.h> | 6 | #include <linux/init.h> |
8 | #include <linux/module.h> | 7 | #include <linux/module.h> |
9 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
@@ -11,6 +10,14 @@ | |||
11 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
12 | #include <linux/hash.h> | 11 | #include <linux/hash.h> |
13 | 12 | ||
13 | void init_waitqueue_head(wait_queue_head_t *q) | ||
14 | { | ||
15 | spin_lock_init(&q->lock); | ||
16 | INIT_LIST_HEAD(&q->task_list); | ||
17 | } | ||
18 | |||
19 | EXPORT_SYMBOL(init_waitqueue_head); | ||
20 | |||
14 | void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) | 21 | void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) |
15 | { | 22 | { |
16 | unsigned long flags; | 23 | unsigned long flags; |
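With init_waitqueue_head() moved out of line and exported above, dynamically allocated wait queue heads can be set up the same way as statically declared ones. A minimal sketch (the kmalloc'd structure and helper name are illustrative):

	#include <linux/slab.h>
	#include <linux/wait.h>

	/* Illustrative only: initialize a runtime-allocated wait queue head. */
	static wait_queue_head_t *sketch_alloc_waitqueue(void)
	{
		wait_queue_head_t *q = kmalloc(sizeof(*q), GFP_KERNEL);

		if (q)
			init_waitqueue_head(q);	/* uses the newly exported helper */
		return q;
	}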
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 880fb415a8f6..eebb1d839235 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -51,7 +51,7 @@ struct cpu_workqueue_struct { | |||
51 | wait_queue_head_t work_done; | 51 | wait_queue_head_t work_done; |
52 | 52 | ||
53 | struct workqueue_struct *wq; | 53 | struct workqueue_struct *wq; |
54 | task_t *thread; | 54 | struct task_struct *thread; |
55 | 55 | ||
56 | int run_depth; /* Detect run_workqueue() recursion depth */ | 56 | int run_depth; /* Detect run_workqueue() recursion depth */ |
57 | } ____cacheline_aligned; | 57 | } ____cacheline_aligned; |
@@ -114,6 +114,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
114 | put_cpu(); | 114 | put_cpu(); |
115 | return ret; | 115 | return ret; |
116 | } | 116 | } |
117 | EXPORT_SYMBOL_GPL(queue_work); | ||
117 | 118 | ||
118 | static void delayed_work_timer_fn(unsigned long __data) | 119 | static void delayed_work_timer_fn(unsigned long __data) |
119 | { | 120 | { |
@@ -147,6 +148,29 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, | |||
147 | } | 148 | } |
148 | return ret; | 149 | return ret; |
149 | } | 150 | } |
151 | EXPORT_SYMBOL_GPL(queue_delayed_work); | ||
152 | |||
153 | int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | ||
154 | struct work_struct *work, unsigned long delay) | ||
155 | { | ||
156 | int ret = 0; | ||
157 | struct timer_list *timer = &work->timer; | ||
158 | |||
159 | if (!test_and_set_bit(0, &work->pending)) { | ||
160 | BUG_ON(timer_pending(timer)); | ||
161 | BUG_ON(!list_empty(&work->entry)); | ||
162 | |||
163 | /* This stores wq for the moment, for the timer_fn */ | ||
164 | work->wq_data = wq; | ||
165 | timer->expires = jiffies + delay; | ||
166 | timer->data = (unsigned long)work; | ||
167 | timer->function = delayed_work_timer_fn; | ||
168 | add_timer_on(timer, cpu); | ||
169 | ret = 1; | ||
170 | } | ||
171 | return ret; | ||
172 | } | ||
173 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); | ||
150 | 174 | ||
151 | static void run_workqueue(struct cpu_workqueue_struct *cwq) | 175 | static void run_workqueue(struct cpu_workqueue_struct *cwq) |
152 | { | 176 | { |
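The new queue_delayed_work_on() mirrors queue_delayed_work() but arms the timer on a specific CPU via add_timer_on(). A minimal caller sketch, assuming the three-argument INIT_WORK() of this kernel generation (handler, delay, and CPU choice are illustrative):

	#include <linux/jiffies.h>
	#include <linux/workqueue.h>

	static void sketch_work_fn(void *data)
	{
		/* runs in workqueue-thread context on the CPU it was queued on */
	}

	static struct work_struct sketch_work;

	static int sketch_defer_on_cpu0(struct workqueue_struct *wq)
	{
		INIT_WORK(&sketch_work, sketch_work_fn, NULL);
		/* run roughly one second from now, queued on CPU 0 */
		return queue_delayed_work_on(0, wq, &sketch_work, HZ);
	}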
@@ -281,6 +305,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
281 | unlock_cpu_hotplug(); | 305 | unlock_cpu_hotplug(); |
282 | } | 306 | } |
283 | } | 307 | } |
308 | EXPORT_SYMBOL_GPL(flush_workqueue); | ||
284 | 309 | ||
285 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | 310 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, |
286 | int cpu) | 311 | int cpu) |
@@ -358,6 +383,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
358 | } | 383 | } |
359 | return wq; | 384 | return wq; |
360 | } | 385 | } |
386 | EXPORT_SYMBOL_GPL(__create_workqueue); | ||
361 | 387 | ||
362 | static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) | 388 | static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) |
363 | { | 389 | { |
@@ -395,6 +421,7 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
395 | free_percpu(wq->cpu_wq); | 421 | free_percpu(wq->cpu_wq); |
396 | kfree(wq); | 422 | kfree(wq); |
397 | } | 423 | } |
424 | EXPORT_SYMBOL_GPL(destroy_workqueue); | ||
398 | 425 | ||
399 | static struct workqueue_struct *keventd_wq; | 426 | static struct workqueue_struct *keventd_wq; |
400 | 427 | ||
@@ -402,48 +429,49 @@ int fastcall schedule_work(struct work_struct *work) | |||
402 | { | 429 | { |
403 | return queue_work(keventd_wq, work); | 430 | return queue_work(keventd_wq, work); |
404 | } | 431 | } |
432 | EXPORT_SYMBOL(schedule_work); | ||
405 | 433 | ||
406 | int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) | 434 | int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) |
407 | { | 435 | { |
408 | return queue_delayed_work(keventd_wq, work, delay); | 436 | return queue_delayed_work(keventd_wq, work, delay); |
409 | } | 437 | } |
438 | EXPORT_SYMBOL(schedule_delayed_work); | ||
410 | 439 | ||
411 | int schedule_delayed_work_on(int cpu, | 440 | int schedule_delayed_work_on(int cpu, |
412 | struct work_struct *work, unsigned long delay) | 441 | struct work_struct *work, unsigned long delay) |
413 | { | 442 | { |
414 | int ret = 0; | 443 | return queue_delayed_work_on(cpu, keventd_wq, work, delay); |
415 | struct timer_list *timer = &work->timer; | ||
416 | |||
417 | if (!test_and_set_bit(0, &work->pending)) { | ||
418 | BUG_ON(timer_pending(timer)); | ||
419 | BUG_ON(!list_empty(&work->entry)); | ||
420 | /* This stores keventd_wq for the moment, for the timer_fn */ | ||
421 | work->wq_data = keventd_wq; | ||
422 | timer->expires = jiffies + delay; | ||
423 | timer->data = (unsigned long)work; | ||
424 | timer->function = delayed_work_timer_fn; | ||
425 | add_timer_on(timer, cpu); | ||
426 | ret = 1; | ||
427 | } | ||
428 | return ret; | ||
429 | } | 444 | } |
445 | EXPORT_SYMBOL(schedule_delayed_work_on); | ||
430 | 446 | ||
431 | int schedule_on_each_cpu(void (*func) (void *info), void *info) | 447 | /** |
448 | * schedule_on_each_cpu - call a function on each online CPU from keventd | ||
449 | * @func: the function to call | ||
450 | * @info: a pointer to pass to func() | ||
451 | * | ||
452 | * Returns zero on success. | ||
453 | * Returns -ve errno on failure. | ||
454 | * | ||
455 | * Appears to be racy against CPU hotplug. | ||
456 | * | ||
457 | * schedule_on_each_cpu() is very slow. | ||
458 | */ | ||
459 | int schedule_on_each_cpu(void (*func)(void *info), void *info) | ||
432 | { | 460 | { |
433 | int cpu; | 461 | int cpu; |
434 | struct work_struct *work; | 462 | struct work_struct *works; |
435 | 463 | ||
436 | work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); | 464 | works = alloc_percpu(struct work_struct); |
437 | 465 | if (!works) | |
438 | if (!work) | ||
439 | return -ENOMEM; | 466 | return -ENOMEM; |
467 | |||
440 | for_each_online_cpu(cpu) { | 468 | for_each_online_cpu(cpu) { |
441 | INIT_WORK(work + cpu, func, info); | 469 | INIT_WORK(per_cpu_ptr(works, cpu), func, info); |
442 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), | 470 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), |
443 | work + cpu); | 471 | per_cpu_ptr(works, cpu)); |
444 | } | 472 | } |
445 | flush_workqueue(keventd_wq); | 473 | flush_workqueue(keventd_wq); |
446 | kfree(work); | 474 | free_percpu(works); |
447 | return 0; | 475 | return 0; |
448 | } | 476 | } |
449 | 477 | ||
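As the new kernel-doc notes, schedule_on_each_cpu() queues the handler on every online CPU's keventd and waits for completion via flush_workqueue(). A minimal caller sketch with a hypothetical per-CPU handler (names are illustrative):

	#include <linux/kernel.h>
	#include <linux/smp.h>
	#include <linux/workqueue.h>

	/* Hypothetical handler: report which CPU keventd invoked us on. */
	static void sketch_percpu_fn(void *info)
	{
		printk(KERN_DEBUG "ran on cpu %d\n", smp_processor_id());
	}

	static int sketch_run_everywhere(void)
	{
		/* 0 on success, -ENOMEM if the per-CPU work items can't be allocated */
		return schedule_on_each_cpu(sketch_percpu_fn, NULL);
	}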
@@ -451,6 +479,7 @@ void flush_scheduled_work(void) | |||
451 | { | 479 | { |
452 | flush_workqueue(keventd_wq); | 480 | flush_workqueue(keventd_wq); |
453 | } | 481 | } |
482 | EXPORT_SYMBOL(flush_scheduled_work); | ||
454 | 483 | ||
455 | /** | 484 | /** |
456 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed | 485 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed |
@@ -531,11 +560,11 @@ int current_is_keventd(void) | |||
531 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | 560 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) |
532 | { | 561 | { |
533 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 562 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
534 | LIST_HEAD(list); | 563 | struct list_head list; |
535 | struct work_struct *work; | 564 | struct work_struct *work; |
536 | 565 | ||
537 | spin_lock_irq(&cwq->lock); | 566 | spin_lock_irq(&cwq->lock); |
538 | list_splice_init(&cwq->worklist, &list); | 567 | list_replace_init(&cwq->worklist, &list); |
539 | 568 | ||
540 | while (!list_empty(&list)) { | 569 | while (!list_empty(&list)) { |
541 | printk("Taking work for %s\n", wq->name); | 570 | printk("Taking work for %s\n", wq->name); |
@@ -547,7 +576,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | |||
547 | } | 576 | } |
548 | 577 | ||
549 | /* We're holding the cpucontrol mutex here */ | 578 | /* We're holding the cpucontrol mutex here */ |
550 | static int workqueue_cpu_callback(struct notifier_block *nfb, | 579 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, |
551 | unsigned long action, | 580 | unsigned long action, |
552 | void *hcpu) | 581 | void *hcpu) |
553 | { | 582 | { |
@@ -578,6 +607,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb, | |||
578 | 607 | ||
579 | case CPU_UP_CANCELED: | 608 | case CPU_UP_CANCELED: |
580 | list_for_each_entry(wq, &workqueues, list) { | 609 | list_for_each_entry(wq, &workqueues, list) { |
610 | if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) | ||
611 | continue; | ||
581 | /* Unbind so it can run. */ | 612 | /* Unbind so it can run. */ |
582 | kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, | 613 | kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, |
583 | any_online_cpu(cpu_online_map)); | 614 | any_online_cpu(cpu_online_map)); |
@@ -605,13 +636,3 @@ void init_workqueues(void) | |||
605 | BUG_ON(!keventd_wq); | 636 | BUG_ON(!keventd_wq); |
606 | } | 637 | } |
607 | 638 | ||
608 | EXPORT_SYMBOL_GPL(__create_workqueue); | ||
609 | EXPORT_SYMBOL_GPL(queue_work); | ||
610 | EXPORT_SYMBOL_GPL(queue_delayed_work); | ||
611 | EXPORT_SYMBOL_GPL(flush_workqueue); | ||
612 | EXPORT_SYMBOL_GPL(destroy_workqueue); | ||
613 | |||
614 | EXPORT_SYMBOL(schedule_work); | ||
615 | EXPORT_SYMBOL(schedule_delayed_work); | ||
616 | EXPORT_SYMBOL(schedule_delayed_work_on); | ||
617 | EXPORT_SYMBOL(flush_scheduled_work); | ||