author     Steven Whitehouse <swhiteho@redhat.com>   2006-07-03 10:25:08 -0400
committer  Steven Whitehouse <swhiteho@redhat.com>   2006-07-03 10:25:08 -0400
commit     0a1340c185734a57fbf4775927966ad4a1347b02 (patch)
tree       d9ed8f0dd809a7c542a3356601125ea5b5aaa804 /kernel
parent     af18ddb8864b096e3ed4732e2d4b21c956dcfe3a (diff)
parent     29454dde27d8e340bb1987bad9aa504af7081eba (diff)
Merge rsync://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts:
include/linux/kernel.h
Diffstat (limited to 'kernel')
75 files changed, 9457 insertions, 1761 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 58908f9d156a..82fb182f6f61 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,18 +10,22 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | |||
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o | 11 | hrtimer.o |
12 | 12 | ||
13 | obj-y += time/ | ||
13 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o | 14 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o |
14 | obj-$(CONFIG_FUTEX) += futex.o | 15 | obj-$(CONFIG_FUTEX) += futex.o |
15 | ifeq ($(CONFIG_COMPAT),y) | 16 | ifeq ($(CONFIG_COMPAT),y) |
16 | obj-$(CONFIG_FUTEX) += futex_compat.o | 17 | obj-$(CONFIG_FUTEX) += futex_compat.o |
17 | endif | 18 | endif |
19 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | ||
20 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | ||
21 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | ||
18 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 22 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
19 | obj-$(CONFIG_SMP) += cpu.o spinlock.o | 23 | obj-$(CONFIG_SMP) += cpu.o spinlock.o |
20 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | 24 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o |
21 | obj-$(CONFIG_UID16) += uid16.o | 25 | obj-$(CONFIG_UID16) += uid16.o |
22 | obj-$(CONFIG_MODULES) += module.o | 26 | obj-$(CONFIG_MODULES) += module.o |
23 | obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o | ||
24 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 27 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
28 | obj-$(CONFIG_STACK_UNWIND) += unwind.o | ||
25 | obj-$(CONFIG_PM) += power/ | 29 | obj-$(CONFIG_PM) += power/ |
26 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 30 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
27 | obj-$(CONFIG_KEXEC) += kexec.o | 31 | obj-$(CONFIG_KEXEC) += kexec.o |
diff --git a/kernel/acct.c b/kernel/acct.c
index b327f4d20104..f18e0b8df3e1 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -43,7 +43,6 @@ | |||
43 | * a struct file opened for write. Fixed. 2/6/2000, AV. | 43 | * a struct file opened for write. Fixed. 2/6/2000, AV. |
44 | */ | 44 | */ |
45 | 45 | ||
46 | #include <linux/config.h> | ||
47 | #include <linux/mm.h> | 46 | #include <linux/mm.h> |
48 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
49 | #include <linux/acct.h> | 48 | #include <linux/acct.h> |
@@ -75,7 +74,7 @@ int acct_parm[3] = {4, 2, 30}; | |||
75 | /* | 74 | /* |
76 | * External references and all of the globals. | 75 | * External references and all of the globals. |
77 | */ | 76 | */ |
78 | static void do_acct_process(long, struct file *); | 77 | static void do_acct_process(struct file *); |
79 | 78 | ||
80 | /* | 79 | /* |
81 | * This structure is used so that all the data protected by lock | 80 | * This structure is used so that all the data protected by lock |
@@ -118,7 +117,7 @@ static int check_free_space(struct file *file) | |||
118 | spin_unlock(&acct_globals.lock); | 117 | spin_unlock(&acct_globals.lock); |
119 | 118 | ||
120 | /* May block */ | 119 | /* May block */ |
121 | if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) | 120 | if (vfs_statfs(file->f_dentry, &sbuf)) |
122 | return res; | 121 | return res; |
123 | suspend = sbuf.f_blocks * SUSPEND; | 122 | suspend = sbuf.f_blocks * SUSPEND; |
124 | resume = sbuf.f_blocks * RESUME; | 123 | resume = sbuf.f_blocks * RESUME; |
@@ -196,7 +195,7 @@ static void acct_file_reopen(struct file *file) | |||
196 | if (old_acct) { | 195 | if (old_acct) { |
197 | mnt_unpin(old_acct->f_vfsmnt); | 196 | mnt_unpin(old_acct->f_vfsmnt); |
198 | spin_unlock(&acct_globals.lock); | 197 | spin_unlock(&acct_globals.lock); |
199 | do_acct_process(0, old_acct); | 198 | do_acct_process(old_acct); |
200 | filp_close(old_acct, NULL); | 199 | filp_close(old_acct, NULL); |
201 | spin_lock(&acct_globals.lock); | 200 | spin_lock(&acct_globals.lock); |
202 | } | 201 | } |
@@ -419,16 +418,15 @@ static u32 encode_float(u64 value) | |||
419 | /* | 418 | /* |
420 | * do_acct_process does all actual work. Caller holds the reference to file. | 419 | * do_acct_process does all actual work. Caller holds the reference to file. |
421 | */ | 420 | */ |
422 | static void do_acct_process(long exitcode, struct file *file) | 421 | static void do_acct_process(struct file *file) |
423 | { | 422 | { |
423 | struct pacct_struct *pacct = &current->signal->pacct; | ||
424 | acct_t ac; | 424 | acct_t ac; |
425 | mm_segment_t fs; | 425 | mm_segment_t fs; |
426 | unsigned long vsize; | ||
427 | unsigned long flim; | 426 | unsigned long flim; |
428 | u64 elapsed; | 427 | u64 elapsed; |
429 | u64 run_time; | 428 | u64 run_time; |
430 | struct timespec uptime; | 429 | struct timespec uptime; |
431 | unsigned long jiffies; | ||
432 | 430 | ||
433 | /* | 431 | /* |
434 | * First check to see if there is enough free_space to continue | 432 | * First check to see if there is enough free_space to continue |
@@ -469,12 +467,6 @@ static void do_acct_process(long exitcode, struct file *file) | |||
469 | #endif | 467 | #endif |
470 | do_div(elapsed, AHZ); | 468 | do_div(elapsed, AHZ); |
471 | ac.ac_btime = xtime.tv_sec - elapsed; | 469 | ac.ac_btime = xtime.tv_sec - elapsed; |
472 | jiffies = cputime_to_jiffies(cputime_add(current->utime, | ||
473 | current->signal->utime)); | ||
474 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); | ||
475 | jiffies = cputime_to_jiffies(cputime_add(current->stime, | ||
476 | current->signal->stime)); | ||
477 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); | ||
478 | /* we really need to bite the bullet and change layout */ | 470 | /* we really need to bite the bullet and change layout */ |
479 | ac.ac_uid = current->uid; | 471 | ac.ac_uid = current->uid; |
480 | ac.ac_gid = current->gid; | 472 | ac.ac_gid = current->gid; |
@@ -496,37 +488,18 @@ static void do_acct_process(long exitcode, struct file *file) | |||
496 | old_encode_dev(tty_devnum(current->signal->tty)) : 0; | 488 | old_encode_dev(tty_devnum(current->signal->tty)) : 0; |
497 | read_unlock(&tasklist_lock); | 489 | read_unlock(&tasklist_lock); |
498 | 490 | ||
499 | ac.ac_flag = 0; | 491 | spin_lock(&current->sighand->siglock); |
500 | if (current->flags & PF_FORKNOEXEC) | 492 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); |
501 | ac.ac_flag |= AFORK; | 493 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); |
502 | if (current->flags & PF_SUPERPRIV) | 494 | ac.ac_flag = pacct->ac_flag; |
503 | ac.ac_flag |= ASU; | 495 | ac.ac_mem = encode_comp_t(pacct->ac_mem); |
504 | if (current->flags & PF_DUMPCORE) | 496 | ac.ac_minflt = encode_comp_t(pacct->ac_minflt); |
505 | ac.ac_flag |= ACORE; | 497 | ac.ac_majflt = encode_comp_t(pacct->ac_majflt); |
506 | if (current->flags & PF_SIGNALED) | 498 | ac.ac_exitcode = pacct->ac_exitcode; |
507 | ac.ac_flag |= AXSIG; | 499 | spin_unlock(&current->sighand->siglock); |
508 | |||
509 | vsize = 0; | ||
510 | if (current->mm) { | ||
511 | struct vm_area_struct *vma; | ||
512 | down_read(&current->mm->mmap_sem); | ||
513 | vma = current->mm->mmap; | ||
514 | while (vma) { | ||
515 | vsize += vma->vm_end - vma->vm_start; | ||
516 | vma = vma->vm_next; | ||
517 | } | ||
518 | up_read(&current->mm->mmap_sem); | ||
519 | } | ||
520 | vsize = vsize / 1024; | ||
521 | ac.ac_mem = encode_comp_t(vsize); | ||
522 | ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ | 500 | ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ |
523 | ac.ac_rw = encode_comp_t(ac.ac_io / 1024); | 501 | ac.ac_rw = encode_comp_t(ac.ac_io / 1024); |
524 | ac.ac_minflt = encode_comp_t(current->signal->min_flt + | ||
525 | current->min_flt); | ||
526 | ac.ac_majflt = encode_comp_t(current->signal->maj_flt + | ||
527 | current->maj_flt); | ||
528 | ac.ac_swaps = encode_comp_t(0); | 502 | ac.ac_swaps = encode_comp_t(0); |
529 | ac.ac_exitcode = exitcode; | ||
530 | 503 | ||
531 | /* | 504 | /* |
532 | * Kernel segment override to datasegment and write it | 505 | * Kernel segment override to datasegment and write it |
@@ -546,12 +519,64 @@ static void do_acct_process(long exitcode, struct file *file) | |||
546 | } | 519 | } |
547 | 520 | ||
548 | /** | 521 | /** |
522 | * acct_init_pacct - initialize a new pacct_struct | ||
523 | * @pacct: per-process accounting info struct to initialize | ||
524 | */ | ||
525 | void acct_init_pacct(struct pacct_struct *pacct) | ||
526 | { | ||
527 | memset(pacct, 0, sizeof(struct pacct_struct)); | ||
528 | pacct->ac_utime = pacct->ac_stime = cputime_zero; | ||
529 | } | ||
530 | |||
531 | /** | ||
532 | * acct_collect - collect accounting information into pacct_struct | ||
533 | * @exitcode: task exit code | ||
534 | * @group_dead: not 0, if this thread is the last one in the process. | ||
535 | */ | ||
536 | void acct_collect(long exitcode, int group_dead) | ||
537 | { | ||
538 | struct pacct_struct *pacct = &current->signal->pacct; | ||
539 | unsigned long vsize = 0; | ||
540 | |||
541 | if (group_dead && current->mm) { | ||
542 | struct vm_area_struct *vma; | ||
543 | down_read(&current->mm->mmap_sem); | ||
544 | vma = current->mm->mmap; | ||
545 | while (vma) { | ||
546 | vsize += vma->vm_end - vma->vm_start; | ||
547 | vma = vma->vm_next; | ||
548 | } | ||
549 | up_read(&current->mm->mmap_sem); | ||
550 | } | ||
551 | |||
552 | spin_lock_irq(&current->sighand->siglock); | ||
553 | if (group_dead) | ||
554 | pacct->ac_mem = vsize / 1024; | ||
555 | if (thread_group_leader(current)) { | ||
556 | pacct->ac_exitcode = exitcode; | ||
557 | if (current->flags & PF_FORKNOEXEC) | ||
558 | pacct->ac_flag |= AFORK; | ||
559 | } | ||
560 | if (current->flags & PF_SUPERPRIV) | ||
561 | pacct->ac_flag |= ASU; | ||
562 | if (current->flags & PF_DUMPCORE) | ||
563 | pacct->ac_flag |= ACORE; | ||
564 | if (current->flags & PF_SIGNALED) | ||
565 | pacct->ac_flag |= AXSIG; | ||
566 | pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); | ||
567 | pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); | ||
568 | pacct->ac_minflt += current->min_flt; | ||
569 | pacct->ac_majflt += current->maj_flt; | ||
570 | spin_unlock_irq(&current->sighand->siglock); | ||
571 | } | ||
572 | |||
573 | /** | ||
549 | * acct_process - now just a wrapper around do_acct_process | 574 | * acct_process - now just a wrapper around do_acct_process |
550 | * @exitcode: task exit code | 575 | * @exitcode: task exit code |
551 | * | 576 | * |
552 | * handles process accounting for an exiting task | 577 | * handles process accounting for an exiting task |
553 | */ | 578 | */ |
554 | void acct_process(long exitcode) | 579 | void acct_process(void) |
555 | { | 580 | { |
556 | struct file *file = NULL; | 581 | struct file *file = NULL; |
557 | 582 | ||
@@ -570,7 +595,7 @@ void acct_process(long exitcode) | |||
570 | get_file(file); | 595 | get_file(file); |
571 | spin_unlock(&acct_globals.lock); | 596 | spin_unlock(&acct_globals.lock); |
572 | 597 | ||
573 | do_acct_process(exitcode, file); | 598 | do_acct_process(file); |
574 | fput(file); | 599 | fput(file); |
575 | } | 600 | } |
576 | 601 | ||
@@ -599,9 +624,7 @@ void acct_update_integrals(struct task_struct *tsk) | |||
599 | */ | 624 | */ |
600 | void acct_clear_integrals(struct task_struct *tsk) | 625 | void acct_clear_integrals(struct task_struct *tsk) |
601 | { | 626 | { |
602 | if (tsk) { | 627 | tsk->acct_stimexpd = 0; |
603 | tsk->acct_stimexpd = 0; | 628 | tsk->acct_rss_mem1 = 0; |
604 | tsk->acct_rss_mem1 = 0; | 629 | tsk->acct_vm_mem1 = 0; |
605 | tsk->acct_vm_mem1 = 0; | ||
606 | } | ||
607 | } | 630 | } |
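The kernel/acct.c changes above split BSD accounting into a per-thread collection step (acct_collect(), accumulating into current->signal->pacct under siglock) and a single record written when the last thread of the group exits (acct_process(), which no longer takes an exit code). A minimal sketch of the intended call order follows; the function is hypothetical and only illustrates usage, since the real call sites live in kernel/exit.c and kernel/fork.c, outside this diff.

#include <linux/acct.h>

/* Hypothetical caller, for illustration only. */
static void example_exit_accounting(long code, int group_dead)
{
        /* Fold this thread's CPU times, fault counts and flags into
         * current->signal->pacct; vsize is sampled only by the last
         * thread of the group. */
        acct_collect(code, group_dead);

        /* One record per process, written once the whole thread group
         * is dead, so the exit code no longer has to be threaded
         * through do_acct_process(). */
        if (group_dead)
                acct_process();
}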
diff --git a/kernel/audit.c b/kernel/audit.c
index df57b493e1cb..d417ca1db79b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/skbuff.h> | 56 | #include <linux/skbuff.h> |
57 | #include <linux/netlink.h> | 57 | #include <linux/netlink.h> |
58 | #include <linux/selinux.h> | 58 | #include <linux/selinux.h> |
59 | #include <linux/inotify.h> | ||
59 | 60 | ||
60 | #include "audit.h" | 61 | #include "audit.h" |
61 | 62 | ||
@@ -89,6 +90,7 @@ static int audit_backlog_wait_overflow = 0; | |||
89 | /* The identity of the user shutting down the audit system. */ | 90 | /* The identity of the user shutting down the audit system. */ |
90 | uid_t audit_sig_uid = -1; | 91 | uid_t audit_sig_uid = -1; |
91 | pid_t audit_sig_pid = -1; | 92 | pid_t audit_sig_pid = -1; |
93 | u32 audit_sig_sid = 0; | ||
92 | 94 | ||
93 | /* Records can be lost in several ways: | 95 | /* Records can be lost in several ways: |
94 | 0) [suppressed in audit_alloc] | 96 | 0) [suppressed in audit_alloc] |
@@ -102,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0); | |||
102 | /* The netlink socket. */ | 104 | /* The netlink socket. */ |
103 | static struct sock *audit_sock; | 105 | static struct sock *audit_sock; |
104 | 106 | ||
107 | /* Inotify handle. */ | ||
108 | struct inotify_handle *audit_ih; | ||
109 | |||
110 | /* Hash for inode-based rules */ | ||
111 | struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; | ||
112 | |||
105 | /* The audit_freelist is a list of pre-allocated audit buffers (if more | 113 | /* The audit_freelist is a list of pre-allocated audit buffers (if more |
106 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of | 114 | * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of |
107 | * being placed on the freelist). */ | 115 | * being placed on the freelist). */ |
@@ -114,10 +122,8 @@ static struct task_struct *kauditd_task; | |||
114 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); | 122 | static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); |
115 | static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); | 123 | static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); |
116 | 124 | ||
117 | /* The netlink socket is only to be read by 1 CPU, which lets us assume | 125 | /* Serialize requests from userspace. */ |
118 | * that list additions and deletions never happen simultaneously in | 126 | static DEFINE_MUTEX(audit_cmd_mutex); |
119 | * auditsc.c */ | ||
120 | DEFINE_MUTEX(audit_netlink_mutex); | ||
121 | 127 | ||
122 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting | 128 | /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting |
123 | * audit records. Since printk uses a 1024 byte buffer, this buffer | 129 | * audit records. Since printk uses a 1024 byte buffer, this buffer |
@@ -250,7 +256,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) | |||
250 | "audit_rate_limit=%d old=%d by auid=%u", | 256 | "audit_rate_limit=%d old=%d by auid=%u", |
251 | limit, old, loginuid); | 257 | limit, old, loginuid); |
252 | audit_rate_limit = limit; | 258 | audit_rate_limit = limit; |
253 | return old; | 259 | return 0; |
254 | } | 260 | } |
255 | 261 | ||
256 | static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) | 262 | static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) |
@@ -273,7 +279,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) | |||
273 | "audit_backlog_limit=%d old=%d by auid=%u", | 279 | "audit_backlog_limit=%d old=%d by auid=%u", |
274 | limit, old, loginuid); | 280 | limit, old, loginuid); |
275 | audit_backlog_limit = limit; | 281 | audit_backlog_limit = limit; |
276 | return old; | 282 | return 0; |
277 | } | 283 | } |
278 | 284 | ||
279 | static int audit_set_enabled(int state, uid_t loginuid, u32 sid) | 285 | static int audit_set_enabled(int state, uid_t loginuid, u32 sid) |
@@ -299,7 +305,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid) | |||
299 | "audit_enabled=%d old=%d by auid=%u", | 305 | "audit_enabled=%d old=%d by auid=%u", |
300 | state, old, loginuid); | 306 | state, old, loginuid); |
301 | audit_enabled = state; | 307 | audit_enabled = state; |
302 | return old; | 308 | return 0; |
303 | } | 309 | } |
304 | 310 | ||
305 | static int audit_set_failure(int state, uid_t loginuid, u32 sid) | 311 | static int audit_set_failure(int state, uid_t loginuid, u32 sid) |
@@ -327,7 +333,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid) | |||
327 | "audit_failure=%d old=%d by auid=%u", | 333 | "audit_failure=%d old=%d by auid=%u", |
328 | state, old, loginuid); | 334 | state, old, loginuid); |
329 | audit_failure = state; | 335 | audit_failure = state; |
330 | return old; | 336 | return 0; |
331 | } | 337 | } |
332 | 338 | ||
333 | static int kauditd_thread(void *dummy) | 339 | static int kauditd_thread(void *dummy) |
@@ -363,9 +369,52 @@ static int kauditd_thread(void *dummy) | |||
363 | remove_wait_queue(&kauditd_wait, &wait); | 369 | remove_wait_queue(&kauditd_wait, &wait); |
364 | } | 370 | } |
365 | } | 371 | } |
372 | } | ||
373 | |||
374 | int audit_send_list(void *_dest) | ||
375 | { | ||
376 | struct audit_netlink_list *dest = _dest; | ||
377 | int pid = dest->pid; | ||
378 | struct sk_buff *skb; | ||
379 | |||
380 | /* wait for parent to finish and send an ACK */ | ||
381 | mutex_lock(&audit_cmd_mutex); | ||
382 | mutex_unlock(&audit_cmd_mutex); | ||
383 | |||
384 | while ((skb = __skb_dequeue(&dest->q)) != NULL) | ||
385 | netlink_unicast(audit_sock, skb, pid, 0); | ||
386 | |||
387 | kfree(dest); | ||
388 | |||
366 | return 0; | 389 | return 0; |
367 | } | 390 | } |
368 | 391 | ||
392 | struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, | ||
393 | int multi, void *payload, int size) | ||
394 | { | ||
395 | struct sk_buff *skb; | ||
396 | struct nlmsghdr *nlh; | ||
397 | int len = NLMSG_SPACE(size); | ||
398 | void *data; | ||
399 | int flags = multi ? NLM_F_MULTI : 0; | ||
400 | int t = done ? NLMSG_DONE : type; | ||
401 | |||
402 | skb = alloc_skb(len, GFP_KERNEL); | ||
403 | if (!skb) | ||
404 | return NULL; | ||
405 | |||
406 | nlh = NLMSG_PUT(skb, pid, seq, t, size); | ||
407 | nlh->nlmsg_flags = flags; | ||
408 | data = NLMSG_DATA(nlh); | ||
409 | memcpy(data, payload, size); | ||
410 | return skb; | ||
411 | |||
412 | nlmsg_failure: /* Used by NLMSG_PUT */ | ||
413 | if (skb) | ||
414 | kfree_skb(skb); | ||
415 | return NULL; | ||
416 | } | ||
417 | |||
369 | /** | 418 | /** |
370 | * audit_send_reply - send an audit reply message via netlink | 419 | * audit_send_reply - send an audit reply message via netlink |
371 | * @pid: process id to send reply to | 420 | * @pid: process id to send reply to |
@@ -383,36 +432,20 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, | |||
383 | void *payload, int size) | 432 | void *payload, int size) |
384 | { | 433 | { |
385 | struct sk_buff *skb; | 434 | struct sk_buff *skb; |
386 | struct nlmsghdr *nlh; | 435 | skb = audit_make_reply(pid, seq, type, done, multi, payload, size); |
387 | int len = NLMSG_SPACE(size); | ||
388 | void *data; | ||
389 | int flags = multi ? NLM_F_MULTI : 0; | ||
390 | int t = done ? NLMSG_DONE : type; | ||
391 | |||
392 | skb = alloc_skb(len, GFP_KERNEL); | ||
393 | if (!skb) | 436 | if (!skb) |
394 | return; | 437 | return; |
395 | |||
396 | nlh = NLMSG_PUT(skb, pid, seq, t, size); | ||
397 | nlh->nlmsg_flags = flags; | ||
398 | data = NLMSG_DATA(nlh); | ||
399 | memcpy(data, payload, size); | ||
400 | |||
401 | /* Ignore failure. It'll only happen if the sender goes away, | 438 | /* Ignore failure. It'll only happen if the sender goes away, |
402 | because our timeout is set to infinite. */ | 439 | because our timeout is set to infinite. */ |
403 | netlink_unicast(audit_sock, skb, pid, 0); | 440 | netlink_unicast(audit_sock, skb, pid, 0); |
404 | return; | 441 | return; |
405 | |||
406 | nlmsg_failure: /* Used by NLMSG_PUT */ | ||
407 | if (skb) | ||
408 | kfree_skb(skb); | ||
409 | } | 442 | } |
410 | 443 | ||
411 | /* | 444 | /* |
412 | * Check for appropriate CAP_AUDIT_ capabilities on incoming audit | 445 | * Check for appropriate CAP_AUDIT_ capabilities on incoming audit |
413 | * control messages. | 446 | * control messages. |
414 | */ | 447 | */ |
415 | static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) | 448 | static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) |
416 | { | 449 | { |
417 | int err = 0; | 450 | int err = 0; |
418 | 451 | ||
@@ -426,13 +459,13 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) | |||
426 | case AUDIT_DEL: | 459 | case AUDIT_DEL: |
427 | case AUDIT_DEL_RULE: | 460 | case AUDIT_DEL_RULE: |
428 | case AUDIT_SIGNAL_INFO: | 461 | case AUDIT_SIGNAL_INFO: |
429 | if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) | 462 | if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) |
430 | err = -EPERM; | 463 | err = -EPERM; |
431 | break; | 464 | break; |
432 | case AUDIT_USER: | 465 | case AUDIT_USER: |
433 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: | 466 | case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: |
434 | case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: | 467 | case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: |
435 | if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) | 468 | if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) |
436 | err = -EPERM; | 469 | err = -EPERM; |
437 | break; | 470 | break; |
438 | default: /* bad msg */ | 471 | default: /* bad msg */ |
@@ -451,9 +484,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
451 | struct audit_buffer *ab; | 484 | struct audit_buffer *ab; |
452 | u16 msg_type = nlh->nlmsg_type; | 485 | u16 msg_type = nlh->nlmsg_type; |
453 | uid_t loginuid; /* loginuid of sender */ | 486 | uid_t loginuid; /* loginuid of sender */ |
454 | struct audit_sig_info sig_data; | 487 | struct audit_sig_info *sig_data; |
488 | char *ctx; | ||
489 | u32 len; | ||
455 | 490 | ||
456 | err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); | 491 | err = audit_netlink_ok(skb, msg_type); |
457 | if (err) | 492 | if (err) |
458 | return err; | 493 | return err; |
459 | 494 | ||
@@ -503,12 +538,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
503 | if (status_get->mask & AUDIT_STATUS_PID) { | 538 | if (status_get->mask & AUDIT_STATUS_PID) { |
504 | int old = audit_pid; | 539 | int old = audit_pid; |
505 | if (sid) { | 540 | if (sid) { |
506 | char *ctx = NULL; | 541 | if ((err = selinux_ctxid_to_string( |
507 | u32 len; | ||
508 | int rc; | ||
509 | if ((rc = selinux_ctxid_to_string( | ||
510 | sid, &ctx, &len))) | 542 | sid, &ctx, &len))) |
511 | return rc; | 543 | return err; |
512 | else | 544 | else |
513 | audit_log(NULL, GFP_KERNEL, | 545 | audit_log(NULL, GFP_KERNEL, |
514 | AUDIT_CONFIG_CHANGE, | 546 | AUDIT_CONFIG_CHANGE, |
@@ -523,10 +555,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
523 | audit_pid = status_get->pid; | 555 | audit_pid = status_get->pid; |
524 | } | 556 | } |
525 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) | 557 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) |
526 | audit_set_rate_limit(status_get->rate_limit, | 558 | err = audit_set_rate_limit(status_get->rate_limit, |
527 | loginuid, sid); | 559 | loginuid, sid); |
528 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) | 560 | if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) |
529 | audit_set_backlog_limit(status_get->backlog_limit, | 561 | err = audit_set_backlog_limit(status_get->backlog_limit, |
530 | loginuid, sid); | 562 | loginuid, sid); |
531 | break; | 563 | break; |
532 | case AUDIT_USER: | 564 | case AUDIT_USER: |
@@ -544,8 +576,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
544 | "user pid=%d uid=%u auid=%u", | 576 | "user pid=%d uid=%u auid=%u", |
545 | pid, uid, loginuid); | 577 | pid, uid, loginuid); |
546 | if (sid) { | 578 | if (sid) { |
547 | char *ctx = NULL; | ||
548 | u32 len; | ||
549 | if (selinux_ctxid_to_string( | 579 | if (selinux_ctxid_to_string( |
550 | sid, &ctx, &len)) { | 580 | sid, &ctx, &len)) { |
551 | audit_log_format(ab, | 581 | audit_log_format(ab, |
@@ -584,10 +614,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
584 | loginuid, sid); | 614 | loginuid, sid); |
585 | break; | 615 | break; |
586 | case AUDIT_SIGNAL_INFO: | 616 | case AUDIT_SIGNAL_INFO: |
587 | sig_data.uid = audit_sig_uid; | 617 | err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len); |
588 | sig_data.pid = audit_sig_pid; | 618 | if (err) |
619 | return err; | ||
620 | sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); | ||
621 | if (!sig_data) { | ||
622 | kfree(ctx); | ||
623 | return -ENOMEM; | ||
624 | } | ||
625 | sig_data->uid = audit_sig_uid; | ||
626 | sig_data->pid = audit_sig_pid; | ||
627 | memcpy(sig_data->ctx, ctx, len); | ||
628 | kfree(ctx); | ||
589 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, | 629 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, |
590 | 0, 0, &sig_data, sizeof(sig_data)); | 630 | 0, 0, sig_data, sizeof(*sig_data) + len); |
631 | kfree(sig_data); | ||
591 | break; | 632 | break; |
592 | default: | 633 | default: |
593 | err = -EINVAL; | 634 | err = -EINVAL; |
@@ -629,20 +670,30 @@ static void audit_receive(struct sock *sk, int length) | |||
629 | struct sk_buff *skb; | 670 | struct sk_buff *skb; |
630 | unsigned int qlen; | 671 | unsigned int qlen; |
631 | 672 | ||
632 | mutex_lock(&audit_netlink_mutex); | 673 | mutex_lock(&audit_cmd_mutex); |
633 | 674 | ||
634 | for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { | 675 | for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { |
635 | skb = skb_dequeue(&sk->sk_receive_queue); | 676 | skb = skb_dequeue(&sk->sk_receive_queue); |
636 | audit_receive_skb(skb); | 677 | audit_receive_skb(skb); |
637 | kfree_skb(skb); | 678 | kfree_skb(skb); |
638 | } | 679 | } |
639 | mutex_unlock(&audit_netlink_mutex); | 680 | mutex_unlock(&audit_cmd_mutex); |
640 | } | 681 | } |
641 | 682 | ||
683 | #ifdef CONFIG_AUDITSYSCALL | ||
684 | static const struct inotify_operations audit_inotify_ops = { | ||
685 | .handle_event = audit_handle_ievent, | ||
686 | .destroy_watch = audit_free_parent, | ||
687 | }; | ||
688 | #endif | ||
642 | 689 | ||
643 | /* Initialize audit support at boot time. */ | 690 | /* Initialize audit support at boot time. */ |
644 | static int __init audit_init(void) | 691 | static int __init audit_init(void) |
645 | { | 692 | { |
693 | #ifdef CONFIG_AUDITSYSCALL | ||
694 | int i; | ||
695 | #endif | ||
696 | |||
646 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 697 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", |
647 | audit_default ? "enabled" : "disabled"); | 698 | audit_default ? "enabled" : "disabled"); |
648 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, | 699 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, |
@@ -661,6 +712,16 @@ static int __init audit_init(void) | |||
661 | selinux_audit_set_callback(&selinux_audit_rule_update); | 712 | selinux_audit_set_callback(&selinux_audit_rule_update); |
662 | 713 | ||
663 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); | 714 | audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); |
715 | |||
716 | #ifdef CONFIG_AUDITSYSCALL | ||
717 | audit_ih = inotify_init(&audit_inotify_ops); | ||
718 | if (IS_ERR(audit_ih)) | ||
719 | audit_panic("cannot initialize inotify handle"); | ||
720 | |||
721 | for (i = 0; i < AUDIT_INODE_BUCKETS; i++) | ||
722 | INIT_LIST_HEAD(&audit_inode_hash[i]); | ||
723 | #endif | ||
724 | |||
664 | return 0; | 725 | return 0; |
665 | } | 726 | } |
666 | __initcall(audit_init); | 727 | __initcall(audit_init); |
@@ -690,10 +751,12 @@ static void audit_buffer_free(struct audit_buffer *ab) | |||
690 | kfree_skb(ab->skb); | 751 | kfree_skb(ab->skb); |
691 | 752 | ||
692 | spin_lock_irqsave(&audit_freelist_lock, flags); | 753 | spin_lock_irqsave(&audit_freelist_lock, flags); |
693 | if (++audit_freelist_count > AUDIT_MAXFREE) | 754 | if (audit_freelist_count > AUDIT_MAXFREE) |
694 | kfree(ab); | 755 | kfree(ab); |
695 | else | 756 | else { |
757 | audit_freelist_count++; | ||
696 | list_add(&ab->list, &audit_freelist); | 758 | list_add(&ab->list, &audit_freelist); |
759 | } | ||
697 | spin_unlock_irqrestore(&audit_freelist_lock, flags); | 760 | spin_unlock_irqrestore(&audit_freelist_lock, flags); |
698 | } | 761 | } |
699 | 762 | ||
@@ -755,7 +818,7 @@ err: | |||
755 | */ | 818 | */ |
756 | unsigned int audit_serial(void) | 819 | unsigned int audit_serial(void) |
757 | { | 820 | { |
758 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; | 821 | static DEFINE_SPINLOCK(serial_lock); |
759 | static unsigned int serial = 0; | 822 | static unsigned int serial = 0; |
760 | 823 | ||
761 | unsigned long flags; | 824 | unsigned long flags; |
@@ -988,28 +1051,76 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, | |||
988 | skb_put(skb, len << 1); /* new string is twice the old string */ | 1051 | skb_put(skb, len << 1); /* new string is twice the old string */ |
989 | } | 1052 | } |
990 | 1053 | ||
1054 | /* | ||
1055 | * Format a string of no more than slen characters into the audit buffer, | ||
1056 | * enclosed in quote marks. | ||
1057 | */ | ||
1058 | static void audit_log_n_string(struct audit_buffer *ab, size_t slen, | ||
1059 | const char *string) | ||
1060 | { | ||
1061 | int avail, new_len; | ||
1062 | unsigned char *ptr; | ||
1063 | struct sk_buff *skb; | ||
1064 | |||
1065 | BUG_ON(!ab->skb); | ||
1066 | skb = ab->skb; | ||
1067 | avail = skb_tailroom(skb); | ||
1068 | new_len = slen + 3; /* enclosing quotes + null terminator */ | ||
1069 | if (new_len > avail) { | ||
1070 | avail = audit_expand(ab, new_len); | ||
1071 | if (!avail) | ||
1072 | return; | ||
1073 | } | ||
1074 | ptr = skb->tail; | ||
1075 | *ptr++ = '"'; | ||
1076 | memcpy(ptr, string, slen); | ||
1077 | ptr += slen; | ||
1078 | *ptr++ = '"'; | ||
1079 | *ptr = 0; | ||
1080 | skb_put(skb, slen + 2); /* don't include null terminator */ | ||
1081 | } | ||
1082 | |||
991 | /** | 1083 | /** |
992 | * audit_log_unstrustedstring - log a string that may contain random characters | 1084 | * audit_log_n_unstrustedstring - log a string that may contain random characters |
993 | * @ab: audit_buffer | 1085 | * @ab: audit_buffer |
1086 | * @len: lenth of string (not including trailing null) | ||
994 | * @string: string to be logged | 1087 | * @string: string to be logged |
995 | * | 1088 | * |
996 | * This code will escape a string that is passed to it if the string | 1089 | * This code will escape a string that is passed to it if the string |
997 | * contains a control character, unprintable character, double quote mark, | 1090 | * contains a control character, unprintable character, double quote mark, |
998 | * or a space. Unescaped strings will start and end with a double quote mark. | 1091 | * or a space. Unescaped strings will start and end with a double quote mark. |
999 | * Strings that are escaped are printed in hex (2 digits per char). | 1092 | * Strings that are escaped are printed in hex (2 digits per char). |
1093 | * | ||
1094 | * The caller specifies the number of characters in the string to log, which may | ||
1095 | * or may not be the entire string. | ||
1000 | */ | 1096 | */ |
1001 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) | 1097 | const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, |
1098 | const char *string) | ||
1002 | { | 1099 | { |
1003 | const unsigned char *p = string; | 1100 | const unsigned char *p = string; |
1004 | 1101 | ||
1005 | while (*p) { | 1102 | while (*p) { |
1006 | if (*p == '"' || *p < 0x21 || *p > 0x7f) { | 1103 | if (*p == '"' || *p < 0x21 || *p > 0x7f) { |
1007 | audit_log_hex(ab, string, strlen(string)); | 1104 | audit_log_hex(ab, string, len); |
1008 | return; | 1105 | return string + len + 1; |
1009 | } | 1106 | } |
1010 | p++; | 1107 | p++; |
1011 | } | 1108 | } |
1012 | audit_log_format(ab, "\"%s\"", string); | 1109 | audit_log_n_string(ab, len, string); |
1110 | return p + 1; | ||
1111 | } | ||
1112 | |||
1113 | /** | ||
1114 | * audit_log_unstrustedstring - log a string that may contain random characters | ||
1115 | * @ab: audit_buffer | ||
1116 | * @string: string to be logged | ||
1117 | * | ||
1118 | * Same as audit_log_n_unstrustedstring(), except that strlen is used to | ||
1119 | * determine string length. | ||
1120 | */ | ||
1121 | const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) | ||
1122 | { | ||
1123 | return audit_log_n_untrustedstring(ab, strlen(string), string); | ||
1013 | } | 1124 | } |
1014 | 1125 | ||
1015 | /* This is a helper-function to print the escaped d_path */ | 1126 | /* This is a helper-function to print the escaped d_path */ |
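audit_log_n_untrustedstring() above logs a caller-bounded number of bytes and returns a pointer just past the string it consumed, so a caller can walk a buffer of NUL-separated names; the strlen()-based audit_log_untrustedstring() wrapper keeps the old interface. A small usage sketch with made-up data (the audit_buffer would normally come from audit_log_start()):

#include <linux/audit.h>
#include <linux/string.h>

/* Illustrative only; not part of the patch. */
static void example_log_two_names(struct audit_buffer *ab)
{
        const char buf[] = "name one\0name2";  /* two NUL-separated strings */
        const char *p = buf;

        /* "name one" contains a space, so it is hex-encoded; the return
         * value points just past its terminating NUL, i.e. at "name2". */
        p = audit_log_n_untrustedstring(ab, strlen(p), p);

        /* Printable, so this one is logged in double quotes. */
        audit_log_untrustedstring(ab, p);
}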
diff --git a/kernel/audit.h b/kernel/audit.h
index 6f733920fd32..6aa33b848cf2 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -19,9 +19,9 @@ | |||
19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/mutex.h> | ||
23 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
24 | #include <linux/audit.h> | 23 | #include <linux/audit.h> |
24 | #include <linux/skbuff.h> | ||
25 | 25 | ||
26 | /* 0 = no checking | 26 | /* 0 = no checking |
27 | 1 = put_count checking | 27 | 1 = put_count checking |
@@ -53,6 +53,18 @@ enum audit_state { | |||
53 | }; | 53 | }; |
54 | 54 | ||
55 | /* Rule lists */ | 55 | /* Rule lists */ |
56 | struct audit_parent; | ||
57 | |||
58 | struct audit_watch { | ||
59 | atomic_t count; /* reference count */ | ||
60 | char *path; /* insertion path */ | ||
61 | dev_t dev; /* associated superblock device */ | ||
62 | unsigned long ino; /* associated inode number */ | ||
63 | struct audit_parent *parent; /* associated parent */ | ||
64 | struct list_head wlist; /* entry in parent->watches list */ | ||
65 | struct list_head rules; /* associated rules */ | ||
66 | }; | ||
67 | |||
56 | struct audit_field { | 68 | struct audit_field { |
57 | u32 type; | 69 | u32 type; |
58 | u32 val; | 70 | u32 val; |
@@ -69,7 +81,11 @@ struct audit_krule { | |||
69 | u32 mask[AUDIT_BITMASK_SIZE]; | 81 | u32 mask[AUDIT_BITMASK_SIZE]; |
70 | u32 buflen; /* for data alloc on list rules */ | 82 | u32 buflen; /* for data alloc on list rules */ |
71 | u32 field_count; | 83 | u32 field_count; |
84 | char *filterkey; /* ties events to rules */ | ||
72 | struct audit_field *fields; | 85 | struct audit_field *fields; |
86 | struct audit_field *inode_f; /* quick access to an inode field */ | ||
87 | struct audit_watch *watch; /* associated watch */ | ||
88 | struct list_head rlist; /* entry in audit_watch.rules list */ | ||
73 | }; | 89 | }; |
74 | 90 | ||
75 | struct audit_entry { | 91 | struct audit_entry { |
@@ -78,15 +94,53 @@ struct audit_entry { | |||
78 | struct audit_krule rule; | 94 | struct audit_krule rule; |
79 | }; | 95 | }; |
80 | 96 | ||
81 | |||
82 | extern int audit_pid; | 97 | extern int audit_pid; |
83 | extern int audit_comparator(const u32 left, const u32 op, const u32 right); | ||
84 | 98 | ||
99 | #define AUDIT_INODE_BUCKETS 32 | ||
100 | extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; | ||
101 | |||
102 | static inline int audit_hash_ino(u32 ino) | ||
103 | { | ||
104 | return (ino & (AUDIT_INODE_BUCKETS-1)); | ||
105 | } | ||
106 | |||
107 | extern int audit_comparator(const u32 left, const u32 op, const u32 right); | ||
108 | extern int audit_compare_dname_path(const char *dname, const char *path, | ||
109 | int *dirlen); | ||
110 | extern struct sk_buff * audit_make_reply(int pid, int seq, int type, | ||
111 | int done, int multi, | ||
112 | void *payload, int size); | ||
85 | extern void audit_send_reply(int pid, int seq, int type, | 113 | extern void audit_send_reply(int pid, int seq, int type, |
86 | int done, int multi, | 114 | int done, int multi, |
87 | void *payload, int size); | 115 | void *payload, int size); |
88 | extern void audit_log_lost(const char *message); | 116 | extern void audit_log_lost(const char *message); |
89 | extern void audit_panic(const char *message); | 117 | extern void audit_panic(const char *message); |
90 | extern struct mutex audit_netlink_mutex; | ||
91 | 118 | ||
119 | struct audit_netlink_list { | ||
120 | int pid; | ||
121 | struct sk_buff_head q; | ||
122 | }; | ||
123 | |||
124 | int audit_send_list(void *); | ||
125 | |||
126 | struct inotify_watch; | ||
127 | extern void audit_free_parent(struct inotify_watch *); | ||
128 | extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, | ||
129 | const char *, struct inode *); | ||
92 | extern int selinux_audit_rule_update(void); | 130 | extern int selinux_audit_rule_update(void); |
131 | |||
132 | #ifdef CONFIG_AUDITSYSCALL | ||
133 | extern void __audit_signal_info(int sig, struct task_struct *t); | ||
134 | static inline void audit_signal_info(int sig, struct task_struct *t) | ||
135 | { | ||
136 | if (unlikely(audit_pid && t->tgid == audit_pid)) | ||
137 | __audit_signal_info(sig, t); | ||
138 | } | ||
139 | extern enum audit_state audit_filter_inodes(struct task_struct *, | ||
140 | struct audit_context *); | ||
141 | extern void audit_set_auditable(struct audit_context *); | ||
142 | #else | ||
143 | #define audit_signal_info(s,t) | ||
144 | #define audit_filter_inodes(t,c) AUDIT_DISABLED | ||
145 | #define audit_set_auditable(c) | ||
146 | #endif | ||
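The header now hashes inode-keyed rules into 32 buckets. A sketch of how a bucket is selected (the helper name is made up; the real lookups sit in auditsc.c and auditfilter.c):

/* Illustrative only: rules tied to a watch or an AUDIT_INODE field are
 * chained into one of the AUDIT_INODE_BUCKETS lists rather than the flat
 * exit filter list, so audit_filter_inodes(), declared above, only has to
 * walk a single bucket per inode. */
static struct list_head *example_rule_bucket(unsigned long ino)
{
        return &audit_inode_hash[audit_hash_ino((u32)ino)];
}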
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7c134906d689..5b4e16276ca0 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -22,13 +22,59 @@ | |||
22 | #include <linux/kernel.h> | 22 | #include <linux/kernel.h> |
23 | #include <linux/audit.h> | 23 | #include <linux/audit.h> |
24 | #include <linux/kthread.h> | 24 | #include <linux/kthread.h> |
25 | #include <linux/mutex.h> | ||
26 | #include <linux/fs.h> | ||
27 | #include <linux/namei.h> | ||
25 | #include <linux/netlink.h> | 28 | #include <linux/netlink.h> |
29 | #include <linux/sched.h> | ||
30 | #include <linux/inotify.h> | ||
26 | #include <linux/selinux.h> | 31 | #include <linux/selinux.h> |
27 | #include "audit.h" | 32 | #include "audit.h" |
28 | 33 | ||
29 | /* There are three lists of rules -- one to search at task creation | 34 | /* |
30 | * time, one to search at syscall entry time, and another to search at | 35 | * Locking model: |
31 | * syscall exit time. */ | 36 | * |
37 | * audit_filter_mutex: | ||
38 | * Synchronizes writes and blocking reads of audit's filterlist | ||
39 | * data. Rcu is used to traverse the filterlist and access | ||
40 | * contents of structs audit_entry, audit_watch and opaque | ||
41 | * selinux rules during filtering. If modified, these structures | ||
42 | * must be copied and replace their counterparts in the filterlist. | ||
43 | * An audit_parent struct is not accessed during filtering, so may | ||
44 | * be written directly provided audit_filter_mutex is held. | ||
45 | */ | ||
46 | |||
47 | /* | ||
48 | * Reference counting: | ||
49 | * | ||
50 | * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED | ||
51 | * event. Each audit_watch holds a reference to its associated parent. | ||
52 | * | ||
53 | * audit_watch: if added to lists, lifetime is from audit_init_watch() to | ||
54 | * audit_remove_watch(). Additionally, an audit_watch may exist | ||
55 | * temporarily to assist in searching existing filter data. Each | ||
56 | * audit_krule holds a reference to its associated watch. | ||
57 | */ | ||
58 | |||
59 | struct audit_parent { | ||
60 | struct list_head ilist; /* entry in inotify registration list */ | ||
61 | struct list_head watches; /* associated watches */ | ||
62 | struct inotify_watch wdata; /* inotify watch data */ | ||
63 | unsigned flags; /* status flags */ | ||
64 | }; | ||
65 | |||
66 | /* | ||
67 | * audit_parent status flags: | ||
68 | * | ||
69 | * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to | ||
70 | * a filesystem event to ensure we're adding audit watches to a valid parent. | ||
71 | * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot | ||
72 | * receive them while we have nameidata, but must be used for IN_MOVE_SELF which | ||
73 | * we can receive while holding nameidata. | ||
74 | */ | ||
75 | #define AUDIT_PARENT_INVALID 0x001 | ||
76 | |||
77 | /* Audit filter lists, defined in <linux/audit.h> */ | ||
32 | struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { | 78 | struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { |
33 | LIST_HEAD_INIT(audit_filter_list[0]), | 79 | LIST_HEAD_INIT(audit_filter_list[0]), |
34 | LIST_HEAD_INIT(audit_filter_list[1]), | 80 | LIST_HEAD_INIT(audit_filter_list[1]), |
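The locking-model comment above implies a lock-free read side. A minimal read-side sketch of that model, assuming the list member of struct audit_entry defined in kernel/audit.h; the match body is a placeholder:

/* Filtering walks the lists under rcu_read_lock(); writers hold
 * audit_filter_mutex and replace entries rather than editing them in
 * place, so readers never see a half-updated rule. */
static void example_filter_walk(int listnr)
{
        struct audit_entry *e;

        rcu_read_lock();
        list_for_each_entry_rcu(e, &audit_filter_list[listnr], list) {
                /* ... match e->rule against the current event ... */
        }
        rcu_read_unlock();
}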
@@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { | |||
41 | #endif | 87 | #endif |
42 | }; | 88 | }; |
43 | 89 | ||
90 | static DEFINE_MUTEX(audit_filter_mutex); | ||
91 | |||
92 | /* Inotify handle */ | ||
93 | extern struct inotify_handle *audit_ih; | ||
94 | |||
95 | /* Inotify events we care about. */ | ||
96 | #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF | ||
97 | |||
98 | void audit_free_parent(struct inotify_watch *i_watch) | ||
99 | { | ||
100 | struct audit_parent *parent; | ||
101 | |||
102 | parent = container_of(i_watch, struct audit_parent, wdata); | ||
103 | WARN_ON(!list_empty(&parent->watches)); | ||
104 | kfree(parent); | ||
105 | } | ||
106 | |||
107 | static inline void audit_get_watch(struct audit_watch *watch) | ||
108 | { | ||
109 | atomic_inc(&watch->count); | ||
110 | } | ||
111 | |||
112 | static void audit_put_watch(struct audit_watch *watch) | ||
113 | { | ||
114 | if (atomic_dec_and_test(&watch->count)) { | ||
115 | WARN_ON(watch->parent); | ||
116 | WARN_ON(!list_empty(&watch->rules)); | ||
117 | kfree(watch->path); | ||
118 | kfree(watch); | ||
119 | } | ||
120 | } | ||
121 | |||
122 | static void audit_remove_watch(struct audit_watch *watch) | ||
123 | { | ||
124 | list_del(&watch->wlist); | ||
125 | put_inotify_watch(&watch->parent->wdata); | ||
126 | watch->parent = NULL; | ||
127 | audit_put_watch(watch); /* match initial get */ | ||
128 | } | ||
129 | |||
44 | static inline void audit_free_rule(struct audit_entry *e) | 130 | static inline void audit_free_rule(struct audit_entry *e) |
45 | { | 131 | { |
46 | int i; | 132 | int i; |
133 | |||
134 | /* some rules don't have associated watches */ | ||
135 | if (e->rule.watch) | ||
136 | audit_put_watch(e->rule.watch); | ||
47 | if (e->rule.fields) | 137 | if (e->rule.fields) |
48 | for (i = 0; i < e->rule.field_count; i++) { | 138 | for (i = 0; i < e->rule.field_count; i++) { |
49 | struct audit_field *f = &e->rule.fields[i]; | 139 | struct audit_field *f = &e->rule.fields[i]; |
@@ -51,6 +141,7 @@ static inline void audit_free_rule(struct audit_entry *e) | |||
51 | selinux_audit_rule_free(f->se_rule); | 141 | selinux_audit_rule_free(f->se_rule); |
52 | } | 142 | } |
53 | kfree(e->rule.fields); | 143 | kfree(e->rule.fields); |
144 | kfree(e->rule.filterkey); | ||
54 | kfree(e); | 145 | kfree(e); |
55 | } | 146 | } |
56 | 147 | ||
@@ -60,6 +151,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head) | |||
60 | audit_free_rule(e); | 151 | audit_free_rule(e); |
61 | } | 152 | } |
62 | 153 | ||
154 | /* Initialize a parent watch entry. */ | ||
155 | static struct audit_parent *audit_init_parent(struct nameidata *ndp) | ||
156 | { | ||
157 | struct audit_parent *parent; | ||
158 | s32 wd; | ||
159 | |||
160 | parent = kzalloc(sizeof(*parent), GFP_KERNEL); | ||
161 | if (unlikely(!parent)) | ||
162 | return ERR_PTR(-ENOMEM); | ||
163 | |||
164 | INIT_LIST_HEAD(&parent->watches); | ||
165 | parent->flags = 0; | ||
166 | |||
167 | inotify_init_watch(&parent->wdata); | ||
168 | /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ | ||
169 | get_inotify_watch(&parent->wdata); | ||
170 | wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode, | ||
171 | AUDIT_IN_WATCH); | ||
172 | if (wd < 0) { | ||
173 | audit_free_parent(&parent->wdata); | ||
174 | return ERR_PTR(wd); | ||
175 | } | ||
176 | |||
177 | return parent; | ||
178 | } | ||
179 | |||
180 | /* Initialize a watch entry. */ | ||
181 | static struct audit_watch *audit_init_watch(char *path) | ||
182 | { | ||
183 | struct audit_watch *watch; | ||
184 | |||
185 | watch = kzalloc(sizeof(*watch), GFP_KERNEL); | ||
186 | if (unlikely(!watch)) | ||
187 | return ERR_PTR(-ENOMEM); | ||
188 | |||
189 | INIT_LIST_HEAD(&watch->rules); | ||
190 | atomic_set(&watch->count, 1); | ||
191 | watch->path = path; | ||
192 | watch->dev = (dev_t)-1; | ||
193 | watch->ino = (unsigned long)-1; | ||
194 | |||
195 | return watch; | ||
196 | } | ||
197 | |||
63 | /* Initialize an audit filterlist entry. */ | 198 | /* Initialize an audit filterlist entry. */ |
64 | static inline struct audit_entry *audit_init_entry(u32 field_count) | 199 | static inline struct audit_entry *audit_init_entry(u32 field_count) |
65 | { | 200 | { |
@@ -107,6 +242,66 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len) | |||
107 | return str; | 242 | return str; |
108 | } | 243 | } |
109 | 244 | ||
245 | /* Translate an inode field to kernel respresentation. */ | ||
246 | static inline int audit_to_inode(struct audit_krule *krule, | ||
247 | struct audit_field *f) | ||
248 | { | ||
249 | if (krule->listnr != AUDIT_FILTER_EXIT || | ||
250 | krule->watch || krule->inode_f) | ||
251 | return -EINVAL; | ||
252 | |||
253 | krule->inode_f = f; | ||
254 | return 0; | ||
255 | } | ||
256 | |||
257 | /* Translate a watch string to kernel respresentation. */ | ||
258 | static int audit_to_watch(struct audit_krule *krule, char *path, int len, | ||
259 | u32 op) | ||
260 | { | ||
261 | struct audit_watch *watch; | ||
262 | |||
263 | if (!audit_ih) | ||
264 | return -EOPNOTSUPP; | ||
265 | |||
266 | if (path[0] != '/' || path[len-1] == '/' || | ||
267 | krule->listnr != AUDIT_FILTER_EXIT || | ||
268 | op & ~AUDIT_EQUAL || | ||
269 | krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */ | ||
270 | return -EINVAL; | ||
271 | |||
272 | watch = audit_init_watch(path); | ||
273 | if (unlikely(IS_ERR(watch))) | ||
274 | return PTR_ERR(watch); | ||
275 | |||
276 | audit_get_watch(watch); | ||
277 | krule->watch = watch; | ||
278 | |||
279 | return 0; | ||
280 | } | ||
281 | |||
282 | static __u32 *classes[AUDIT_SYSCALL_CLASSES]; | ||
283 | |||
284 | int __init audit_register_class(int class, unsigned *list) | ||
285 | { | ||
286 | __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); | ||
287 | if (!p) | ||
288 | return -ENOMEM; | ||
289 | while (*list != ~0U) { | ||
290 | unsigned n = *list++; | ||
291 | if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) { | ||
292 | kfree(p); | ||
293 | return -EINVAL; | ||
294 | } | ||
295 | p[AUDIT_WORD(n)] |= AUDIT_BIT(n); | ||
296 | } | ||
297 | if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) { | ||
298 | kfree(p); | ||
299 | return -EINVAL; | ||
300 | } | ||
301 | classes[class] = p; | ||
302 | return 0; | ||
303 | } | ||
304 | |||
110 | /* Common user-space to kernel rule translation. */ | 305 | /* Common user-space to kernel rule translation. */ |
111 | static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) | 306 | static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) |
112 | { | 307 | { |
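audit_register_class() above stores, per syscall class, a bitmask built from a list of syscall numbers terminated by ~0U; a later hunk in audit_to_entry_common() folds those bits into a rule's syscall mask whenever the class bit is set. A hedged sketch of how an architecture would be expected to register such a class (the syscall numbers and the AUDIT_CLASS_WRITE index are assumptions for illustration; the real tables live in arch-specific code):

/* Illustrative only; syscall numbers and class index are assumptions. */
static unsigned example_write_class[] = {
        __NR_open,
        __NR_truncate,
        ~0U             /* sentinel expected by audit_register_class() */
};

static int __init example_register_audit_class(void)
{
        return audit_register_class(AUDIT_CLASS_WRITE, example_write_class);
}
__initcall(example_register_audit_class);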
@@ -128,8 +323,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) | |||
128 | #endif | 323 | #endif |
129 | ; | 324 | ; |
130 | } | 325 | } |
131 | if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && | 326 | if (unlikely(rule->action == AUDIT_POSSIBLE)) { |
132 | rule->action != AUDIT_ALWAYS) | 327 | printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); |
328 | goto exit_err; | ||
329 | } | ||
330 | if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) | ||
133 | goto exit_err; | 331 | goto exit_err; |
134 | if (rule->field_count > AUDIT_MAX_FIELDS) | 332 | if (rule->field_count > AUDIT_MAX_FIELDS) |
135 | goto exit_err; | 333 | goto exit_err; |
@@ -147,6 +345,22 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) | |||
147 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | 345 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) |
148 | entry->rule.mask[i] = rule->mask[i]; | 346 | entry->rule.mask[i] = rule->mask[i]; |
149 | 347 | ||
348 | for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) { | ||
349 | int bit = AUDIT_BITMASK_SIZE * 32 - i - 1; | ||
350 | __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)]; | ||
351 | __u32 *class; | ||
352 | |||
353 | if (!(*p & AUDIT_BIT(bit))) | ||
354 | continue; | ||
355 | *p &= ~AUDIT_BIT(bit); | ||
356 | class = classes[i]; | ||
357 | if (class) { | ||
358 | int j; | ||
359 | for (j = 0; j < AUDIT_BITMASK_SIZE; j++) | ||
360 | entry->rule.mask[j] |= class[j]; | ||
361 | } | ||
362 | } | ||
363 | |||
150 | return entry; | 364 | return entry; |
151 | 365 | ||
152 | exit_err: | 366 | exit_err: |
@@ -158,6 +372,7 @@ exit_err: | |||
158 | static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | 372 | static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) |
159 | { | 373 | { |
160 | struct audit_entry *entry; | 374 | struct audit_entry *entry; |
375 | struct audit_field *f; | ||
161 | int err = 0; | 376 | int err = 0; |
162 | int i; | 377 | int i; |
163 | 378 | ||
@@ -172,14 +387,37 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
172 | f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); | 387 | f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); |
173 | f->val = rule->values[i]; | 388 | f->val = rule->values[i]; |
174 | 389 | ||
175 | if (f->type & AUDIT_UNUSED_BITS || | 390 | err = -EINVAL; |
176 | f->type == AUDIT_SE_USER || | 391 | switch(f->type) { |
177 | f->type == AUDIT_SE_ROLE || | 392 | default: |
178 | f->type == AUDIT_SE_TYPE || | ||
179 | f->type == AUDIT_SE_SEN || | ||
180 | f->type == AUDIT_SE_CLR) { | ||
181 | err = -EINVAL; | ||
182 | goto exit_free; | 393 | goto exit_free; |
394 | case AUDIT_PID: | ||
395 | case AUDIT_UID: | ||
396 | case AUDIT_EUID: | ||
397 | case AUDIT_SUID: | ||
398 | case AUDIT_FSUID: | ||
399 | case AUDIT_GID: | ||
400 | case AUDIT_EGID: | ||
401 | case AUDIT_SGID: | ||
402 | case AUDIT_FSGID: | ||
403 | case AUDIT_LOGINUID: | ||
404 | case AUDIT_PERS: | ||
405 | case AUDIT_ARCH: | ||
406 | case AUDIT_MSGTYPE: | ||
407 | case AUDIT_DEVMAJOR: | ||
408 | case AUDIT_DEVMINOR: | ||
409 | case AUDIT_EXIT: | ||
410 | case AUDIT_SUCCESS: | ||
411 | case AUDIT_ARG0: | ||
412 | case AUDIT_ARG1: | ||
413 | case AUDIT_ARG2: | ||
414 | case AUDIT_ARG3: | ||
415 | break; | ||
416 | case AUDIT_INODE: | ||
417 | err = audit_to_inode(&entry->rule, f); | ||
418 | if (err) | ||
419 | goto exit_free; | ||
420 | break; | ||
183 | } | 421 | } |
184 | 422 | ||
185 | entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; | 423 | entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; |
@@ -196,6 +434,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
196 | } | 434 | } |
197 | } | 435 | } |
198 | 436 | ||
437 | f = entry->rule.inode_f; | ||
438 | if (f) { | ||
439 | switch(f->op) { | ||
440 | case AUDIT_NOT_EQUAL: | ||
441 | entry->rule.inode_f = NULL; | ||
442 | case AUDIT_EQUAL: | ||
443 | break; | ||
444 | default: | ||
445 | goto exit_free; | ||
446 | } | ||
447 | } | ||
448 | |||
199 | exit_nofree: | 449 | exit_nofree: |
200 | return entry; | 450 | return entry; |
201 | 451 | ||
@@ -210,6 +460,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
210 | { | 460 | { |
211 | int err = 0; | 461 | int err = 0; |
212 | struct audit_entry *entry; | 462 | struct audit_entry *entry; |
463 | struct audit_field *f; | ||
213 | void *bufp; | 464 | void *bufp; |
214 | size_t remain = datasz - sizeof(struct audit_rule_data); | 465 | size_t remain = datasz - sizeof(struct audit_rule_data); |
215 | int i; | 466 | int i; |
@@ -235,11 +486,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
235 | f->se_str = NULL; | 486 | f->se_str = NULL; |
236 | f->se_rule = NULL; | 487 | f->se_rule = NULL; |
237 | switch(f->type) { | 488 | switch(f->type) { |
238 | case AUDIT_SE_USER: | 489 | case AUDIT_PID: |
239 | case AUDIT_SE_ROLE: | 490 | case AUDIT_UID: |
240 | case AUDIT_SE_TYPE: | 491 | case AUDIT_EUID: |
241 | case AUDIT_SE_SEN: | 492 | case AUDIT_SUID: |
242 | case AUDIT_SE_CLR: | 493 | case AUDIT_FSUID: |
494 | case AUDIT_GID: | ||
495 | case AUDIT_EGID: | ||
496 | case AUDIT_SGID: | ||
497 | case AUDIT_FSGID: | ||
498 | case AUDIT_LOGINUID: | ||
499 | case AUDIT_PERS: | ||
500 | case AUDIT_ARCH: | ||
501 | case AUDIT_MSGTYPE: | ||
502 | case AUDIT_PPID: | ||
503 | case AUDIT_DEVMAJOR: | ||
504 | case AUDIT_DEVMINOR: | ||
505 | case AUDIT_EXIT: | ||
506 | case AUDIT_SUCCESS: | ||
507 | case AUDIT_ARG0: | ||
508 | case AUDIT_ARG1: | ||
509 | case AUDIT_ARG2: | ||
510 | case AUDIT_ARG3: | ||
511 | break; | ||
512 | case AUDIT_SUBJ_USER: | ||
513 | case AUDIT_SUBJ_ROLE: | ||
514 | case AUDIT_SUBJ_TYPE: | ||
515 | case AUDIT_SUBJ_SEN: | ||
516 | case AUDIT_SUBJ_CLR: | ||
517 | case AUDIT_OBJ_USER: | ||
518 | case AUDIT_OBJ_ROLE: | ||
519 | case AUDIT_OBJ_TYPE: | ||
520 | case AUDIT_OBJ_LEV_LOW: | ||
521 | case AUDIT_OBJ_LEV_HIGH: | ||
243 | str = audit_unpack_string(&bufp, &remain, f->val); | 522 | str = audit_unpack_string(&bufp, &remain, f->val); |
244 | if (IS_ERR(str)) | 523 | if (IS_ERR(str)) |
245 | goto exit_free; | 524 | goto exit_free; |
@@ -260,6 +539,47 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
260 | } else | 539 | } else |
261 | f->se_str = str; | 540 | f->se_str = str; |
262 | break; | 541 | break; |
542 | case AUDIT_WATCH: | ||
543 | str = audit_unpack_string(&bufp, &remain, f->val); | ||
544 | if (IS_ERR(str)) | ||
545 | goto exit_free; | ||
546 | entry->rule.buflen += f->val; | ||
547 | |||
548 | err = audit_to_watch(&entry->rule, str, f->val, f->op); | ||
549 | if (err) { | ||
550 | kfree(str); | ||
551 | goto exit_free; | ||
552 | } | ||
553 | break; | ||
554 | case AUDIT_INODE: | ||
555 | err = audit_to_inode(&entry->rule, f); | ||
556 | if (err) | ||
557 | goto exit_free; | ||
558 | break; | ||
559 | case AUDIT_FILTERKEY: | ||
560 | err = -EINVAL; | ||
561 | if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) | ||
562 | goto exit_free; | ||
563 | str = audit_unpack_string(&bufp, &remain, f->val); | ||
564 | if (IS_ERR(str)) | ||
565 | goto exit_free; | ||
566 | entry->rule.buflen += f->val; | ||
567 | entry->rule.filterkey = str; | ||
568 | break; | ||
569 | default: | ||
570 | goto exit_free; | ||
571 | } | ||
572 | } | ||
573 | |||
574 | f = entry->rule.inode_f; | ||
575 | if (f) { | ||
576 | switch(f->op) { | ||
577 | case AUDIT_NOT_EQUAL: | ||
578 | entry->rule.inode_f = NULL; | ||
579 | case AUDIT_EQUAL: | ||
580 | break; | ||
581 | default: | ||
582 | goto exit_free; | ||
263 | } | 583 | } |
264 | } | 584 | } |
265 | 585 | ||
@@ -291,7 +611,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) | |||
291 | 611 | ||
292 | rule = kmalloc(sizeof(*rule), GFP_KERNEL); | 612 | rule = kmalloc(sizeof(*rule), GFP_KERNEL); |
293 | if (unlikely(!rule)) | 613 | if (unlikely(!rule)) |
294 | return ERR_PTR(-ENOMEM); | 614 | return NULL; |
295 | memset(rule, 0, sizeof(*rule)); | 615 | memset(rule, 0, sizeof(*rule)); |
296 | 616 | ||
297 | rule->flags = krule->flags | krule->listnr; | 617 | rule->flags = krule->flags | krule->listnr; |
@@ -322,7 +642,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | |||
322 | 642 | ||
323 | data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); | 643 | data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); |
324 | if (unlikely(!data)) | 644 | if (unlikely(!data)) |
325 | return ERR_PTR(-ENOMEM); | 645 | return NULL; |
326 | memset(data, 0, sizeof(*data)); | 646 | memset(data, 0, sizeof(*data)); |
327 | 647 | ||
328 | data->flags = krule->flags | krule->listnr; | 648 | data->flags = krule->flags | krule->listnr; |
@@ -335,14 +655,27 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) | |||
335 | data->fields[i] = f->type; | 655 | data->fields[i] = f->type; |
336 | data->fieldflags[i] = f->op; | 656 | data->fieldflags[i] = f->op; |
337 | switch(f->type) { | 657 | switch(f->type) { |
338 | case AUDIT_SE_USER: | 658 | case AUDIT_SUBJ_USER: |
339 | case AUDIT_SE_ROLE: | 659 | case AUDIT_SUBJ_ROLE: |
340 | case AUDIT_SE_TYPE: | 660 | case AUDIT_SUBJ_TYPE: |
341 | case AUDIT_SE_SEN: | 661 | case AUDIT_SUBJ_SEN: |
342 | case AUDIT_SE_CLR: | 662 | case AUDIT_SUBJ_CLR: |
663 | case AUDIT_OBJ_USER: | ||
664 | case AUDIT_OBJ_ROLE: | ||
665 | case AUDIT_OBJ_TYPE: | ||
666 | case AUDIT_OBJ_LEV_LOW: | ||
667 | case AUDIT_OBJ_LEV_HIGH: | ||
343 | data->buflen += data->values[i] = | 668 | data->buflen += data->values[i] = |
344 | audit_pack_string(&bufp, f->se_str); | 669 | audit_pack_string(&bufp, f->se_str); |
345 | break; | 670 | break; |
671 | case AUDIT_WATCH: | ||
672 | data->buflen += data->values[i] = | ||
673 | audit_pack_string(&bufp, krule->watch->path); | ||
674 | break; | ||
675 | case AUDIT_FILTERKEY: | ||
676 | data->buflen += data->values[i] = | ||
677 | audit_pack_string(&bufp, krule->filterkey); | ||
678 | break; | ||
346 | default: | 679 | default: |
347 | data->values[i] = f->val; | 680 | data->values[i] = f->val; |
348 | } | 681 | } |
@@ -370,14 +703,28 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
370 | return 1; | 703 | return 1; |
371 | 704 | ||
372 | switch(a->fields[i].type) { | 705 | switch(a->fields[i].type) { |
373 | case AUDIT_SE_USER: | 706 | case AUDIT_SUBJ_USER: |
374 | case AUDIT_SE_ROLE: | 707 | case AUDIT_SUBJ_ROLE: |
375 | case AUDIT_SE_TYPE: | 708 | case AUDIT_SUBJ_TYPE: |
376 | case AUDIT_SE_SEN: | 709 | case AUDIT_SUBJ_SEN: |
377 | case AUDIT_SE_CLR: | 710 | case AUDIT_SUBJ_CLR: |
711 | case AUDIT_OBJ_USER: | ||
712 | case AUDIT_OBJ_ROLE: | ||
713 | case AUDIT_OBJ_TYPE: | ||
714 | case AUDIT_OBJ_LEV_LOW: | ||
715 | case AUDIT_OBJ_LEV_HIGH: | ||
378 | if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) | 716 | if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) |
379 | return 1; | 717 | return 1; |
380 | break; | 718 | break; |
719 | case AUDIT_WATCH: | ||
720 | if (strcmp(a->watch->path, b->watch->path)) | ||
721 | return 1; | ||
722 | break; | ||
723 | case AUDIT_FILTERKEY: | ||
724 | /* both filterkeys exist based on above type compare */ | ||
725 | if (strcmp(a->filterkey, b->filterkey)) | ||
726 | return 1; | ||
727 | break; | ||
381 | default: | 728 | default: |
382 | if (a->fields[i].val != b->fields[i].val) | 729 | if (a->fields[i].val != b->fields[i].val) |
383 | return 1; | 730 | return 1; |
@@ -391,6 +738,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
391 | return 0; | 738 | return 0; |
392 | } | 739 | } |
393 | 740 | ||
741 | /* Duplicate the given audit watch. The new watch's rules list is initialized | ||
742 | * to an empty list and wlist is undefined. */ | ||
743 | static struct audit_watch *audit_dupe_watch(struct audit_watch *old) | ||
744 | { | ||
745 | char *path; | ||
746 | struct audit_watch *new; | ||
747 | |||
748 | path = kstrdup(old->path, GFP_KERNEL); | ||
749 | if (unlikely(!path)) | ||
750 | return ERR_PTR(-ENOMEM); | ||
751 | |||
752 | new = audit_init_watch(path); | ||
753 | if (unlikely(IS_ERR(new))) { | ||
754 | kfree(path); | ||
755 | goto out; | ||
756 | } | ||
757 | |||
758 | new->dev = old->dev; | ||
759 | new->ino = old->ino; | ||
760 | get_inotify_watch(&old->parent->wdata); | ||
761 | new->parent = old->parent; | ||
762 | |||
763 | out: | ||
764 | return new; | ||
765 | } | ||
766 | |||
394 | /* Duplicate selinux field information. The se_rule is opaque, so must be | 767 | /* Duplicate selinux field information. The se_rule is opaque, so must be |
395 | * re-initialized. */ | 768 | * re-initialized. */ |
396 | static inline int audit_dupe_selinux_field(struct audit_field *df, | 769 | static inline int audit_dupe_selinux_field(struct audit_field *df, |
@@ -422,12 +795,16 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, | |||
422 | /* Duplicate an audit rule. This will be a deep copy with the exception | 795 | /* Duplicate an audit rule. This will be a deep copy with the exception |
423 | * of the watch - that pointer is carried over. The selinux specific fields | 796 | * of the watch - that pointer is carried over. The selinux specific fields |
424 | * will be updated in the copy. The point is to be able to replace the old | 797 | * will be updated in the copy. The point is to be able to replace the old |
425 | * rule with the new rule in the filterlist, then free the old rule. */ | 798 | * rule with the new rule in the filterlist, then free the old rule. |
426 | static struct audit_entry *audit_dupe_rule(struct audit_krule *old) | 799 | * The rlist element is undefined; list manipulations are handled apart from |
800 | * the initial copy. */ | ||
801 | static struct audit_entry *audit_dupe_rule(struct audit_krule *old, | ||
802 | struct audit_watch *watch) | ||
427 | { | 803 | { |
428 | u32 fcount = old->field_count; | 804 | u32 fcount = old->field_count; |
429 | struct audit_entry *entry; | 805 | struct audit_entry *entry; |
430 | struct audit_krule *new; | 806 | struct audit_krule *new; |
807 | char *fk; | ||
431 | int i, err = 0; | 808 | int i, err = 0; |
432 | 809 | ||
433 | entry = audit_init_entry(fcount); | 810 | entry = audit_init_entry(fcount); |
@@ -442,6 +819,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) | |||
442 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) | 819 | for (i = 0; i < AUDIT_BITMASK_SIZE; i++) |
443 | new->mask[i] = old->mask[i]; | 820 | new->mask[i] = old->mask[i]; |
444 | new->buflen = old->buflen; | 821 | new->buflen = old->buflen; |
822 | new->inode_f = old->inode_f; | ||
823 | new->watch = NULL; | ||
445 | new->field_count = old->field_count; | 824 | new->field_count = old->field_count; |
446 | memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); | 825 | memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); |
447 | 826 | ||
@@ -449,13 +828,25 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) | |||
449 | * the originals will all be freed when the old rule is freed. */ | 828 | * the originals will all be freed when the old rule is freed. */ |
450 | for (i = 0; i < fcount; i++) { | 829 | for (i = 0; i < fcount; i++) { |
451 | switch (new->fields[i].type) { | 830 | switch (new->fields[i].type) { |
452 | case AUDIT_SE_USER: | 831 | case AUDIT_SUBJ_USER: |
453 | case AUDIT_SE_ROLE: | 832 | case AUDIT_SUBJ_ROLE: |
454 | case AUDIT_SE_TYPE: | 833 | case AUDIT_SUBJ_TYPE: |
455 | case AUDIT_SE_SEN: | 834 | case AUDIT_SUBJ_SEN: |
456 | case AUDIT_SE_CLR: | 835 | case AUDIT_SUBJ_CLR: |
836 | case AUDIT_OBJ_USER: | ||
837 | case AUDIT_OBJ_ROLE: | ||
838 | case AUDIT_OBJ_TYPE: | ||
839 | case AUDIT_OBJ_LEV_LOW: | ||
840 | case AUDIT_OBJ_LEV_HIGH: | ||
457 | err = audit_dupe_selinux_field(&new->fields[i], | 841 | err = audit_dupe_selinux_field(&new->fields[i], |
458 | &old->fields[i]); | 842 | &old->fields[i]); |
843 | break; | ||
844 | case AUDIT_FILTERKEY: | ||
845 | fk = kstrdup(old->filterkey, GFP_KERNEL); | ||
846 | if (unlikely(!fk)) | ||
847 | err = -ENOMEM; | ||
848 | else | ||
849 | new->filterkey = fk; | ||
459 | } | 850 | } |
460 | if (err) { | 851 | if (err) { |
461 | audit_free_rule(entry); | 852 | audit_free_rule(entry); |
@@ -463,68 +854,409 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) | |||
463 | } | 854 | } |
464 | } | 855 | } |
465 | 856 | ||
857 | if (watch) { | ||
858 | audit_get_watch(watch); | ||
859 | new->watch = watch; | ||
860 | } | ||
861 | |||
466 | return entry; | 862 | return entry; |
467 | } | 863 | } |
468 | 864 | ||
469 | /* Add rule to given filterlist if not a duplicate. Protected by | 865 | /* Update inode info in audit rules based on filesystem event. */ |
470 | * audit_netlink_mutex. */ | 866 | static void audit_update_watch(struct audit_parent *parent, |
867 | const char *dname, dev_t dev, | ||
868 | unsigned long ino, unsigned invalidating) | ||
869 | { | ||
870 | struct audit_watch *owatch, *nwatch, *nextw; | ||
871 | struct audit_krule *r, *nextr; | ||
872 | struct audit_entry *oentry, *nentry; | ||
873 | struct audit_buffer *ab; | ||
874 | |||
875 | mutex_lock(&audit_filter_mutex); | ||
876 | list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { | ||
877 | if (audit_compare_dname_path(dname, owatch->path, NULL)) | ||
878 | continue; | ||
879 | |||
880 | /* If the update involves invalidating rules, do the inode-based | ||
881 | * filtering now, so we don't omit records. */ | ||
882 | if (invalidating && | ||
883 | audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) | ||
884 | audit_set_auditable(current->audit_context); | ||
885 | |||
886 | nwatch = audit_dupe_watch(owatch); | ||
887 | if (unlikely(IS_ERR(nwatch))) { | ||
888 | mutex_unlock(&audit_filter_mutex); | ||
889 | audit_panic("error updating watch, skipping"); | ||
890 | return; | ||
891 | } | ||
892 | nwatch->dev = dev; | ||
893 | nwatch->ino = ino; | ||
894 | |||
895 | list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { | ||
896 | |||
897 | oentry = container_of(r, struct audit_entry, rule); | ||
898 | list_del(&oentry->rule.rlist); | ||
899 | list_del_rcu(&oentry->list); | ||
900 | |||
901 | nentry = audit_dupe_rule(&oentry->rule, nwatch); | ||
902 | if (unlikely(IS_ERR(nentry))) | ||
903 | audit_panic("error updating watch, removing"); | ||
904 | else { | ||
905 | int h = audit_hash_ino((u32)ino); | ||
906 | list_add(&nentry->rule.rlist, &nwatch->rules); | ||
907 | list_add_rcu(&nentry->list, &audit_inode_hash[h]); | ||
908 | } | ||
909 | |||
910 | call_rcu(&oentry->rcu, audit_free_rule_rcu); | ||
911 | } | ||
912 | |||
913 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | ||
914 | audit_log_format(ab, "audit updated rules specifying watch="); | ||
915 | audit_log_untrustedstring(ab, owatch->path); | ||
916 | audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); | ||
917 | audit_log_end(ab); | ||
918 | |||
919 | audit_remove_watch(owatch); | ||
920 | goto add_watch_to_parent; /* event applies to a single watch */ | ||
921 | } | ||
922 | mutex_unlock(&audit_filter_mutex); | ||
923 | return; | ||
924 | |||
925 | add_watch_to_parent: | ||
926 | list_add(&nwatch->wlist, &parent->watches); | ||
927 | mutex_unlock(&audit_filter_mutex); | ||
928 | return; | ||
929 | } | ||
930 | |||
931 | /* Remove all watches & rules associated with a parent that is going away. */ | ||
932 | static void audit_remove_parent_watches(struct audit_parent *parent) | ||
933 | { | ||
934 | struct audit_watch *w, *nextw; | ||
935 | struct audit_krule *r, *nextr; | ||
936 | struct audit_entry *e; | ||
937 | |||
938 | mutex_lock(&audit_filter_mutex); | ||
939 | parent->flags |= AUDIT_PARENT_INVALID; | ||
940 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { | ||
941 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { | ||
942 | e = container_of(r, struct audit_entry, rule); | ||
943 | list_del(&r->rlist); | ||
944 | list_del_rcu(&e->list); | ||
945 | call_rcu(&e->rcu, audit_free_rule_rcu); | ||
946 | |||
947 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
948 | "audit implicitly removed rule from list=%d\n", | ||
949 | AUDIT_FILTER_EXIT); | ||
950 | } | ||
951 | audit_remove_watch(w); | ||
952 | } | ||
953 | mutex_unlock(&audit_filter_mutex); | ||
954 | } | ||
955 | |||
956 | /* Unregister inotify watches for parents on in_list. | ||
957 | * Generates an IN_IGNORED event. */ | ||
958 | static void audit_inotify_unregister(struct list_head *in_list) | ||
959 | { | ||
960 | struct audit_parent *p, *n; | ||
961 | |||
962 | list_for_each_entry_safe(p, n, in_list, ilist) { | ||
963 | list_del(&p->ilist); | ||
964 | inotify_rm_watch(audit_ih, &p->wdata); | ||
965 | /* the put matching the get in audit_do_del_rule() */ | ||
966 | put_inotify_watch(&p->wdata); | ||
967 | } | ||
968 | } | ||
969 | |||
970 | /* Find an existing audit rule. | ||
971 | * Caller must hold audit_filter_mutex to prevent stale rule data. */ | ||
972 | static struct audit_entry *audit_find_rule(struct audit_entry *entry, | ||
973 | struct list_head *list) | ||
974 | { | ||
975 | struct audit_entry *e, *found = NULL; | ||
976 | int h; | ||
977 | |||
978 | if (entry->rule.watch) { | ||
979 | /* we don't know the inode number, so must walk entire hash */ | ||
980 | for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { | ||
981 | list = &audit_inode_hash[h]; | ||
982 | list_for_each_entry(e, list, list) | ||
983 | if (!audit_compare_rule(&entry->rule, &e->rule)) { | ||
984 | found = e; | ||
985 | goto out; | ||
986 | } | ||
987 | } | ||
988 | goto out; | ||
989 | } | ||
990 | |||
991 | list_for_each_entry(e, list, list) | ||
992 | if (!audit_compare_rule(&entry->rule, &e->rule)) { | ||
993 | found = e; | ||
994 | goto out; | ||
995 | } | ||
996 | |||
997 | out: | ||
998 | return found; | ||
999 | } | ||
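audit_find_rule() must walk every bucket for a watch rule because the watch path has not yet been resolved to an inode at this point; once an inode number is known, audit_hash_ino() selects a single bucket. That helper is declared in kernel/audit.h and is not part of this hunk; the only property the callers rely on is a cheap, stable mapping from inode number to one of the AUDIT_INODE_BUCKETS list heads, for example (a sketch assuming AUDIT_INODE_BUCKETS is a power of two — the in-tree helper may differ):

static inline int audit_hash_ino(u32 ino)
{
        return (int)(ino & (AUDIT_INODE_BUCKETS - 1));
}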
1000 | |||
1001 | /* Get path information necessary for adding watches. */ | ||
1002 | static int audit_get_nd(char *path, struct nameidata **ndp, | ||
1003 | struct nameidata **ndw) | ||
1004 | { | ||
1005 | struct nameidata *ndparent, *ndwatch; | ||
1006 | int err; | ||
1007 | |||
1008 | ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); | ||
1009 | if (unlikely(!ndparent)) | ||
1010 | return -ENOMEM; | ||
1011 | |||
1012 | ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); | ||
1013 | if (unlikely(!ndwatch)) { | ||
1014 | kfree(ndparent); | ||
1015 | return -ENOMEM; | ||
1016 | } | ||
1017 | |||
1018 | err = path_lookup(path, LOOKUP_PARENT, ndparent); | ||
1019 | if (err) { | ||
1020 | kfree(ndparent); | ||
1021 | kfree(ndwatch); | ||
1022 | return err; | ||
1023 | } | ||
1024 | |||
1025 | err = path_lookup(path, 0, ndwatch); | ||
1026 | if (err) { | ||
1027 | kfree(ndwatch); | ||
1028 | ndwatch = NULL; | ||
1029 | } | ||
1030 | |||
1031 | *ndp = ndparent; | ||
1032 | *ndw = ndwatch; | ||
1033 | |||
1034 | return 0; | ||
1035 | } | ||
1036 | |||
1037 | /* Release resources used for watch path information. */ | ||
1038 | static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) | ||
1039 | { | ||
1040 | if (ndp) { | ||
1041 | path_release(ndp); | ||
1042 | kfree(ndp); | ||
1043 | } | ||
1044 | if (ndw) { | ||
1045 | path_release(ndw); | ||
1046 | kfree(ndw); | ||
1047 | } | ||
1048 | } | ||
1049 | |||
1050 | /* Associate the given rule with an existing parent inotify_watch. | ||
1051 | * Caller must hold audit_filter_mutex. */ | ||
1052 | static void audit_add_to_parent(struct audit_krule *krule, | ||
1053 | struct audit_parent *parent) | ||
1054 | { | ||
1055 | struct audit_watch *w, *watch = krule->watch; | ||
1056 | int watch_found = 0; | ||
1057 | |||
1058 | list_for_each_entry(w, &parent->watches, wlist) { | ||
1059 | if (strcmp(watch->path, w->path)) | ||
1060 | continue; | ||
1061 | |||
1062 | watch_found = 1; | ||
1063 | |||
1064 | /* put krule's and initial refs to temporary watch */ | ||
1065 | audit_put_watch(watch); | ||
1066 | audit_put_watch(watch); | ||
1067 | |||
1068 | audit_get_watch(w); | ||
1069 | krule->watch = watch = w; | ||
1070 | break; | ||
1071 | } | ||
1072 | |||
1073 | if (!watch_found) { | ||
1074 | get_inotify_watch(&parent->wdata); | ||
1075 | watch->parent = parent; | ||
1076 | |||
1077 | list_add(&watch->wlist, &parent->watches); | ||
1078 | } | ||
1079 | list_add(&krule->rlist, &watch->rules); | ||
1080 | } | ||
1081 | |||
1082 | /* Find a matching watch entry, or add this one. | ||
1083 | * Caller must hold audit_filter_mutex. */ | ||
1084 | static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, | ||
1085 | struct nameidata *ndw) | ||
1086 | { | ||
1087 | struct audit_watch *watch = krule->watch; | ||
1088 | struct inotify_watch *i_watch; | ||
1089 | struct audit_parent *parent; | ||
1090 | int ret = 0; | ||
1091 | |||
1092 | /* update watch filter fields */ | ||
1093 | if (ndw) { | ||
1094 | watch->dev = ndw->dentry->d_inode->i_sb->s_dev; | ||
1095 | watch->ino = ndw->dentry->d_inode->i_ino; | ||
1096 | } | ||
1097 | |||
1098 | /* The audit_filter_mutex must not be held during inotify calls because | ||
1099 | * we hold it during inotify event callback processing. If an existing | ||
1100 | * inotify watch is found, inotify_find_watch() grabs a reference before | ||
1101 | * returning. | ||
1102 | */ | ||
1103 | mutex_unlock(&audit_filter_mutex); | ||
1104 | |||
1105 | if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) { | ||
1106 | parent = audit_init_parent(ndp); | ||
1107 | if (IS_ERR(parent)) { | ||
1108 | /* caller expects mutex locked */ | ||
1109 | mutex_lock(&audit_filter_mutex); | ||
1110 | return PTR_ERR(parent); | ||
1111 | } | ||
1112 | } else | ||
1113 | parent = container_of(i_watch, struct audit_parent, wdata); | ||
1114 | |||
1115 | mutex_lock(&audit_filter_mutex); | ||
1116 | |||
1117 | /* parent was moved before we took audit_filter_mutex */ | ||
1118 | if (parent->flags & AUDIT_PARENT_INVALID) | ||
1119 | ret = -ENOENT; | ||
1120 | else | ||
1121 | audit_add_to_parent(krule, parent); | ||
1122 | |||
1123 | /* match get in audit_init_parent or inotify_find_watch */ | ||
1124 | put_inotify_watch(&parent->wdata); | ||
1125 | return ret; | ||
1126 | } | ||
1127 | |||
1128 | /* Add rule to given filterlist if not a duplicate. */ | ||
471 | static inline int audit_add_rule(struct audit_entry *entry, | 1129 | static inline int audit_add_rule(struct audit_entry *entry, |
472 | struct list_head *list) | 1130 | struct list_head *list) |
473 | { | 1131 | { |
474 | struct audit_entry *e; | 1132 | struct audit_entry *e; |
1133 | struct audit_field *inode_f = entry->rule.inode_f; | ||
1134 | struct audit_watch *watch = entry->rule.watch; | ||
1135 | struct nameidata *ndp, *ndw; | ||
1136 | int h, err, putnd_needed = 0; | ||
1137 | |||
1138 | if (inode_f) { | ||
1139 | h = audit_hash_ino(inode_f->val); | ||
1140 | list = &audit_inode_hash[h]; | ||
1141 | } | ||
475 | 1142 | ||
476 | /* Do not use the _rcu iterator here, since this is the only | 1143 | mutex_lock(&audit_filter_mutex); |
477 | * addition routine. */ | 1144 | e = audit_find_rule(entry, list); |
478 | list_for_each_entry(e, list, list) { | 1145 | mutex_unlock(&audit_filter_mutex); |
479 | if (!audit_compare_rule(&entry->rule, &e->rule)) | 1146 | if (e) { |
480 | return -EEXIST; | 1147 | err = -EEXIST; |
1148 | goto error; | ||
1149 | } | ||
1150 | |||
1151 | /* Avoid calling path_lookup under audit_filter_mutex. */ | ||
1152 | if (watch) { | ||
1153 | err = audit_get_nd(watch->path, &ndp, &ndw); | ||
1154 | if (err) | ||
1155 | goto error; | ||
1156 | putnd_needed = 1; | ||
1157 | } | ||
1158 | |||
1159 | mutex_lock(&audit_filter_mutex); | ||
1160 | if (watch) { | ||
1161 | /* audit_filter_mutex is dropped and re-taken during this call */ | ||
1162 | err = audit_add_watch(&entry->rule, ndp, ndw); | ||
1163 | if (err) { | ||
1164 | mutex_unlock(&audit_filter_mutex); | ||
1165 | goto error; | ||
1166 | } | ||
1167 | h = audit_hash_ino((u32)watch->ino); | ||
1168 | list = &audit_inode_hash[h]; | ||
481 | } | 1169 | } |
482 | 1170 | ||
483 | if (entry->rule.flags & AUDIT_FILTER_PREPEND) { | 1171 | if (entry->rule.flags & AUDIT_FILTER_PREPEND) { |
484 | list_add_rcu(&entry->list, list); | 1172 | list_add_rcu(&entry->list, list); |
1173 | entry->rule.flags &= ~AUDIT_FILTER_PREPEND; | ||
485 | } else { | 1174 | } else { |
486 | list_add_tail_rcu(&entry->list, list); | 1175 | list_add_tail_rcu(&entry->list, list); |
487 | } | 1176 | } |
1177 | mutex_unlock(&audit_filter_mutex); | ||
488 | 1178 | ||
489 | return 0; | 1179 | if (putnd_needed) |
1180 | audit_put_nd(ndp, ndw); | ||
1181 | |||
1182 | return 0; | ||
1183 | |||
1184 | error: | ||
1185 | if (putnd_needed) | ||
1186 | audit_put_nd(ndp, ndw); | ||
1187 | if (watch) | ||
1188 | audit_put_watch(watch); /* tmp watch, matches initial get */ | ||
1189 | return err; | ||
490 | } | 1190 | } |
491 | 1191 | ||
492 | /* Remove an existing rule from filterlist. Protected by | 1192 | /* Remove an existing rule from filterlist. */ |
493 | * audit_netlink_mutex. */ | ||
494 | static inline int audit_del_rule(struct audit_entry *entry, | 1193 | static inline int audit_del_rule(struct audit_entry *entry, |
495 | struct list_head *list) | 1194 | struct list_head *list) |
496 | { | 1195 | { |
497 | struct audit_entry *e; | 1196 | struct audit_entry *e; |
1197 | struct audit_field *inode_f = entry->rule.inode_f; | ||
1198 | struct audit_watch *watch, *tmp_watch = entry->rule.watch; | ||
1199 | LIST_HEAD(inotify_list); | ||
1200 | int h, ret = 0; | ||
1201 | |||
1202 | if (inode_f) { | ||
1203 | h = audit_hash_ino(inode_f->val); | ||
1204 | list = &audit_inode_hash[h]; | ||
1205 | } | ||
498 | 1206 | ||
499 | /* Do not use the _rcu iterator here, since this is the only | 1207 | mutex_lock(&audit_filter_mutex); |
500 | * deletion routine. */ | 1208 | e = audit_find_rule(entry, list); |
501 | list_for_each_entry(e, list, list) { | 1209 | if (!e) { |
502 | if (!audit_compare_rule(&entry->rule, &e->rule)) { | 1210 | mutex_unlock(&audit_filter_mutex); |
503 | list_del_rcu(&e->list); | 1211 | ret = -ENOENT; |
504 | call_rcu(&e->rcu, audit_free_rule_rcu); | 1212 | goto out; |
505 | return 0; | 1213 | } |
1214 | |||
1215 | watch = e->rule.watch; | ||
1216 | if (watch) { | ||
1217 | struct audit_parent *parent = watch->parent; | ||
1218 | |||
1219 | list_del(&e->rule.rlist); | ||
1220 | |||
1221 | if (list_empty(&watch->rules)) { | ||
1222 | audit_remove_watch(watch); | ||
1223 | |||
1224 | if (list_empty(&parent->watches)) { | ||
1225 | /* Put parent on the inotify un-registration | ||
1226 | * list. Grab a reference before releasing | ||
1227 | * audit_filter_mutex, to be released in | ||
1228 | * audit_inotify_unregister(). */ | ||
1229 | list_add(&parent->ilist, &inotify_list); | ||
1230 | get_inotify_watch(&parent->wdata); | ||
1231 | } | ||
506 | } | 1232 | } |
507 | } | 1233 | } |
508 | return -ENOENT; /* No matching rule */ | 1234 | |
1235 | list_del_rcu(&e->list); | ||
1236 | call_rcu(&e->rcu, audit_free_rule_rcu); | ||
1237 | |||
1238 | mutex_unlock(&audit_filter_mutex); | ||
1239 | |||
1240 | if (!list_empty(&inotify_list)) | ||
1241 | audit_inotify_unregister(&inotify_list); | ||
1242 | |||
1243 | out: | ||
1244 | if (tmp_watch) | ||
1245 | audit_put_watch(tmp_watch); /* match initial get */ | ||
1246 | |||
1247 | return ret; | ||
509 | } | 1248 | } |
510 | 1249 | ||
511 | /* List rules using struct audit_rule. Exists for backward | 1250 | /* List rules using struct audit_rule. Exists for backward |
512 | * compatibility with userspace. */ | 1251 | * compatibility with userspace. */ |
513 | static int audit_list(void *_dest) | 1252 | static void audit_list(int pid, int seq, struct sk_buff_head *q) |
514 | { | 1253 | { |
515 | int pid, seq; | 1254 | struct sk_buff *skb; |
516 | int *dest = _dest; | ||
517 | struct audit_entry *entry; | 1255 | struct audit_entry *entry; |
518 | int i; | 1256 | int i; |
519 | 1257 | ||
520 | pid = dest[0]; | 1258 | /* This is a blocking read, so use audit_filter_mutex instead of rcu |
521 | seq = dest[1]; | 1259 | * iterator to sync with list writers. */ |
522 | kfree(dest); | ||
523 | |||
524 | mutex_lock(&audit_netlink_mutex); | ||
525 | |||
526 | /* The *_rcu iterators not needed here because we are | ||
527 | always called with audit_netlink_mutex held. */ | ||
528 | for (i=0; i<AUDIT_NR_FILTERS; i++) { | 1260 | for (i=0; i<AUDIT_NR_FILTERS; i++) { |
529 | list_for_each_entry(entry, &audit_filter_list[i], list) { | 1261 | list_for_each_entry(entry, &audit_filter_list[i], list) { |
530 | struct audit_rule *rule; | 1262 | struct audit_rule *rule; |
@@ -532,33 +1264,41 @@ static int audit_list(void *_dest) | |||
532 | rule = audit_krule_to_rule(&entry->rule); | 1264 | rule = audit_krule_to_rule(&entry->rule); |
533 | if (unlikely(!rule)) | 1265 | if (unlikely(!rule)) |
534 | break; | 1266 | break; |
535 | audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, | 1267 | skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, |
536 | rule, sizeof(*rule)); | 1268 | rule, sizeof(*rule)); |
1269 | if (skb) | ||
1270 | skb_queue_tail(q, skb); | ||
537 | kfree(rule); | 1271 | kfree(rule); |
538 | } | 1272 | } |
539 | } | 1273 | } |
540 | audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); | 1274 | for (i = 0; i < AUDIT_INODE_BUCKETS; i++) { |
541 | 1275 | list_for_each_entry(entry, &audit_inode_hash[i], list) { | |
542 | mutex_unlock(&audit_netlink_mutex); | 1276 | struct audit_rule *rule; |
543 | return 0; | 1277 | |
1278 | rule = audit_krule_to_rule(&entry->rule); | ||
1279 | if (unlikely(!rule)) | ||
1280 | break; | ||
1281 | skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, | ||
1282 | rule, sizeof(*rule)); | ||
1283 | if (skb) | ||
1284 | skb_queue_tail(q, skb); | ||
1285 | kfree(rule); | ||
1286 | } | ||
1287 | } | ||
1288 | skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); | ||
1289 | if (skb) | ||
1290 | skb_queue_tail(q, skb); | ||
544 | } | 1291 | } |
545 | 1292 | ||
546 | /* List rules using struct audit_rule_data. */ | 1293 | /* List rules using struct audit_rule_data. */ |
547 | static int audit_list_rules(void *_dest) | 1294 | static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) |
548 | { | 1295 | { |
549 | int pid, seq; | 1296 | struct sk_buff *skb; |
550 | int *dest = _dest; | ||
551 | struct audit_entry *e; | 1297 | struct audit_entry *e; |
552 | int i; | 1298 | int i; |
553 | 1299 | ||
554 | pid = dest[0]; | 1300 | /* This is a blocking read, so use audit_filter_mutex instead of rcu |
555 | seq = dest[1]; | 1301 | * iterator to sync with list writers. */ |
556 | kfree(dest); | ||
557 | |||
558 | mutex_lock(&audit_netlink_mutex); | ||
559 | |||
560 | /* The *_rcu iterators not needed here because we are | ||
561 | always called with audit_netlink_mutex held. */ | ||
562 | for (i=0; i<AUDIT_NR_FILTERS; i++) { | 1302 | for (i=0; i<AUDIT_NR_FILTERS; i++) { |
563 | list_for_each_entry(e, &audit_filter_list[i], list) { | 1303 | list_for_each_entry(e, &audit_filter_list[i], list) { |
564 | struct audit_rule_data *data; | 1304 | struct audit_rule_data *data; |
@@ -566,15 +1306,58 @@ static int audit_list_rules(void *_dest) | |||
566 | data = audit_krule_to_data(&e->rule); | 1306 | data = audit_krule_to_data(&e->rule); |
567 | if (unlikely(!data)) | 1307 | if (unlikely(!data)) |
568 | break; | 1308 | break; |
569 | audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, | 1309 | skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, |
570 | data, sizeof(*data)); | 1310 | data, sizeof(*data) + data->buflen); |
1311 | if (skb) | ||
1312 | skb_queue_tail(q, skb); | ||
571 | kfree(data); | 1313 | kfree(data); |
572 | } | 1314 | } |
573 | } | 1315 | } |
574 | audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); | 1316 | for (i=0; i< AUDIT_INODE_BUCKETS; i++) { |
1317 | list_for_each_entry(e, &audit_inode_hash[i], list) { | ||
1318 | struct audit_rule_data *data; | ||
575 | 1319 | ||
576 | mutex_unlock(&audit_netlink_mutex); | 1320 | data = audit_krule_to_data(&e->rule); |
577 | return 0; | 1321 | if (unlikely(!data)) |
1322 | break; | ||
1323 | skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, | ||
1324 | data, sizeof(*data) + data->buflen); | ||
1325 | if (skb) | ||
1326 | skb_queue_tail(q, skb); | ||
1327 | kfree(data); | ||
1328 | } | ||
1329 | } | ||
1330 | skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); | ||
1331 | if (skb) | ||
1332 | skb_queue_tail(q, skb); | ||
1333 | } | ||
1334 | |||
1335 | /* Log rule additions and removals */ | ||
1336 | static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, | ||
1337 | struct audit_krule *rule, int res) | ||
1338 | { | ||
1339 | struct audit_buffer *ab; | ||
1340 | |||
1341 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | ||
1342 | if (!ab) | ||
1343 | return; | ||
1344 | audit_log_format(ab, "auid=%u", loginuid); | ||
1345 | if (sid) { | ||
1346 | char *ctx = NULL; | ||
1347 | u32 len; | ||
1348 | if (selinux_ctxid_to_string(sid, &ctx, &len)) | ||
1349 | audit_log_format(ab, " ssid=%u", sid); | ||
1350 | else | ||
1351 | audit_log_format(ab, " subj=%s", ctx); | ||
1352 | kfree(ctx); | ||
1353 | } | ||
1354 | audit_log_format(ab, " %s rule key=", action); | ||
1355 | if (rule->filterkey) | ||
1356 | audit_log_untrustedstring(ab, rule->filterkey); | ||
1357 | else | ||
1358 | audit_log_format(ab, "(null)"); | ||
1359 | audit_log_format(ab, " list=%d res=%d", rule->listnr, res); | ||
1360 | audit_log_end(ab); | ||
578 | } | 1361 | } |
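Pieced together from the format strings above, each rule change therefore produces a single AUDIT_CONFIG_CHANGE record of the shape below (placeholders only; the key itself goes through audit_log_untrustedstring(), and res is the !err passed by the callers, so 1 means success):

        auid=<loginuid> [ssid=<sid> | subj=<selinux context>] <add|remove> rule key=<key or (null)> list=<filter list number> res=<0|1>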
579 | 1362 | ||
580 | /** | 1363 | /** |
@@ -592,7 +1375,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
592 | size_t datasz, uid_t loginuid, u32 sid) | 1375 | size_t datasz, uid_t loginuid, u32 sid) |
593 | { | 1376 | { |
594 | struct task_struct *tsk; | 1377 | struct task_struct *tsk; |
595 | int *dest; | 1378 | struct audit_netlink_list *dest; |
596 | int err = 0; | 1379 | int err = 0; |
597 | struct audit_entry *entry; | 1380 | struct audit_entry *entry; |
598 | 1381 | ||
@@ -605,18 +1388,22 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
605 | * happen if we're actually running in the context of auditctl | 1388 | * happen if we're actually running in the context of auditctl |
606 | * trying to _send_ the stuff */ | 1389 | * trying to _send_ the stuff */ |
607 | 1390 | ||
608 | dest = kmalloc(2 * sizeof(int), GFP_KERNEL); | 1391 | dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); |
609 | if (!dest) | 1392 | if (!dest) |
610 | return -ENOMEM; | 1393 | return -ENOMEM; |
611 | dest[0] = pid; | 1394 | dest->pid = pid; |
612 | dest[1] = seq; | 1395 | skb_queue_head_init(&dest->q); |
613 | 1396 | ||
1397 | mutex_lock(&audit_filter_mutex); | ||
614 | if (type == AUDIT_LIST) | 1398 | if (type == AUDIT_LIST) |
615 | tsk = kthread_run(audit_list, dest, "audit_list"); | 1399 | audit_list(pid, seq, &dest->q); |
616 | else | 1400 | else |
617 | tsk = kthread_run(audit_list_rules, dest, | 1401 | audit_list_rules(pid, seq, &dest->q); |
618 | "audit_list_rules"); | 1402 | mutex_unlock(&audit_filter_mutex); |
1403 | |||
1404 | tsk = kthread_run(audit_send_list, dest, "audit_send_list"); | ||
619 | if (IS_ERR(tsk)) { | 1405 | if (IS_ERR(tsk)) { |
1406 | skb_queue_purge(&dest->q); | ||
620 | kfree(dest); | 1407 | kfree(dest); |
621 | err = PTR_ERR(tsk); | 1408 | err = PTR_ERR(tsk); |
622 | } | 1409 | } |
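audit_send_list() itself is added in kernel/audit.c and does not show up in this file's diff. The contract the code above depends on is small: from kthread context, drain dest->q to the requesting pid over the audit netlink socket, then free dest. A hedged sketch follows; audit_send_list_sketch is a stand-in name, audit_sock is assumed to be the netlink socket owned by kernel/audit.c, and the real function may additionally serialize against the netlink command mutex.

static int audit_send_list_sketch(void *_dest)
{
        struct audit_netlink_list *dest = _dest;
        struct sk_buff *skb;

        /* the skbs were queued under audit_filter_mutex by audit_list*() */
        while ((skb = __skb_dequeue(&dest->q)) != NULL)
                netlink_unicast(audit_sock, skb, dest->pid, 0);

        kfree(dest);
        return 0;
}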
@@ -632,23 +1419,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
632 | 1419 | ||
633 | err = audit_add_rule(entry, | 1420 | err = audit_add_rule(entry, |
634 | &audit_filter_list[entry->rule.listnr]); | 1421 | &audit_filter_list[entry->rule.listnr]); |
635 | if (sid) { | 1422 | audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err); |
636 | char *ctx = NULL; | ||
637 | u32 len; | ||
638 | if (selinux_ctxid_to_string(sid, &ctx, &len)) { | ||
639 | /* Maybe call audit_panic? */ | ||
640 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
641 | "auid=%u ssid=%u add rule to list=%d res=%d", | ||
642 | loginuid, sid, entry->rule.listnr, !err); | ||
643 | } else | ||
644 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
645 | "auid=%u subj=%s add rule to list=%d res=%d", | ||
646 | loginuid, ctx, entry->rule.listnr, !err); | ||
647 | kfree(ctx); | ||
648 | } else | ||
649 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
650 | "auid=%u add rule to list=%d res=%d", | ||
651 | loginuid, entry->rule.listnr, !err); | ||
652 | 1423 | ||
653 | if (err) | 1424 | if (err) |
654 | audit_free_rule(entry); | 1425 | audit_free_rule(entry); |
@@ -664,24 +1435,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | |||
664 | 1435 | ||
665 | err = audit_del_rule(entry, | 1436 | err = audit_del_rule(entry, |
666 | &audit_filter_list[entry->rule.listnr]); | 1437 | &audit_filter_list[entry->rule.listnr]); |
667 | 1438 | audit_log_rule_change(loginuid, sid, "remove", &entry->rule, | |
668 | if (sid) { | 1439 | !err); |
669 | char *ctx = NULL; | ||
670 | u32 len; | ||
671 | if (selinux_ctxid_to_string(sid, &ctx, &len)) { | ||
672 | /* Maybe call audit_panic? */ | ||
673 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
674 | "auid=%u ssid=%u remove rule from list=%d res=%d", | ||
675 | loginuid, sid, entry->rule.listnr, !err); | ||
676 | } else | ||
677 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
678 | "auid=%u subj=%s remove rule from list=%d res=%d", | ||
679 | loginuid, ctx, entry->rule.listnr, !err); | ||
680 | kfree(ctx); | ||
681 | } else | ||
682 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
683 | "auid=%u remove rule from list=%d res=%d", | ||
684 | loginuid, entry->rule.listnr, !err); | ||
685 | 1440 | ||
686 | audit_free_rule(entry); | 1441 | audit_free_rule(entry); |
687 | break; | 1442 | break; |
@@ -712,7 +1467,43 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) | |||
712 | return 0; | 1467 | return 0; |
713 | } | 1468 | } |
714 | 1469 | ||
1470 | /* Compare given dentry name with last component in given path, | ||
1471 | * return of 0 indicates a match. */ | ||
1472 | int audit_compare_dname_path(const char *dname, const char *path, | ||
1473 | int *dirlen) | ||
1474 | { | ||
1475 | int dlen, plen; | ||
1476 | const char *p; | ||
1477 | |||
1478 | if (!dname || !path) | ||
1479 | return 1; | ||
715 | 1480 | ||
1481 | dlen = strlen(dname); | ||
1482 | plen = strlen(path); | ||
1483 | if (plen < dlen) | ||
1484 | return 1; | ||
1485 | |||
1486 | /* disregard trailing slashes */ | ||
1487 | p = path + plen - 1; | ||
1488 | while ((*p == '/') && (p > path)) | ||
1489 | p--; | ||
1490 | |||
1491 | /* find last path component */ | ||
1492 | p = p - dlen + 1; | ||
1493 | if (p < path) | ||
1494 | return 1; | ||
1495 | else if (p > path) { | ||
1496 | if (*--p != '/') | ||
1497 | return 1; | ||
1498 | else | ||
1499 | p++; | ||
1500 | } | ||
1501 | |||
1502 | /* return length of path's directory component */ | ||
1503 | if (dirlen) | ||
1504 | *dirlen = p - path; | ||
1505 | return strncmp(p, dname, dlen); | ||
1506 | } | ||
716 | 1507 | ||
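A few worked calls make the matching rule concrete (illustration only, not part of the patch; audit_compare_dname_path_examples is an invented harness and WARN_ON merely states the expected results): a dentry name matches when it equals the final component of the path, trailing slashes are ignored, and *dirlen reports the length of the directory prefix.

static void audit_compare_dname_path_examples(void)
{
        int dirlen;

        /* match; trailing '/' ignored; dirlen covers the "/tmp/" prefix */
        WARN_ON(audit_compare_dname_path("foo", "/tmp/foo/", &dirlen) != 0);
        WARN_ON(dirlen != 5);

        /* "bar" is not the final component of the path */
        WARN_ON(audit_compare_dname_path("bar", "/tmp/foo", NULL) == 0);

        /* bare name: match with no directory component */
        WARN_ON(audit_compare_dname_path("foo", "foo", &dirlen) != 0);
        WARN_ON(dirlen != 0);
}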
717 | static int audit_filter_user_rules(struct netlink_skb_parms *cb, | 1508 | static int audit_filter_user_rules(struct netlink_skb_parms *cb, |
718 | struct audit_krule *rule, | 1509 | struct audit_krule *rule, |
@@ -744,7 +1535,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, | |||
744 | } | 1535 | } |
745 | switch (rule->action) { | 1536 | switch (rule->action) { |
746 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | 1537 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; |
747 | case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; | ||
748 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | 1538 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; |
749 | } | 1539 | } |
750 | return 1; | 1540 | return 1; |
@@ -806,11 +1596,16 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) | |||
806 | for (i = 0; i < rule->field_count; i++) { | 1596 | for (i = 0; i < rule->field_count; i++) { |
807 | struct audit_field *f = &rule->fields[i]; | 1597 | struct audit_field *f = &rule->fields[i]; |
808 | switch (f->type) { | 1598 | switch (f->type) { |
809 | case AUDIT_SE_USER: | 1599 | case AUDIT_SUBJ_USER: |
810 | case AUDIT_SE_ROLE: | 1600 | case AUDIT_SUBJ_ROLE: |
811 | case AUDIT_SE_TYPE: | 1601 | case AUDIT_SUBJ_TYPE: |
812 | case AUDIT_SE_SEN: | 1602 | case AUDIT_SUBJ_SEN: |
813 | case AUDIT_SE_CLR: | 1603 | case AUDIT_SUBJ_CLR: |
1604 | case AUDIT_OBJ_USER: | ||
1605 | case AUDIT_OBJ_ROLE: | ||
1606 | case AUDIT_OBJ_TYPE: | ||
1607 | case AUDIT_OBJ_LEV_LOW: | ||
1608 | case AUDIT_OBJ_LEV_HIGH: | ||
814 | return 1; | 1609 | return 1; |
815 | } | 1610 | } |
816 | } | 1611 | } |
@@ -826,32 +1621,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) | |||
826 | int selinux_audit_rule_update(void) | 1621 | int selinux_audit_rule_update(void) |
827 | { | 1622 | { |
828 | struct audit_entry *entry, *n, *nentry; | 1623 | struct audit_entry *entry, *n, *nentry; |
1624 | struct audit_watch *watch; | ||
829 | int i, err = 0; | 1625 | int i, err = 0; |
830 | 1626 | ||
831 | /* audit_netlink_mutex synchronizes the writers */ | 1627 | /* audit_filter_mutex synchronizes the writers */ |
832 | mutex_lock(&audit_netlink_mutex); | 1628 | mutex_lock(&audit_filter_mutex); |
833 | 1629 | ||
834 | for (i = 0; i < AUDIT_NR_FILTERS; i++) { | 1630 | for (i = 0; i < AUDIT_NR_FILTERS; i++) { |
835 | list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { | 1631 | list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { |
836 | if (!audit_rule_has_selinux(&entry->rule)) | 1632 | if (!audit_rule_has_selinux(&entry->rule)) |
837 | continue; | 1633 | continue; |
838 | 1634 | ||
839 | nentry = audit_dupe_rule(&entry->rule); | 1635 | watch = entry->rule.watch; |
1636 | nentry = audit_dupe_rule(&entry->rule, watch); | ||
840 | if (unlikely(IS_ERR(nentry))) { | 1637 | if (unlikely(IS_ERR(nentry))) { |
841 | /* save the first error encountered for the | 1638 | /* save the first error encountered for the |
842 | * return value */ | 1639 | * return value */ |
843 | if (!err) | 1640 | if (!err) |
844 | err = PTR_ERR(nentry); | 1641 | err = PTR_ERR(nentry); |
845 | audit_panic("error updating selinux filters"); | 1642 | audit_panic("error updating selinux filters"); |
1643 | if (watch) | ||
1644 | list_del(&entry->rule.rlist); | ||
846 | list_del_rcu(&entry->list); | 1645 | list_del_rcu(&entry->list); |
847 | } else { | 1646 | } else { |
1647 | if (watch) { | ||
1648 | list_add(&nentry->rule.rlist, | ||
1649 | &watch->rules); | ||
1650 | list_del(&entry->rule.rlist); | ||
1651 | } | ||
848 | list_replace_rcu(&entry->list, &nentry->list); | 1652 | list_replace_rcu(&entry->list, &nentry->list); |
849 | } | 1653 | } |
850 | call_rcu(&entry->rcu, audit_free_rule_rcu); | 1654 | call_rcu(&entry->rcu, audit_free_rule_rcu); |
851 | } | 1655 | } |
852 | } | 1656 | } |
853 | 1657 | ||
854 | mutex_unlock(&audit_netlink_mutex); | 1658 | mutex_unlock(&audit_filter_mutex); |
855 | 1659 | ||
856 | return err; | 1660 | return err; |
857 | } | 1661 | } |
1662 | |||
1663 | /* Update watch data in audit rules based on inotify events. */ | ||
1664 | void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, | ||
1665 | u32 cookie, const char *dname, struct inode *inode) | ||
1666 | { | ||
1667 | struct audit_parent *parent; | ||
1668 | |||
1669 | parent = container_of(i_watch, struct audit_parent, wdata); | ||
1670 | |||
1671 | if (mask & (IN_CREATE|IN_MOVED_TO) && inode) | ||
1672 | audit_update_watch(parent, dname, inode->i_sb->s_dev, | ||
1673 | inode->i_ino, 0); | ||
1674 | else if (mask & (IN_DELETE|IN_MOVED_FROM)) | ||
1675 | audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); | ||
1676 | /* inotify automatically removes the watch and sends IN_IGNORED */ | ||
1677 | else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) | ||
1678 | audit_remove_parent_watches(parent); | ||
1679 | /* inotify does not remove the watch, so remove it manually */ | ||
1680 | else if(mask & IN_MOVE_SELF) { | ||
1681 | audit_remove_parent_watches(parent); | ||
1682 | inotify_remove_watch_locked(audit_ih, i_watch); | ||
1683 | } else if (mask & IN_IGNORED) | ||
1684 | put_inotify_watch(i_watch); | ||
1685 | } | ||
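audit_handle_ievent() is the audit side of the in-kernel inotify callback interface; the registration lives in kernel/audit.c, outside this diff. Roughly, the handler is installed through an inotify_operations table passed to inotify_init(), which also yields the audit_ih handle used above. The sketch below is illustrative only: audit_watch_init is an invented name and audit_free_parent (the destroy callback that frees the audit_parent) is assumed from context, so the real audit.c code may differ.

static const struct inotify_operations audit_inotify_ops = {
        .handle_event   = audit_handle_ievent,
        .destroy_watch  = audit_free_parent,    /* assumed name */
};

static int __init audit_watch_init(void)
{
        audit_ih = inotify_init(&audit_inotify_ops);
        if (IS_ERR(audit_ih))
                audit_panic("cannot initialize inotify handle");
        return 0;
}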
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c03a4ed1b27..ae40ac8c39e7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * | 3 | * |
4 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | 4 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. |
5 | * Copyright 2005 Hewlett-Packard Development Company, L.P. | 5 | * Copyright 2005 Hewlett-Packard Development Company, L.P. |
6 | * Copyright (C) 2005 IBM Corporation | 6 | * Copyright (C) 2005, 2006 IBM Corporation |
7 | * All Rights Reserved. | 7 | * All Rights Reserved. |
8 | * | 8 | * |
9 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
@@ -29,6 +29,9 @@ | |||
29 | * this file -- see entry.S) is based on a GPL'd patch written by | 29 | * this file -- see entry.S) is based on a GPL'd patch written by |
30 | * okir@suse.de and Copyright 2003 SuSE Linux AG. | 30 | * okir@suse.de and Copyright 2003 SuSE Linux AG. |
31 | * | 31 | * |
32 | * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>, | ||
33 | * 2006. | ||
34 | * | ||
32 | * The support of additional filter rules compares (>, <, >=, <=) was | 35 | * The support of additional filter rules compares (>, <, >=, <=) was |
33 | * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. | 36 | * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. |
34 | * | 37 | * |
@@ -49,6 +52,7 @@ | |||
49 | #include <linux/module.h> | 52 | #include <linux/module.h> |
50 | #include <linux/mount.h> | 53 | #include <linux/mount.h> |
51 | #include <linux/socket.h> | 54 | #include <linux/socket.h> |
55 | #include <linux/mqueue.h> | ||
52 | #include <linux/audit.h> | 56 | #include <linux/audit.h> |
53 | #include <linux/personality.h> | 57 | #include <linux/personality.h> |
54 | #include <linux/time.h> | 58 | #include <linux/time.h> |
@@ -59,6 +63,8 @@ | |||
59 | #include <linux/list.h> | 63 | #include <linux/list.h> |
60 | #include <linux/tty.h> | 64 | #include <linux/tty.h> |
61 | #include <linux/selinux.h> | 65 | #include <linux/selinux.h> |
66 | #include <linux/binfmts.h> | ||
67 | #include <linux/syscalls.h> | ||
62 | 68 | ||
63 | #include "audit.h" | 69 | #include "audit.h" |
64 | 70 | ||
@@ -76,6 +82,9 @@ extern int audit_enabled; | |||
76 | * path_lookup. */ | 82 | * path_lookup. */ |
77 | #define AUDIT_NAMES_RESERVED 7 | 83 | #define AUDIT_NAMES_RESERVED 7 |
78 | 84 | ||
85 | /* Indicates that audit should log the full pathname. */ | ||
86 | #define AUDIT_NAME_FULL -1 | ||
87 | |||
79 | /* When fs/namei.c:getname() is called, we store the pointer in name and | 88 | /* When fs/namei.c:getname() is called, we store the pointer in name and |
80 | * we don't let putname() free it (instead we free all of the saved | 89 | * we don't let putname() free it (instead we free all of the saved |
81 | * pointers at syscall exit time). | 90 | * pointers at syscall exit time). |
@@ -83,8 +92,9 @@ extern int audit_enabled; | |||
83 | * Further, in fs/namei.c:path_lookup() we store the inode and device. */ | 92 | * Further, in fs/namei.c:path_lookup() we store the inode and device. */ |
84 | struct audit_names { | 93 | struct audit_names { |
85 | const char *name; | 94 | const char *name; |
95 | int name_len; /* number of name's characters to log */ | ||
96 | unsigned name_put; /* call __putname() for this name */ | ||
86 | unsigned long ino; | 97 | unsigned long ino; |
87 | unsigned long pino; | ||
88 | dev_t dev; | 98 | dev_t dev; |
89 | umode_t mode; | 99 | umode_t mode; |
90 | uid_t uid; | 100 | uid_t uid; |
@@ -100,6 +110,33 @@ struct audit_aux_data { | |||
100 | 110 | ||
101 | #define AUDIT_AUX_IPCPERM 0 | 111 | #define AUDIT_AUX_IPCPERM 0 |
102 | 112 | ||
113 | struct audit_aux_data_mq_open { | ||
114 | struct audit_aux_data d; | ||
115 | int oflag; | ||
116 | mode_t mode; | ||
117 | struct mq_attr attr; | ||
118 | }; | ||
119 | |||
120 | struct audit_aux_data_mq_sendrecv { | ||
121 | struct audit_aux_data d; | ||
122 | mqd_t mqdes; | ||
123 | size_t msg_len; | ||
124 | unsigned int msg_prio; | ||
125 | struct timespec abs_timeout; | ||
126 | }; | ||
127 | |||
128 | struct audit_aux_data_mq_notify { | ||
129 | struct audit_aux_data d; | ||
130 | mqd_t mqdes; | ||
131 | struct sigevent notification; | ||
132 | }; | ||
133 | |||
134 | struct audit_aux_data_mq_getsetattr { | ||
135 | struct audit_aux_data d; | ||
136 | mqd_t mqdes; | ||
137 | struct mq_attr mqstat; | ||
138 | }; | ||
139 | |||
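These four aux structures are filled in by per-syscall collection helpers for mq_open, mq_timedsend/timedreceive, mq_notify and mq_getsetattr that the same patch adds further down in auditsc.c, outside this excerpt. Schematically each helper allocates its aux struct, copies the syscall arguments, and chains it onto context->aux. A hedged sketch for the mq_open case follows; audit_mq_open_sketch is a stand-in name and the in-tree helper differs in detail (error handling, call site, exact record type constant).

int audit_mq_open_sketch(int oflag, mode_t mode, struct mq_attr __user *u_attr)
{
        struct audit_context *context = current->audit_context;
        struct audit_aux_data_mq_open *ax;

        if (likely(!context))
                return 0;

        ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
        if (!ax)
                return -ENOMEM;

        if (u_attr && copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
                kfree(ax);
                return -EFAULT;
        }

        ax->oflag = oflag;
        ax->mode = mode;

        ax->d.type = AUDIT_MQ_OPEN;     /* record type from include/linux/audit.h */
        ax->d.next = context->aux;
        context->aux = (void *)ax;
        return 0;
}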
103 | struct audit_aux_data_ipcctl { | 140 | struct audit_aux_data_ipcctl { |
104 | struct audit_aux_data d; | 141 | struct audit_aux_data d; |
105 | struct ipc_perm p; | 142 | struct ipc_perm p; |
@@ -110,6 +147,13 @@ struct audit_aux_data_ipcctl { | |||
110 | u32 osid; | 147 | u32 osid; |
111 | }; | 148 | }; |
112 | 149 | ||
150 | struct audit_aux_data_execve { | ||
151 | struct audit_aux_data d; | ||
152 | int argc; | ||
153 | int envc; | ||
154 | char mem[0]; | ||
155 | }; | ||
156 | |||
113 | struct audit_aux_data_socketcall { | 157 | struct audit_aux_data_socketcall { |
114 | struct audit_aux_data d; | 158 | struct audit_aux_data d; |
115 | int nargs; | 159 | int nargs; |
@@ -142,13 +186,14 @@ struct audit_context { | |||
142 | int auditable; /* 1 if record should be written */ | 186 | int auditable; /* 1 if record should be written */ |
143 | int name_count; | 187 | int name_count; |
144 | struct audit_names names[AUDIT_NAMES]; | 188 | struct audit_names names[AUDIT_NAMES]; |
189 | char * filterkey; /* key for rule that triggered record */ | ||
145 | struct dentry * pwd; | 190 | struct dentry * pwd; |
146 | struct vfsmount * pwdmnt; | 191 | struct vfsmount * pwdmnt; |
147 | struct audit_context *previous; /* For nested syscalls */ | 192 | struct audit_context *previous; /* For nested syscalls */ |
148 | struct audit_aux_data *aux; | 193 | struct audit_aux_data *aux; |
149 | 194 | ||
150 | /* Save things to print about task_struct */ | 195 | /* Save things to print about task_struct */ |
151 | pid_t pid; | 196 | pid_t pid, ppid; |
152 | uid_t uid, euid, suid, fsuid; | 197 | uid_t uid, euid, suid, fsuid; |
153 | gid_t gid, egid, sgid, fsgid; | 198 | gid_t gid, egid, sgid, fsgid; |
154 | unsigned long personality; | 199 | unsigned long personality; |
@@ -160,12 +205,13 @@ struct audit_context { | |||
160 | #endif | 205 | #endif |
161 | }; | 206 | }; |
162 | 207 | ||
163 | 208 | /* Determine if any context name data matches a rule's watch data */ | |
164 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 | 209 | /* Compare a task_struct with an audit_rule. Return 1 on match, 0 |
165 | * otherwise. */ | 210 | * otherwise. */ |
166 | static int audit_filter_rules(struct task_struct *tsk, | 211 | static int audit_filter_rules(struct task_struct *tsk, |
167 | struct audit_krule *rule, | 212 | struct audit_krule *rule, |
168 | struct audit_context *ctx, | 213 | struct audit_context *ctx, |
214 | struct audit_names *name, | ||
169 | enum audit_state *state) | 215 | enum audit_state *state) |
170 | { | 216 | { |
171 | int i, j, need_sid = 1; | 217 | int i, j, need_sid = 1; |
@@ -179,6 +225,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
179 | case AUDIT_PID: | 225 | case AUDIT_PID: |
180 | result = audit_comparator(tsk->pid, f->op, f->val); | 226 | result = audit_comparator(tsk->pid, f->op, f->val); |
181 | break; | 227 | break; |
228 | case AUDIT_PPID: | ||
229 | if (ctx) | ||
230 | result = audit_comparator(ctx->ppid, f->op, f->val); | ||
231 | break; | ||
182 | case AUDIT_UID: | 232 | case AUDIT_UID: |
183 | result = audit_comparator(tsk->uid, f->op, f->val); | 233 | result = audit_comparator(tsk->uid, f->op, f->val); |
184 | break; | 234 | break; |
@@ -224,7 +274,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
224 | } | 274 | } |
225 | break; | 275 | break; |
226 | case AUDIT_DEVMAJOR: | 276 | case AUDIT_DEVMAJOR: |
227 | if (ctx) { | 277 | if (name) |
278 | result = audit_comparator(MAJOR(name->dev), | ||
279 | f->op, f->val); | ||
280 | else if (ctx) { | ||
228 | for (j = 0; j < ctx->name_count; j++) { | 281 | for (j = 0; j < ctx->name_count; j++) { |
229 | if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { | 282 | if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { |
230 | ++result; | 283 | ++result; |
@@ -234,7 +287,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
234 | } | 287 | } |
235 | break; | 288 | break; |
236 | case AUDIT_DEVMINOR: | 289 | case AUDIT_DEVMINOR: |
237 | if (ctx) { | 290 | if (name) |
291 | result = audit_comparator(MINOR(name->dev), | ||
292 | f->op, f->val); | ||
293 | else if (ctx) { | ||
238 | for (j = 0; j < ctx->name_count; j++) { | 294 | for (j = 0; j < ctx->name_count; j++) { |
239 | if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { | 295 | if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { |
240 | ++result; | 296 | ++result; |
@@ -244,26 +300,32 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
244 | } | 300 | } |
245 | break; | 301 | break; |
246 | case AUDIT_INODE: | 302 | case AUDIT_INODE: |
247 | if (ctx) { | 303 | if (name) |
304 | result = (name->ino == f->val); | ||
305 | else if (ctx) { | ||
248 | for (j = 0; j < ctx->name_count; j++) { | 306 | for (j = 0; j < ctx->name_count; j++) { |
249 | if (audit_comparator(ctx->names[j].ino, f->op, f->val) || | 307 | if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { |
250 | audit_comparator(ctx->names[j].pino, f->op, f->val)) { | ||
251 | ++result; | 308 | ++result; |
252 | break; | 309 | break; |
253 | } | 310 | } |
254 | } | 311 | } |
255 | } | 312 | } |
256 | break; | 313 | break; |
314 | case AUDIT_WATCH: | ||
315 | if (name && rule->watch->ino != (unsigned long)-1) | ||
316 | result = (name->dev == rule->watch->dev && | ||
317 | name->ino == rule->watch->ino); | ||
318 | break; | ||
257 | case AUDIT_LOGINUID: | 319 | case AUDIT_LOGINUID: |
258 | result = 0; | 320 | result = 0; |
259 | if (ctx) | 321 | if (ctx) |
260 | result = audit_comparator(ctx->loginuid, f->op, f->val); | 322 | result = audit_comparator(ctx->loginuid, f->op, f->val); |
261 | break; | 323 | break; |
262 | case AUDIT_SE_USER: | 324 | case AUDIT_SUBJ_USER: |
263 | case AUDIT_SE_ROLE: | 325 | case AUDIT_SUBJ_ROLE: |
264 | case AUDIT_SE_TYPE: | 326 | case AUDIT_SUBJ_TYPE: |
265 | case AUDIT_SE_SEN: | 327 | case AUDIT_SUBJ_SEN: |
266 | case AUDIT_SE_CLR: | 328 | case AUDIT_SUBJ_CLR: |
267 | /* NOTE: this may return negative values indicating | 329 | /* NOTE: this may return negative values indicating |
268 | a temporary error. We simply treat this as a | 330 | a temporary error. We simply treat this as a |
269 | match for now to avoid losing information that | 331 | match for now to avoid losing information that |
@@ -280,6 +342,46 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
280 | ctx); | 342 | ctx); |
281 | } | 343 | } |
282 | break; | 344 | break; |
345 | case AUDIT_OBJ_USER: | ||
346 | case AUDIT_OBJ_ROLE: | ||
347 | case AUDIT_OBJ_TYPE: | ||
348 | case AUDIT_OBJ_LEV_LOW: | ||
349 | case AUDIT_OBJ_LEV_HIGH: | ||
350 | /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR | ||
351 | also applies here */ | ||
352 | if (f->se_rule) { | ||
353 | /* Find files that match */ | ||
354 | if (name) { | ||
355 | result = selinux_audit_rule_match( | ||
356 | name->osid, f->type, f->op, | ||
357 | f->se_rule, ctx); | ||
358 | } else if (ctx) { | ||
359 | for (j = 0; j < ctx->name_count; j++) { | ||
360 | if (selinux_audit_rule_match( | ||
361 | ctx->names[j].osid, | ||
362 | f->type, f->op, | ||
363 | f->se_rule, ctx)) { | ||
364 | ++result; | ||
365 | break; | ||
366 | } | ||
367 | } | ||
368 | } | ||
369 | /* Find ipc objects that match */ | ||
370 | if (ctx) { | ||
371 | struct audit_aux_data *aux; | ||
372 | for (aux = ctx->aux; aux; | ||
373 | aux = aux->next) { | ||
374 | if (aux->type == AUDIT_IPC) { | ||
375 | struct audit_aux_data_ipcctl *axi = (void *)aux; | ||
376 | if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { | ||
377 | ++result; | ||
378 | break; | ||
379 | } | ||
380 | } | ||
381 | } | ||
382 | } | ||
383 | } | ||
384 | break; | ||
283 | case AUDIT_ARG0: | 385 | case AUDIT_ARG0: |
284 | case AUDIT_ARG1: | 386 | case AUDIT_ARG1: |
285 | case AUDIT_ARG2: | 387 | case AUDIT_ARG2: |
@@ -287,14 +389,19 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
287 | if (ctx) | 389 | if (ctx) |
288 | result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); | 390 | result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); |
289 | break; | 391 | break; |
392 | case AUDIT_FILTERKEY: | ||
393 | /* ignore this field for filtering */ | ||
394 | result = 1; | ||
395 | break; | ||
290 | } | 396 | } |
291 | 397 | ||
292 | if (!result) | 398 | if (!result) |
293 | return 0; | 399 | return 0; |
294 | } | 400 | } |
401 | if (rule->filterkey) | ||
402 | ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); | ||
295 | switch (rule->action) { | 403 | switch (rule->action) { |
296 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; | 404 | case AUDIT_NEVER: *state = AUDIT_DISABLED; break; |
297 | case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; | ||
298 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; | 405 | case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; |
299 | } | 406 | } |
300 | return 1; | 407 | return 1; |
@@ -311,7 +418,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) | |||
311 | 418 | ||
312 | rcu_read_lock(); | 419 | rcu_read_lock(); |
313 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { | 420 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { |
314 | if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { | 421 | if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { |
315 | rcu_read_unlock(); | 422 | rcu_read_unlock(); |
316 | return state; | 423 | return state; |
317 | } | 424 | } |
@@ -341,8 +448,47 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
341 | int bit = AUDIT_BIT(ctx->major); | 448 | int bit = AUDIT_BIT(ctx->major); |
342 | 449 | ||
343 | list_for_each_entry_rcu(e, list, list) { | 450 | list_for_each_entry_rcu(e, list, list) { |
344 | if ((e->rule.mask[word] & bit) == bit | 451 | if ((e->rule.mask[word] & bit) == bit && |
345 | && audit_filter_rules(tsk, &e->rule, ctx, &state)) { | 452 | audit_filter_rules(tsk, &e->rule, ctx, NULL, |
453 | &state)) { | ||
454 | rcu_read_unlock(); | ||
455 | return state; | ||
456 | } | ||
457 | } | ||
458 | } | ||
459 | rcu_read_unlock(); | ||
460 | return AUDIT_BUILD_CONTEXT; | ||
461 | } | ||
462 | |||
463 | /* At syscall exit time, this filter is called if any audit_names[] have been | ||
464 | * collected during syscall processing. We only check rules in sublists at hash | ||
465 | * buckets applicable to the inode numbers in audit_names[]. | ||
466 | * Regarding audit_state, same rules apply as for audit_filter_syscall(). | ||
467 | */ | ||
468 | enum audit_state audit_filter_inodes(struct task_struct *tsk, | ||
469 | struct audit_context *ctx) | ||
470 | { | ||
471 | int i; | ||
472 | struct audit_entry *e; | ||
473 | enum audit_state state; | ||
474 | |||
475 | if (audit_pid && tsk->tgid == audit_pid) | ||
476 | return AUDIT_DISABLED; | ||
477 | |||
478 | rcu_read_lock(); | ||
479 | for (i = 0; i < ctx->name_count; i++) { | ||
480 | int word = AUDIT_WORD(ctx->major); | ||
481 | int bit = AUDIT_BIT(ctx->major); | ||
482 | struct audit_names *n = &ctx->names[i]; | ||
483 | int h = audit_hash_ino((u32)n->ino); | ||
484 | struct list_head *list = &audit_inode_hash[h]; | ||
485 | |||
486 | if (list_empty(list)) | ||
487 | continue; | ||
488 | |||
489 | list_for_each_entry_rcu(e, list, list) { | ||
490 | if ((e->rule.mask[word] & bit) == bit && | ||
491 | audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { | ||
346 | rcu_read_unlock(); | 492 | rcu_read_unlock(); |
347 | return state; | 493 | return state; |
348 | } | 494 | } |
@@ -352,6 +498,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, | |||
352 | return AUDIT_BUILD_CONTEXT; | 498 | return AUDIT_BUILD_CONTEXT; |
353 | } | 499 | } |
354 | 500 | ||
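Both audit_filter_syscall() and the new audit_filter_inodes() gate each rule on the same per-syscall bitmask test before running the field comparisons. Assuming the usual definitions from include/linux/audit.h (AUDIT_WORD(nr) is nr / 32 and AUDIT_BIT(nr) is 1 << (nr % 32)), the test reduces to the helper below; audit_rule_covers_syscall is an invented name used only for illustration.

static inline int audit_rule_covers_syscall(const struct audit_krule *rule,
                                            int major)
{
        int word = AUDIT_WORD(major);   /* which 32-bit word of rule->mask */
        int bit  = AUDIT_BIT(major);    /* which bit within that word      */

        return (rule->mask[word] & bit) == bit;
}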
501 | void audit_set_auditable(struct audit_context *ctx) | ||
502 | { | ||
503 | ctx->auditable = 1; | ||
504 | } | ||
505 | |||
355 | static inline struct audit_context *audit_get_context(struct task_struct *tsk, | 506 | static inline struct audit_context *audit_get_context(struct task_struct *tsk, |
356 | int return_valid, | 507 | int return_valid, |
357 | int return_code) | 508 | int return_code) |
@@ -365,12 +516,22 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, | |||
365 | 516 | ||
366 | if (context->in_syscall && !context->auditable) { | 517 | if (context->in_syscall && !context->auditable) { |
367 | enum audit_state state; | 518 | enum audit_state state; |
519 | |||
368 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); | 520 | state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); |
521 | if (state == AUDIT_RECORD_CONTEXT) { | ||
522 | context->auditable = 1; | ||
523 | goto get_context; | ||
524 | } | ||
525 | |||
526 | state = audit_filter_inodes(tsk, context); | ||
369 | if (state == AUDIT_RECORD_CONTEXT) | 527 | if (state == AUDIT_RECORD_CONTEXT) |
370 | context->auditable = 1; | 528 | context->auditable = 1; |
529 | |||
371 | } | 530 | } |
372 | 531 | ||
532 | get_context: | ||
373 | context->pid = tsk->pid; | 533 | context->pid = tsk->pid; |
534 | context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ | ||
374 | context->uid = tsk->uid; | 535 | context->uid = tsk->uid; |
375 | context->gid = tsk->gid; | 536 | context->gid = tsk->gid; |
376 | context->euid = tsk->euid; | 537 | context->euid = tsk->euid; |
@@ -413,7 +574,7 @@ static inline void audit_free_names(struct audit_context *context) | |||
413 | #endif | 574 | #endif |
414 | 575 | ||
415 | for (i = 0; i < context->name_count; i++) { | 576 | for (i = 0; i < context->name_count; i++) { |
416 | if (context->names[i].name) | 577 | if (context->names[i].name && context->names[i].name_put) |
417 | __putname(context->names[i].name); | 578 | __putname(context->names[i].name); |
418 | } | 579 | } |
419 | context->name_count = 0; | 580 | context->name_count = 0; |
@@ -513,6 +674,7 @@ static inline void audit_free_context(struct audit_context *context) | |||
513 | } | 674 | } |
514 | audit_free_names(context); | 675 | audit_free_names(context); |
515 | audit_free_aux(context); | 676 | audit_free_aux(context); |
677 | kfree(context->filterkey); | ||
516 | kfree(context); | 678 | kfree(context); |
517 | context = previous; | 679 | context = previous; |
518 | } while (context); | 680 | } while (context); |
@@ -544,8 +706,7 @@ static void audit_log_task_context(struct audit_buffer *ab) | |||
544 | return; | 706 | return; |
545 | 707 | ||
546 | error_path: | 708 | error_path: |
547 | if (ctx) | 709 | kfree(ctx); |
548 | kfree(ctx); | ||
549 | audit_panic("error in audit_log_task_context"); | 710 | audit_panic("error in audit_log_task_context"); |
550 | return; | 711 | return; |
551 | } | 712 | } |
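The error_path cleanup above drops the NULL test because kfree() already accepts NULL, a pattern used throughout the kernel:

	char *ctx = NULL;
	/* ... ctx may or may not have been allocated by this point ... */
	kfree(ctx);	/* kfree(NULL) is a no-op, so no guard is needed */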
@@ -606,7 +767,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
606 | tty = "(none)"; | 767 | tty = "(none)"; |
607 | audit_log_format(ab, | 768 | audit_log_format(ab, |
608 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | 769 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" |
609 | " pid=%d auid=%u uid=%u gid=%u" | 770 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" |
610 | " euid=%u suid=%u fsuid=%u" | 771 | " euid=%u suid=%u fsuid=%u" |
611 | " egid=%u sgid=%u fsgid=%u tty=%s", | 772 | " egid=%u sgid=%u fsgid=%u tty=%s", |
612 | context->argv[0], | 773 | context->argv[0], |
@@ -614,6 +775,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
614 | context->argv[2], | 775 | context->argv[2], |
615 | context->argv[3], | 776 | context->argv[3], |
616 | context->name_count, | 777 | context->name_count, |
778 | context->ppid, | ||
617 | context->pid, | 779 | context->pid, |
618 | context->loginuid, | 780 | context->loginuid, |
619 | context->uid, | 781 | context->uid, |
@@ -621,6 +783,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
621 | context->euid, context->suid, context->fsuid, | 783 | context->euid, context->suid, context->fsuid, |
622 | context->egid, context->sgid, context->fsgid, tty); | 784 | context->egid, context->sgid, context->fsgid, tty); |
623 | audit_log_task_info(ab, tsk); | 785 | audit_log_task_info(ab, tsk); |
786 | if (context->filterkey) { | ||
787 | audit_log_format(ab, " key="); | ||
788 | audit_log_untrustedstring(ab, context->filterkey); | ||
789 | } else | ||
790 | audit_log_format(ab, " key=(null)"); | ||
624 | audit_log_end(ab); | 791 | audit_log_end(ab); |
625 | 792 | ||
626 | for (aux = context->aux; aux; aux = aux->next) { | 793 | for (aux = context->aux; aux; aux = aux->next) { |
@@ -630,11 +797,48 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
630 | continue; /* audit_panic has been called */ | 797 | continue; /* audit_panic has been called */ |
631 | 798 | ||
632 | switch (aux->type) { | 799 | switch (aux->type) { |
800 | case AUDIT_MQ_OPEN: { | ||
801 | struct audit_aux_data_mq_open *axi = (void *)aux; | ||
802 | audit_log_format(ab, | ||
803 | "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " | ||
804 | "mq_msgsize=%ld mq_curmsgs=%ld", | ||
805 | axi->oflag, axi->mode, axi->attr.mq_flags, | ||
806 | axi->attr.mq_maxmsg, axi->attr.mq_msgsize, | ||
807 | axi->attr.mq_curmsgs); | ||
808 | break; } | ||
809 | |||
810 | case AUDIT_MQ_SENDRECV: { | ||
811 | struct audit_aux_data_mq_sendrecv *axi = (void *)aux; | ||
812 | audit_log_format(ab, | ||
813 | "mqdes=%d msg_len=%zd msg_prio=%u " | ||
814 | "abs_timeout_sec=%ld abs_timeout_nsec=%ld", | ||
815 | axi->mqdes, axi->msg_len, axi->msg_prio, | ||
816 | axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); | ||
817 | break; } | ||
818 | |||
819 | case AUDIT_MQ_NOTIFY: { | ||
820 | struct audit_aux_data_mq_notify *axi = (void *)aux; | ||
821 | audit_log_format(ab, | ||
822 | "mqdes=%d sigev_signo=%d", | ||
823 | axi->mqdes, | ||
824 | axi->notification.sigev_signo); | ||
825 | break; } | ||
826 | |||
827 | case AUDIT_MQ_GETSETATTR: { | ||
828 | struct audit_aux_data_mq_getsetattr *axi = (void *)aux; | ||
829 | audit_log_format(ab, | ||
830 | "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " | ||
831 | "mq_curmsgs=%ld ", | ||
832 | axi->mqdes, | ||
833 | axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, | ||
834 | axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); | ||
835 | break; } | ||
836 | |||
633 | case AUDIT_IPC: { | 837 | case AUDIT_IPC: { |
634 | struct audit_aux_data_ipcctl *axi = (void *)aux; | 838 | struct audit_aux_data_ipcctl *axi = (void *)aux; |
635 | audit_log_format(ab, | 839 | audit_log_format(ab, |
636 | " qbytes=%lx iuid=%u igid=%u mode=%x", | 840 | "ouid=%u ogid=%u mode=%x", |
637 | axi->qbytes, axi->uid, axi->gid, axi->mode); | 841 | axi->uid, axi->gid, axi->mode); |
638 | if (axi->osid != 0) { | 842 | if (axi->osid != 0) { |
639 | char *ctx = NULL; | 843 | char *ctx = NULL; |
640 | u32 len; | 844 | u32 len; |
@@ -652,19 +856,18 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
652 | case AUDIT_IPC_SET_PERM: { | 856 | case AUDIT_IPC_SET_PERM: { |
653 | struct audit_aux_data_ipcctl *axi = (void *)aux; | 857 | struct audit_aux_data_ipcctl *axi = (void *)aux; |
654 | audit_log_format(ab, | 858 | audit_log_format(ab, |
655 | " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", | 859 | "qbytes=%lx ouid=%u ogid=%u mode=%x", |
656 | axi->qbytes, axi->uid, axi->gid, axi->mode); | 860 | axi->qbytes, axi->uid, axi->gid, axi->mode); |
657 | if (axi->osid != 0) { | 861 | break; } |
658 | char *ctx = NULL; | 862 | |
659 | u32 len; | 863 | case AUDIT_EXECVE: { |
660 | if (selinux_ctxid_to_string( | 864 | struct audit_aux_data_execve *axi = (void *)aux; |
661 | axi->osid, &ctx, &len)) { | 865 | int i; |
662 | audit_log_format(ab, " osid=%u", | 866 | const char *p; |
663 | axi->osid); | 867 | for (i = 0, p = axi->mem; i < axi->argc; i++) { |
664 | call_panic = 1; | 868 | audit_log_format(ab, "a%d=", i); |
665 | } else | 869 | p = audit_log_untrustedstring(ab, p); |
666 | audit_log_format(ab, " obj=%s", ctx); | 870 | audit_log_format(ab, "\n"); |
667 | kfree(ctx); | ||
668 | } | 871 | } |
669 | break; } | 872 | break; } |
670 | 873 | ||
@@ -700,8 +903,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
700 | } | 903 | } |
701 | } | 904 | } |
702 | for (i = 0; i < context->name_count; i++) { | 905 | for (i = 0; i < context->name_count; i++) { |
703 | unsigned long ino = context->names[i].ino; | 906 | struct audit_names *n = &context->names[i]; |
704 | unsigned long pino = context->names[i].pino; | ||
705 | 907 | ||
706 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); | 908 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); |
707 | if (!ab) | 909 | if (!ab) |
@@ -709,33 +911,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
709 | 911 | ||
710 | audit_log_format(ab, "item=%d", i); | 912 | audit_log_format(ab, "item=%d", i); |
711 | 913 | ||
712 | audit_log_format(ab, " name="); | 914 | if (n->name) { |
713 | if (context->names[i].name) | 915 | switch(n->name_len) { |
714 | audit_log_untrustedstring(ab, context->names[i].name); | 916 | case AUDIT_NAME_FULL: |
715 | else | 917 | /* log the full path */ |
716 | audit_log_format(ab, "(null)"); | 918 | audit_log_format(ab, " name="); |
717 | 919 | audit_log_untrustedstring(ab, n->name); | |
718 | if (pino != (unsigned long)-1) | 920 | break; |
719 | audit_log_format(ab, " parent=%lu", pino); | 921 | case 0: |
720 | if (ino != (unsigned long)-1) | 922 | /* name was specified as a relative path and the |
721 | audit_log_format(ab, " inode=%lu", ino); | 923 | * directory component is the cwd */ |
722 | if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) | 924 | audit_log_d_path(ab, " name=", context->pwd, |
723 | audit_log_format(ab, " dev=%02x:%02x mode=%#o" | 925 | context->pwdmnt); |
724 | " ouid=%u ogid=%u rdev=%02x:%02x", | 926 | break; |
725 | MAJOR(context->names[i].dev), | 927 | default: |
726 | MINOR(context->names[i].dev), | 928 | /* log the name's directory component */ |
727 | context->names[i].mode, | 929 | audit_log_format(ab, " name="); |
728 | context->names[i].uid, | 930 | audit_log_n_untrustedstring(ab, n->name_len, |
729 | context->names[i].gid, | 931 | n->name); |
730 | MAJOR(context->names[i].rdev), | 932 | } |
731 | MINOR(context->names[i].rdev)); | 933 | } else |
732 | if (context->names[i].osid != 0) { | 934 | audit_log_format(ab, " name=(null)"); |
935 | |||
936 | if (n->ino != (unsigned long)-1) { | ||
937 | audit_log_format(ab, " inode=%lu" | ||
938 | " dev=%02x:%02x mode=%#o" | ||
939 | " ouid=%u ogid=%u rdev=%02x:%02x", | ||
940 | n->ino, | ||
941 | MAJOR(n->dev), | ||
942 | MINOR(n->dev), | ||
943 | n->mode, | ||
944 | n->uid, | ||
945 | n->gid, | ||
946 | MAJOR(n->rdev), | ||
947 | MINOR(n->rdev)); | ||
948 | } | ||
949 | if (n->osid != 0) { | ||
733 | char *ctx = NULL; | 950 | char *ctx = NULL; |
734 | u32 len; | 951 | u32 len; |
735 | if (selinux_ctxid_to_string( | 952 | if (selinux_ctxid_to_string( |
736 | context->names[i].osid, &ctx, &len)) { | 953 | n->osid, &ctx, &len)) { |
737 | audit_log_format(ab, " osid=%u", | 954 | audit_log_format(ab, " osid=%u", n->osid); |
738 | context->names[i].osid); | ||
739 | call_panic = 2; | 955 | call_panic = 2; |
740 | } else | 956 | } else |
741 | audit_log_format(ab, " obj=%s", ctx); | 957 | audit_log_format(ab, " obj=%s", ctx); |
@@ -897,6 +1113,8 @@ void audit_syscall_exit(int valid, long return_code) | |||
897 | } else { | 1113 | } else { |
898 | audit_free_names(context); | 1114 | audit_free_names(context); |
899 | audit_free_aux(context); | 1115 | audit_free_aux(context); |
1116 | kfree(context->filterkey); | ||
1117 | context->filterkey = NULL; | ||
900 | tsk->audit_context = context; | 1118 | tsk->audit_context = context; |
901 | } | 1119 | } |
902 | } | 1120 | } |
@@ -908,11 +1126,11 @@ void audit_syscall_exit(int valid, long return_code) | |||
908 | * Add a name to the list of audit names for this context. | 1126 | * Add a name to the list of audit names for this context. |
909 | * Called from fs/namei.c:getname(). | 1127 | * Called from fs/namei.c:getname(). |
910 | */ | 1128 | */ |
911 | void audit_getname(const char *name) | 1129 | void __audit_getname(const char *name) |
912 | { | 1130 | { |
913 | struct audit_context *context = current->audit_context; | 1131 | struct audit_context *context = current->audit_context; |
914 | 1132 | ||
915 | if (!context || IS_ERR(name) || !name) | 1133 | if (IS_ERR(name) || !name) |
916 | return; | 1134 | return; |
917 | 1135 | ||
918 | if (!context->in_syscall) { | 1136 | if (!context->in_syscall) { |
@@ -925,6 +1143,8 @@ void audit_getname(const char *name) | |||
925 | } | 1143 | } |
926 | BUG_ON(context->name_count >= AUDIT_NAMES); | 1144 | BUG_ON(context->name_count >= AUDIT_NAMES); |
927 | context->names[context->name_count].name = name; | 1145 | context->names[context->name_count].name = name; |
1146 | context->names[context->name_count].name_len = AUDIT_NAME_FULL; | ||
1147 | context->names[context->name_count].name_put = 1; | ||
928 | context->names[context->name_count].ino = (unsigned long)-1; | 1148 | context->names[context->name_count].ino = (unsigned long)-1; |
929 | ++context->name_count; | 1149 | ++context->name_count; |
930 | if (!context->pwd) { | 1150 | if (!context->pwd) { |
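audit_getname() becomes __audit_getname() here and no longer tests for a NULL context itself; the expectation is that an inline wrapper in include/linux/audit.h performs the cheap current->audit_context check before calling in. A sketch of what such a wrapper presumably looks like (it is not part of this hunk):

	static inline void audit_getname(const char *name)
	{
		if (unlikely(current->audit_context))
			__audit_getname(name);
	}

The same convention applies to the other double-underscored hooks touched in this file, such as __audit_ipc_obj(), __audit_signal_info() and the __audit_mq_*() helpers.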
@@ -991,11 +1211,10 @@ static void audit_inode_context(int idx, const struct inode *inode) | |||
991 | * audit_inode - store the inode and device from a lookup | 1211 | * audit_inode - store the inode and device from a lookup |
992 | * @name: name being audited | 1212 | * @name: name being audited |
993 | * @inode: inode being audited | 1213 | * @inode: inode being audited |
994 | * @flags: lookup flags (as used in path_lookup()) | ||
995 | * | 1214 | * |
996 | * Called from fs/namei.c:path_lookup(). | 1215 | * Called from fs/namei.c:path_lookup(). |
997 | */ | 1216 | */ |
998 | void __audit_inode(const char *name, const struct inode *inode, unsigned flags) | 1217 | void __audit_inode(const char *name, const struct inode *inode) |
999 | { | 1218 | { |
1000 | int idx; | 1219 | int idx; |
1001 | struct audit_context *context = current->audit_context; | 1220 | struct audit_context *context = current->audit_context; |
@@ -1021,20 +1240,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags) | |||
1021 | ++context->ino_count; | 1240 | ++context->ino_count; |
1022 | #endif | 1241 | #endif |
1023 | } | 1242 | } |
1243 | context->names[idx].ino = inode->i_ino; | ||
1024 | context->names[idx].dev = inode->i_sb->s_dev; | 1244 | context->names[idx].dev = inode->i_sb->s_dev; |
1025 | context->names[idx].mode = inode->i_mode; | 1245 | context->names[idx].mode = inode->i_mode; |
1026 | context->names[idx].uid = inode->i_uid; | 1246 | context->names[idx].uid = inode->i_uid; |
1027 | context->names[idx].gid = inode->i_gid; | 1247 | context->names[idx].gid = inode->i_gid; |
1028 | context->names[idx].rdev = inode->i_rdev; | 1248 | context->names[idx].rdev = inode->i_rdev; |
1029 | audit_inode_context(idx, inode); | 1249 | audit_inode_context(idx, inode); |
1030 | if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && | ||
1031 | (strcmp(name, ".") != 0)) { | ||
1032 | context->names[idx].ino = (unsigned long)-1; | ||
1033 | context->names[idx].pino = inode->i_ino; | ||
1034 | } else { | ||
1035 | context->names[idx].ino = inode->i_ino; | ||
1036 | context->names[idx].pino = (unsigned long)-1; | ||
1037 | } | ||
1038 | } | 1250 | } |
1039 | 1251 | ||
1040 | /** | 1252 | /** |
@@ -1056,51 +1268,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode, | |||
1056 | { | 1268 | { |
1057 | int idx; | 1269 | int idx; |
1058 | struct audit_context *context = current->audit_context; | 1270 | struct audit_context *context = current->audit_context; |
1271 | const char *found_name = NULL; | ||
1272 | int dirlen = 0; | ||
1059 | 1273 | ||
1060 | if (!context->in_syscall) | 1274 | if (!context->in_syscall) |
1061 | return; | 1275 | return; |
1062 | 1276 | ||
1063 | /* determine matching parent */ | 1277 | /* determine matching parent */ |
1064 | if (dname) | 1278 | if (!dname) |
1065 | for (idx = 0; idx < context->name_count; idx++) | 1279 | goto update_context; |
1066 | if (context->names[idx].pino == pino) { | 1280 | for (idx = 0; idx < context->name_count; idx++) |
1067 | const char *n; | 1281 | if (context->names[idx].ino == pino) { |
1068 | const char *name = context->names[idx].name; | 1282 | const char *name = context->names[idx].name; |
1069 | int dlen = strlen(dname); | 1283 | |
1070 | int nlen = name ? strlen(name) : 0; | 1284 | if (!name) |
1071 | 1285 | continue; | |
1072 | if (nlen < dlen) | 1286 | |
1073 | continue; | 1287 | if (audit_compare_dname_path(dname, name, &dirlen) == 0) { |
1074 | 1288 | context->names[idx].name_len = dirlen; | |
1075 | /* disregard trailing slashes */ | 1289 | found_name = name; |
1076 | n = name + nlen - 1; | 1290 | break; |
1077 | while ((*n == '/') && (n > name)) | ||
1078 | n--; | ||
1079 | |||
1080 | /* find last path component */ | ||
1081 | n = n - dlen + 1; | ||
1082 | if (n < name) | ||
1083 | continue; | ||
1084 | else if (n > name) { | ||
1085 | if (*--n != '/') | ||
1086 | continue; | ||
1087 | else | ||
1088 | n++; | ||
1089 | } | ||
1090 | |||
1091 | if (strncmp(n, dname, dlen) == 0) | ||
1092 | goto update_context; | ||
1093 | } | 1291 | } |
1292 | } | ||
1094 | 1293 | ||
1095 | /* catch-all in case match not found */ | 1294 | update_context: |
1096 | idx = context->name_count++; | 1295 | idx = context->name_count++; |
1097 | context->names[idx].name = NULL; | ||
1098 | context->names[idx].pino = pino; | ||
1099 | #if AUDIT_DEBUG | 1296 | #if AUDIT_DEBUG |
1100 | context->ino_count++; | 1297 | context->ino_count++; |
1101 | #endif | 1298 | #endif |
1299 | /* Re-use the name belonging to the slot for a matching parent directory. | ||
1300 | * All names for this context are relinquished in audit_free_names() */ | ||
1301 | context->names[idx].name = found_name; | ||
1302 | context->names[idx].name_len = AUDIT_NAME_FULL; | ||
1303 | context->names[idx].name_put = 0; /* don't call __putname() */ | ||
1102 | 1304 | ||
1103 | update_context: | ||
1104 | if (inode) { | 1305 | if (inode) { |
1105 | context->names[idx].ino = inode->i_ino; | 1306 | context->names[idx].ino = inode->i_ino; |
1106 | context->names[idx].dev = inode->i_sb->s_dev; | 1307 | context->names[idx].dev = inode->i_sb->s_dev; |
@@ -1109,7 +1310,8 @@ update_context: | |||
1109 | context->names[idx].gid = inode->i_gid; | 1310 | context->names[idx].gid = inode->i_gid; |
1110 | context->names[idx].rdev = inode->i_rdev; | 1311 | context->names[idx].rdev = inode->i_rdev; |
1111 | audit_inode_context(idx, inode); | 1312 | audit_inode_context(idx, inode); |
1112 | } | 1313 | } else |
1314 | context->names[idx].ino = (unsigned long)-1; | ||
1113 | } | 1315 | } |
1114 | 1316 | ||
1115 | /** | 1317 | /** |
@@ -1142,18 +1344,23 @@ void auditsc_get_stamp(struct audit_context *ctx, | |||
1142 | */ | 1344 | */ |
1143 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | 1345 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) |
1144 | { | 1346 | { |
1145 | if (task->audit_context) { | 1347 | struct audit_context *context = task->audit_context; |
1146 | struct audit_buffer *ab; | 1348 | |
1147 | 1349 | if (context) { | |
1148 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); | 1350 | /* Only log if audit is enabled */ |
1149 | if (ab) { | 1351 | if (context->in_syscall) { |
1150 | audit_log_format(ab, "login pid=%d uid=%u " | 1352 | struct audit_buffer *ab; |
1151 | "old auid=%u new auid=%u", | 1353 | |
1152 | task->pid, task->uid, | 1354 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); |
1153 | task->audit_context->loginuid, loginuid); | 1355 | if (ab) { |
1154 | audit_log_end(ab); | 1356 | audit_log_format(ab, "login pid=%d uid=%u " |
1357 | "old auid=%u new auid=%u", | ||
1358 | task->pid, task->uid, | ||
1359 | context->loginuid, loginuid); | ||
1360 | audit_log_end(ab); | ||
1361 | } | ||
1155 | } | 1362 | } |
1156 | task->audit_context->loginuid = loginuid; | 1363 | context->loginuid = loginuid; |
1157 | } | 1364 | } |
1158 | return 0; | 1365 | return 0; |
1159 | } | 1366 | } |
@@ -1170,16 +1377,193 @@ uid_t audit_get_loginuid(struct audit_context *ctx) | |||
1170 | } | 1377 | } |
1171 | 1378 | ||
1172 | /** | 1379 | /** |
1173 | * audit_ipc_obj - record audit data for ipc object | 1380 | * __audit_mq_open - record audit data for a POSIX MQ open |
1174 | * @ipcp: ipc permissions | 1381 | * @oflag: open flag |
1382 | * @mode: mode bits | ||
1383 | * @u_attr: queue attributes | ||
1175 | * | 1384 | * |
1176 | * Returns 0 for success or NULL context or < 0 on error. | 1385 | * Returns 0 for success or NULL context or < 0 on error. |
1177 | */ | 1386 | */ |
1178 | int audit_ipc_obj(struct kern_ipc_perm *ipcp) | 1387 | int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) |
1179 | { | 1388 | { |
1180 | struct audit_aux_data_ipcctl *ax; | 1389 | struct audit_aux_data_mq_open *ax; |
1390 | struct audit_context *context = current->audit_context; | ||
1391 | |||
1392 | if (!audit_enabled) | ||
1393 | return 0; | ||
1394 | |||
1395 | if (likely(!context)) | ||
1396 | return 0; | ||
1397 | |||
1398 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1399 | if (!ax) | ||
1400 | return -ENOMEM; | ||
1401 | |||
1402 | if (u_attr != NULL) { | ||
1403 | if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { | ||
1404 | kfree(ax); | ||
1405 | return -EFAULT; | ||
1406 | } | ||
1407 | } else | ||
1408 | memset(&ax->attr, 0, sizeof(ax->attr)); | ||
1409 | |||
1410 | ax->oflag = oflag; | ||
1411 | ax->mode = mode; | ||
1412 | |||
1413 | ax->d.type = AUDIT_MQ_OPEN; | ||
1414 | ax->d.next = context->aux; | ||
1415 | context->aux = (void *)ax; | ||
1416 | return 0; | ||
1417 | } | ||
1418 | |||
1419 | /** | ||
1420 | * __audit_mq_timedsend - record audit data for a POSIX MQ timed send | ||
1421 | * @mqdes: MQ descriptor | ||
1422 | * @msg_len: Message length | ||
1423 | * @msg_prio: Message priority | ||
1424 | * @u_abs_timeout: Message timeout in absolute time | ||
1425 | * | ||
1426 | * Returns 0 for success or NULL context or < 0 on error. | ||
1427 | */ | ||
1428 | int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, | ||
1429 | const struct timespec __user *u_abs_timeout) | ||
1430 | { | ||
1431 | struct audit_aux_data_mq_sendrecv *ax; | ||
1432 | struct audit_context *context = current->audit_context; | ||
1433 | |||
1434 | if (!audit_enabled) | ||
1435 | return 0; | ||
1436 | |||
1437 | if (likely(!context)) | ||
1438 | return 0; | ||
1439 | |||
1440 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1441 | if (!ax) | ||
1442 | return -ENOMEM; | ||
1443 | |||
1444 | if (u_abs_timeout != NULL) { | ||
1445 | if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { | ||
1446 | kfree(ax); | ||
1447 | return -EFAULT; | ||
1448 | } | ||
1449 | } else | ||
1450 | memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); | ||
1451 | |||
1452 | ax->mqdes = mqdes; | ||
1453 | ax->msg_len = msg_len; | ||
1454 | ax->msg_prio = msg_prio; | ||
1455 | |||
1456 | ax->d.type = AUDIT_MQ_SENDRECV; | ||
1457 | ax->d.next = context->aux; | ||
1458 | context->aux = (void *)ax; | ||
1459 | return 0; | ||
1460 | } | ||
1461 | |||
1462 | /** | ||
1463 | * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive | ||
1464 | * @mqdes: MQ descriptor | ||
1465 | * @msg_len: Message length | ||
1466 | * @u_msg_prio: Message priority | ||
1467 | * @u_abs_timeout: Message timeout in absolute time | ||
1468 | * | ||
1469 | * Returns 0 for success or NULL context or < 0 on error. | ||
1470 | */ | ||
1471 | int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, | ||
1472 | unsigned int __user *u_msg_prio, | ||
1473 | const struct timespec __user *u_abs_timeout) | ||
1474 | { | ||
1475 | struct audit_aux_data_mq_sendrecv *ax; | ||
1476 | struct audit_context *context = current->audit_context; | ||
1477 | |||
1478 | if (!audit_enabled) | ||
1479 | return 0; | ||
1480 | |||
1481 | if (likely(!context)) | ||
1482 | return 0; | ||
1483 | |||
1484 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1485 | if (!ax) | ||
1486 | return -ENOMEM; | ||
1487 | |||
1488 | if (u_msg_prio != NULL) { | ||
1489 | if (get_user(ax->msg_prio, u_msg_prio)) { | ||
1490 | kfree(ax); | ||
1491 | return -EFAULT; | ||
1492 | } | ||
1493 | } else | ||
1494 | ax->msg_prio = 0; | ||
1495 | |||
1496 | if (u_abs_timeout != NULL) { | ||
1497 | if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { | ||
1498 | kfree(ax); | ||
1499 | return -EFAULT; | ||
1500 | } | ||
1501 | } else | ||
1502 | memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); | ||
1503 | |||
1504 | ax->mqdes = mqdes; | ||
1505 | ax->msg_len = msg_len; | ||
1506 | |||
1507 | ax->d.type = AUDIT_MQ_SENDRECV; | ||
1508 | ax->d.next = context->aux; | ||
1509 | context->aux = (void *)ax; | ||
1510 | return 0; | ||
1511 | } | ||
1512 | |||
1513 | /** | ||
1514 | * __audit_mq_notify - record audit data for a POSIX MQ notify | ||
1515 | * @mqdes: MQ descriptor | ||
1516 | * @u_notification: Notification event | ||
1517 | * | ||
1518 | * Returns 0 for success or NULL context or < 0 on error. | ||
1519 | */ | ||
1520 | |||
1521 | int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) | ||
1522 | { | ||
1523 | struct audit_aux_data_mq_notify *ax; | ||
1524 | struct audit_context *context = current->audit_context; | ||
1525 | |||
1526 | if (!audit_enabled) | ||
1527 | return 0; | ||
1528 | |||
1529 | if (likely(!context)) | ||
1530 | return 0; | ||
1531 | |||
1532 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1533 | if (!ax) | ||
1534 | return -ENOMEM; | ||
1535 | |||
1536 | if (u_notification != NULL) { | ||
1537 | if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { | ||
1538 | kfree(ax); | ||
1539 | return -EFAULT; | ||
1540 | } | ||
1541 | } else | ||
1542 | memset(&ax->notification, 0, sizeof(ax->notification)); | ||
1543 | |||
1544 | ax->mqdes = mqdes; | ||
1545 | |||
1546 | ax->d.type = AUDIT_MQ_NOTIFY; | ||
1547 | ax->d.next = context->aux; | ||
1548 | context->aux = (void *)ax; | ||
1549 | return 0; | ||
1550 | } | ||
1551 | |||
1552 | /** | ||
1553 | * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute | ||
1554 | * @mqdes: MQ descriptor | ||
1555 | * @mqstat: MQ flags | ||
1556 | * | ||
1557 | * Returns 0 for success or NULL context or < 0 on error. | ||
1558 | */ | ||
1559 | int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) | ||
1560 | { | ||
1561 | struct audit_aux_data_mq_getsetattr *ax; | ||
1181 | struct audit_context *context = current->audit_context; | 1562 | struct audit_context *context = current->audit_context; |
1182 | 1563 | ||
1564 | if (!audit_enabled) | ||
1565 | return 0; | ||
1566 | |||
1183 | if (likely(!context)) | 1567 | if (likely(!context)) |
1184 | return 0; | 1568 | return 0; |
1185 | 1569 | ||
@@ -1187,6 +1571,30 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
1187 | if (!ax) | 1571 | if (!ax) |
1188 | return -ENOMEM; | 1572 | return -ENOMEM; |
1189 | 1573 | ||
1574 | ax->mqdes = mqdes; | ||
1575 | ax->mqstat = *mqstat; | ||
1576 | |||
1577 | ax->d.type = AUDIT_MQ_GETSETATTR; | ||
1578 | ax->d.next = context->aux; | ||
1579 | context->aux = (void *)ax; | ||
1580 | return 0; | ||
1581 | } | ||
1582 | |||
1583 | /** | ||
1584 | * audit_ipc_obj - record audit data for ipc object | ||
1585 | * @ipcp: ipc permissions | ||
1586 | * | ||
1587 | * Returns 0 for success or NULL context or < 0 on error. | ||
1588 | */ | ||
1589 | int __audit_ipc_obj(struct kern_ipc_perm *ipcp) | ||
1590 | { | ||
1591 | struct audit_aux_data_ipcctl *ax; | ||
1592 | struct audit_context *context = current->audit_context; | ||
1593 | |||
1594 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | ||
1595 | if (!ax) | ||
1596 | return -ENOMEM; | ||
1597 | |||
1190 | ax->uid = ipcp->uid; | 1598 | ax->uid = ipcp->uid; |
1191 | ax->gid = ipcp->gid; | 1599 | ax->gid = ipcp->gid; |
1192 | ax->mode = ipcp->mode; | 1600 | ax->mode = ipcp->mode; |
@@ -1207,14 +1615,11 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
1207 | * | 1615 | * |
1208 | * Returns 0 for success or NULL context or < 0 on error. | 1616 | * Returns 0 for success or NULL context or < 0 on error. |
1209 | */ | 1617 | */ |
1210 | int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) | 1618 | int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) |
1211 | { | 1619 | { |
1212 | struct audit_aux_data_ipcctl *ax; | 1620 | struct audit_aux_data_ipcctl *ax; |
1213 | struct audit_context *context = current->audit_context; | 1621 | struct audit_context *context = current->audit_context; |
1214 | 1622 | ||
1215 | if (likely(!context)) | ||
1216 | return 0; | ||
1217 | |||
1218 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); | 1623 | ax = kmalloc(sizeof(*ax), GFP_ATOMIC); |
1219 | if (!ax) | 1624 | if (!ax) |
1220 | return -ENOMEM; | 1625 | return -ENOMEM; |
@@ -1223,7 +1628,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, | |||
1223 | ax->uid = uid; | 1628 | ax->uid = uid; |
1224 | ax->gid = gid; | 1629 | ax->gid = gid; |
1225 | ax->mode = mode; | 1630 | ax->mode = mode; |
1226 | selinux_get_ipc_sid(ipcp, &ax->osid); | ||
1227 | 1631 | ||
1228 | ax->d.type = AUDIT_IPC_SET_PERM; | 1632 | ax->d.type = AUDIT_IPC_SET_PERM; |
1229 | ax->d.next = context->aux; | 1633 | ax->d.next = context->aux; |
@@ -1231,6 +1635,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, | |||
1231 | return 0; | 1635 | return 0; |
1232 | } | 1636 | } |
1233 | 1637 | ||
1638 | int audit_bprm(struct linux_binprm *bprm) | ||
1639 | { | ||
1640 | struct audit_aux_data_execve *ax; | ||
1641 | struct audit_context *context = current->audit_context; | ||
1642 | unsigned long p, next; | ||
1643 | void *to; | ||
1644 | |||
1645 | if (likely(!audit_enabled || !context)) | ||
1646 | return 0; | ||
1647 | |||
1648 | ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, | ||
1649 | GFP_KERNEL); | ||
1650 | if (!ax) | ||
1651 | return -ENOMEM; | ||
1652 | |||
1653 | ax->argc = bprm->argc; | ||
1654 | ax->envc = bprm->envc; | ||
1655 | for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { | ||
1656 | struct page *page = bprm->page[p / PAGE_SIZE]; | ||
1657 | void *kaddr = kmap(page); | ||
1658 | next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); | ||
1659 | memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); | ||
1660 | to += next - p; | ||
1661 | kunmap(page); | ||
1662 | } | ||
1663 | |||
1664 | ax->d.type = AUDIT_EXECVE; | ||
1665 | ax->d.next = context->aux; | ||
1666 | context->aux = (void *)ax; | ||
1667 | return 0; | ||
1668 | } | ||
1669 | |||
1670 | |||
1234 | /** | 1671 | /** |
1235 | * audit_socketcall - record audit data for sys_socketcall | 1672 | * audit_socketcall - record audit data for sys_socketcall |
1236 | * @nargs: number of args | 1673 | * @nargs: number of args |
@@ -1325,19 +1762,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) | |||
1325 | * If the audit subsystem is being terminated, record the task (pid) | 1762 | * If the audit subsystem is being terminated, record the task (pid) |
1326 | * and uid that is doing that. | 1763 | * and uid that is doing that. |
1327 | */ | 1764 | */ |
1328 | void audit_signal_info(int sig, struct task_struct *t) | 1765 | void __audit_signal_info(int sig, struct task_struct *t) |
1329 | { | 1766 | { |
1330 | extern pid_t audit_sig_pid; | 1767 | extern pid_t audit_sig_pid; |
1331 | extern uid_t audit_sig_uid; | 1768 | extern uid_t audit_sig_uid; |
1332 | 1769 | extern u32 audit_sig_sid; | |
1333 | if (unlikely(audit_pid && t->tgid == audit_pid)) { | 1770 | |
1334 | if (sig == SIGTERM || sig == SIGHUP) { | 1771 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { |
1335 | struct audit_context *ctx = current->audit_context; | 1772 | struct task_struct *tsk = current; |
1336 | audit_sig_pid = current->pid; | 1773 | struct audit_context *ctx = tsk->audit_context; |
1337 | if (ctx) | 1774 | audit_sig_pid = tsk->pid; |
1338 | audit_sig_uid = ctx->loginuid; | 1775 | if (ctx) |
1339 | else | 1776 | audit_sig_uid = ctx->loginuid; |
1340 | audit_sig_uid = current->uid; | 1777 | else |
1341 | } | 1778 | audit_sig_uid = tsk->uid; |
1779 | selinux_get_task_sid(tsk, &audit_sig_sid); | ||
1342 | } | 1780 | } |
1343 | } | 1781 | } |
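The __audit_mq_*() and __audit_ipc_*() hooks added above all follow the same aux-record pattern: allocate a type-specific record with GFP_ATOMIC, copy in any user-supplied attributes, and push the record onto the context's singly linked aux list so audit_log_exit() can format it at syscall exit. A condensed sketch of that pattern with a hypothetical record type and record number (the real struct layouts are private to kernel/auditsc.c):

	struct audit_aux_data_example {
		struct audit_aux_data	d;	/* common header: type and next pointer */
		int			example_value;
	};

	int __audit_example(int value)
	{
		struct audit_aux_data_example *ax;
		struct audit_context *context = current->audit_context;

		if (!audit_enabled)
			return 0;
		if (likely(!context))
			return 0;

		ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
		if (!ax)
			return -ENOMEM;

		ax->example_value = value;

		ax->d.type = AUDIT_EXAMPLE;	/* hypothetical record type number */
		ax->d.next = context->aux;	/* push onto the per-syscall aux list */
		context->aux = (void *)ax;
		return 0;
	}

audit_log_exit() and audit_free_aux() then walk context->aux, dispatching on d.type as the new switch cases above show.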
diff --git a/kernel/compat.c b/kernel/compat.c index c1601a84f8d8..126dee9530aa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/unistd.h> | 21 | #include <linux/unistd.h> |
22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include <linux/timex.h> | 23 | #include <linux/timex.h> |
24 | #include <linux/migrate.h> | ||
24 | 25 | ||
25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
26 | 27 | ||
@@ -729,17 +730,10 @@ void | |||
729 | sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | 730 | sigset_from_compat (sigset_t *set, compat_sigset_t *compat) |
730 | { | 731 | { |
731 | switch (_NSIG_WORDS) { | 732 | switch (_NSIG_WORDS) { |
732 | #if defined (__COMPAT_ENDIAN_SWAP__) | ||
733 | case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); | ||
734 | case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); | ||
735 | case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); | ||
736 | case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); | ||
737 | #else | ||
738 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); | 733 | case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); |
739 | case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); | 734 | case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); |
740 | case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); | 735 | case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); |
741 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); | 736 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); |
742 | #endif | ||
743 | } | 737 | } |
744 | } | 738 | } |
745 | 739 | ||
@@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) | |||
934 | 928 | ||
935 | return ret; | 929 | return ret; |
936 | } | 930 | } |
931 | |||
932 | #ifdef CONFIG_NUMA | ||
933 | asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, | ||
934 | compat_uptr_t __user *pages32, | ||
935 | const int __user *nodes, | ||
936 | int __user *status, | ||
937 | int flags) | ||
938 | { | ||
939 | const void __user * __user *pages; | ||
940 | int i; | ||
941 | |||
942 | pages = compat_alloc_user_space(nr_pages * sizeof(void *)); | ||
943 | for (i = 0; i < nr_pages; i++) { | ||
944 | compat_uptr_t p; | ||
945 | |||
946 | if (get_user(p, pages32 + i) || | ||
947 | put_user(compat_ptr(p), pages + i)) | ||
948 | return -EFAULT; | ||
949 | } | ||
950 | return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); | ||
951 | } | ||
952 | #endif | ||
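compat_sys_move_pages() above shows the usual 32-bit shim: the compat_uptr_t array is widened into native user pointers inside a scratch area obtained from compat_alloc_user_space(), which sits in the caller's own user address space, so the native sys_move_pages() can copy from it unchanged. The widening step, condensed with illustrative names:

	/* Sketch: widen nr 32-bit user pointers into a native pointer array
	 * placed in user space; returns NULL on fault (the caller maps that
	 * to -EFAULT), matching the pattern in compat_sys_move_pages(). */
	static const void __user * __user *
	example_widen_pointers(compat_uptr_t __user *src32, unsigned long nr)
	{
		const void __user * __user *dst;
		unsigned long i;

		dst = compat_alloc_user_space(nr * sizeof(void *));
		for (i = 0; i < nr; i++) {
			compat_uptr_t p;

			if (get_user(p, src32 + i) ||		/* read the 32-bit pointer */
			    put_user(compat_ptr(p), dst + i))	/* store it widened */
				return NULL;
		}
		return dst;
	}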
diff --git a/kernel/configs.c b/kernel/configs.c index 009e1ebdcb88..f9e31974f4ad 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
@@ -23,7 +23,6 @@ | |||
23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/config.h> | ||
27 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
28 | #include <linux/module.h> | 27 | #include <linux/module.h> |
29 | #include <linux/proc_fs.h> | 28 | #include <linux/proc_fs.h> |
diff --git a/kernel/cpu.c b/kernel/cpu.c index fe2b8d0bfe4c..70fbf2e83766 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -13,12 +13,12 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/kthread.h> | 14 | #include <linux/kthread.h> |
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <asm/semaphore.h> | 16 | #include <linux/mutex.h> |
17 | 17 | ||
18 | /* This protects CPUs going up and down... */ | 18 | /* This protects CPUs going up and down... */ |
19 | static DECLARE_MUTEX(cpucontrol); | 19 | static DEFINE_MUTEX(cpucontrol); |
20 | 20 | ||
21 | static BLOCKING_NOTIFIER_HEAD(cpu_chain); | 21 | static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); |
22 | 22 | ||
23 | #ifdef CONFIG_HOTPLUG_CPU | 23 | #ifdef CONFIG_HOTPLUG_CPU |
24 | static struct task_struct *lock_cpu_hotplug_owner; | 24 | static struct task_struct *lock_cpu_hotplug_owner; |
@@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible) | |||
30 | 30 | ||
31 | if (lock_cpu_hotplug_owner != current) { | 31 | if (lock_cpu_hotplug_owner != current) { |
32 | if (interruptible) | 32 | if (interruptible) |
33 | ret = down_interruptible(&cpucontrol); | 33 | ret = mutex_lock_interruptible(&cpucontrol); |
34 | else | 34 | else |
35 | down(&cpucontrol); | 35 | mutex_lock(&cpucontrol); |
36 | } | 36 | } |
37 | 37 | ||
38 | /* | 38 | /* |
@@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void) | |||
56 | { | 56 | { |
57 | if (--lock_cpu_hotplug_depth == 0) { | 57 | if (--lock_cpu_hotplug_depth == 0) { |
58 | lock_cpu_hotplug_owner = NULL; | 58 | lock_cpu_hotplug_owner = NULL; |
59 | up(&cpucontrol); | 59 | mutex_unlock(&cpucontrol); |
60 | } | 60 | } |
61 | } | 61 | } |
62 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | 62 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); |
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); | |||
69 | #endif /* CONFIG_HOTPLUG_CPU */ | 69 | #endif /* CONFIG_HOTPLUG_CPU */ |
70 | 70 | ||
71 | /* Need to know about CPUs going up/down? */ | 71 | /* Need to know about CPUs going up/down? */ |
72 | int register_cpu_notifier(struct notifier_block *nb) | 72 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
73 | { | 73 | { |
74 | return blocking_notifier_chain_register(&cpu_chain, nb); | 74 | return blocking_notifier_chain_register(&cpu_chain, nb); |
75 | } | 75 | } |
76 | |||
77 | #ifdef CONFIG_HOTPLUG_CPU | ||
78 | |||
76 | EXPORT_SYMBOL(register_cpu_notifier); | 79 | EXPORT_SYMBOL(register_cpu_notifier); |
77 | 80 | ||
78 | void unregister_cpu_notifier(struct notifier_block *nb) | 81 | void unregister_cpu_notifier(struct notifier_block *nb) |
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb) | |||
81 | } | 84 | } |
82 | EXPORT_SYMBOL(unregister_cpu_notifier); | 85 | EXPORT_SYMBOL(unregister_cpu_notifier); |
83 | 86 | ||
84 | #ifdef CONFIG_HOTPLUG_CPU | ||
85 | static inline void check_for_tasks(int cpu) | 87 | static inline void check_for_tasks(int cpu) |
86 | { | 88 | { |
87 | struct task_struct *p; | 89 | struct task_struct *p; |
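The cpu.c hunks above are a mechanical conversion of a semaphore that was only ever used as a sleeping lock into a struct mutex. The mapping, shown on a hypothetical lock:

	#include <linux/mutex.h>

	static DEFINE_MUTEX(example_lock);	/* was: static DECLARE_MUTEX(example_lock); */

	static int example_enter(int interruptible)
	{
		int ret = 0;

		if (interruptible)
			ret = mutex_lock_interruptible(&example_lock);	/* was down_interruptible() */
		else
			mutex_lock(&example_lock);			/* was down() */
		return ret;
	}

	static void example_exit(void)
	{
		mutex_unlock(&example_lock);				/* was up() */
	}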
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ab81fdd4572b..c232dc077438 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -18,7 +18,6 @@ | |||
18 | * distribution for more details. | 18 | * distribution for more details. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/config.h> | ||
22 | #include <linux/cpu.h> | 21 | #include <linux/cpu.h> |
23 | #include <linux/cpumask.h> | 22 | #include <linux/cpumask.h> |
24 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
@@ -41,6 +40,7 @@ | |||
41 | #include <linux/rcupdate.h> | 40 | #include <linux/rcupdate.h> |
42 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
43 | #include <linux/seq_file.h> | 42 | #include <linux/seq_file.h> |
43 | #include <linux/security.h> | ||
44 | #include <linux/slab.h> | 44 | #include <linux/slab.h> |
45 | #include <linux/smp_lock.h> | 45 | #include <linux/smp_lock.h> |
46 | #include <linux/spinlock.h> | 46 | #include <linux/spinlock.h> |
@@ -392,11 +392,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, | |||
392 | return 0; | 392 | return 0; |
393 | } | 393 | } |
394 | 394 | ||
395 | static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, | 395 | static int cpuset_get_sb(struct file_system_type *fs_type, |
396 | int flags, const char *unused_dev_name, | 396 | int flags, const char *unused_dev_name, |
397 | void *data) | 397 | void *data, struct vfsmount *mnt) |
398 | { | 398 | { |
399 | return get_sb_single(fs_type, flags, data, cpuset_fill_super); | 399 | return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); |
400 | } | 400 | } |
401 | 401 | ||
402 | static struct file_system_type cpuset_fs_type = { | 402 | static struct file_system_type cpuset_fs_type = { |
@@ -1063,7 +1063,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
1063 | } | 1063 | } |
1064 | 1064 | ||
1065 | /* | 1065 | /* |
1066 | * Frequency meter - How fast is some event occuring? | 1066 | * Frequency meter - How fast is some event occurring? |
1067 | * | 1067 | * |
1068 | * These routines manage a digitally filtered, constant time based, | 1068 | * These routines manage a digitally filtered, constant time based, |
1069 | * event frequency meter. There are four routines: | 1069 | * event frequency meter. There are four routines: |
@@ -1177,6 +1177,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
1177 | cpumask_t cpus; | 1177 | cpumask_t cpus; |
1178 | nodemask_t from, to; | 1178 | nodemask_t from, to; |
1179 | struct mm_struct *mm; | 1179 | struct mm_struct *mm; |
1180 | int retval; | ||
1180 | 1181 | ||
1181 | if (sscanf(pidbuf, "%d", &pid) != 1) | 1182 | if (sscanf(pidbuf, "%d", &pid) != 1) |
1182 | return -EIO; | 1183 | return -EIO; |
@@ -1205,6 +1206,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
1205 | get_task_struct(tsk); | 1206 | get_task_struct(tsk); |
1206 | } | 1207 | } |
1207 | 1208 | ||
1209 | retval = security_task_setscheduler(tsk, 0, NULL); | ||
1210 | if (retval) { | ||
1211 | put_task_struct(tsk); | ||
1212 | return retval; | ||
1213 | } | ||
1214 | |||
1208 | mutex_lock(&callback_mutex); | 1215 | mutex_lock(&callback_mutex); |
1209 | 1216 | ||
1210 | task_lock(tsk); | 1217 | task_lock(tsk); |
@@ -2434,31 +2441,43 @@ void __cpuset_memory_pressure_bump(void) | |||
2434 | */ | 2441 | */ |
2435 | static int proc_cpuset_show(struct seq_file *m, void *v) | 2442 | static int proc_cpuset_show(struct seq_file *m, void *v) |
2436 | { | 2443 | { |
2444 | struct pid *pid; | ||
2437 | struct task_struct *tsk; | 2445 | struct task_struct *tsk; |
2438 | char *buf; | 2446 | char *buf; |
2439 | int retval = 0; | 2447 | int retval; |
2440 | 2448 | ||
2449 | retval = -ENOMEM; | ||
2441 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | 2450 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
2442 | if (!buf) | 2451 | if (!buf) |
2443 | return -ENOMEM; | 2452 | goto out; |
2444 | 2453 | ||
2445 | tsk = m->private; | 2454 | retval = -ESRCH; |
2455 | pid = m->private; | ||
2456 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
2457 | if (!tsk) | ||
2458 | goto out_free; | ||
2459 | |||
2460 | retval = -EINVAL; | ||
2446 | mutex_lock(&manage_mutex); | 2461 | mutex_lock(&manage_mutex); |
2462 | |||
2447 | retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); | 2463 | retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); |
2448 | if (retval < 0) | 2464 | if (retval < 0) |
2449 | goto out; | 2465 | goto out_unlock; |
2450 | seq_puts(m, buf); | 2466 | seq_puts(m, buf); |
2451 | seq_putc(m, '\n'); | 2467 | seq_putc(m, '\n'); |
2452 | out: | 2468 | out_unlock: |
2453 | mutex_unlock(&manage_mutex); | 2469 | mutex_unlock(&manage_mutex); |
2470 | put_task_struct(tsk); | ||
2471 | out_free: | ||
2454 | kfree(buf); | 2472 | kfree(buf); |
2473 | out: | ||
2455 | return retval; | 2474 | return retval; |
2456 | } | 2475 | } |
2457 | 2476 | ||
2458 | static int cpuset_open(struct inode *inode, struct file *file) | 2477 | static int cpuset_open(struct inode *inode, struct file *file) |
2459 | { | 2478 | { |
2460 | struct task_struct *tsk = PROC_I(inode)->task; | 2479 | struct pid *pid = PROC_I(inode)->pid; |
2461 | return single_open(file, proc_cpuset_show, tsk); | 2480 | return single_open(file, proc_cpuset_show, pid); |
2462 | } | 2481 | } |
2463 | 2482 | ||
2464 | struct file_operations proc_cpuset_operations = { | 2483 | struct file_operations proc_cpuset_operations = { |
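The cpuset_open()/proc_cpuset_show() change above stops stashing a bare task_struct pointer in the seq_file and instead passes the struct pid, resolving it with get_pid_task() at read time; a task that has already exited is then reported as -ESRCH rather than reached through a stale pointer. A condensed sketch of the same pattern for a hypothetical per-pid /proc file:

	static int example_show(struct seq_file *m, void *v)
	{
		struct pid *pid = m->private;
		struct task_struct *tsk;

		tsk = get_pid_task(pid, PIDTYPE_PID);	/* fails if the task is gone */
		if (!tsk)
			return -ESRCH;

		seq_printf(m, "%d\n", tsk->pid);
		put_task_struct(tsk);
		return 0;
	}

	static int example_open(struct inode *inode, struct file *file)
	{
		/* the proc inode holds the reference on the struct pid */
		return single_open(file, example_show, PROC_I(inode)->pid);
	}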
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c01cead2cfd6..3c2eaea66b1e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
@@ -7,7 +7,6 @@ | |||
7 | * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) | 7 | * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/config.h> | ||
11 | #include <linux/init.h> | 10 | #include <linux/init.h> |
12 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
13 | #include <linux/kmod.h> | 12 | #include <linux/kmod.h> |
diff --git a/kernel/exit.c b/kernel/exit.c index e06d0c10a24e..7f7ef2258553 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -4,7 +4,6 @@ | |||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/config.h> | ||
8 | #include <linux/mm.h> | 7 | #include <linux/mm.h> |
9 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
10 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
@@ -36,6 +35,7 @@ | |||
36 | #include <linux/compat.h> | 35 | #include <linux/compat.h> |
37 | #include <linux/pipe_fs_i.h> | 36 | #include <linux/pipe_fs_i.h> |
38 | #include <linux/audit.h> /* for audit_free() */ | 37 | #include <linux/audit.h> /* for audit_free() */ |
38 | #include <linux/resource.h> | ||
39 | 39 | ||
40 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
41 | #include <asm/unistd.h> | 41 | #include <asm/unistd.h> |
@@ -45,8 +45,6 @@ | |||
45 | extern void sem_exit (void); | 45 | extern void sem_exit (void); |
46 | extern struct task_struct *child_reaper; | 46 | extern struct task_struct *child_reaper; |
47 | 47 | ||
48 | int getrusage(struct task_struct *, int, struct rusage __user *); | ||
49 | |||
50 | static void exit_mm(struct task_struct * tsk); | 48 | static void exit_mm(struct task_struct * tsk); |
51 | 49 | ||
52 | static void __unhash_process(struct task_struct *p) | 50 | static void __unhash_process(struct task_struct *p) |
@@ -138,12 +136,8 @@ void release_task(struct task_struct * p) | |||
138 | { | 136 | { |
139 | int zap_leader; | 137 | int zap_leader; |
140 | task_t *leader; | 138 | task_t *leader; |
141 | struct dentry *proc_dentry; | ||
142 | |||
143 | repeat: | 139 | repeat: |
144 | atomic_dec(&p->user->processes); | 140 | atomic_dec(&p->user->processes); |
145 | spin_lock(&p->proc_lock); | ||
146 | proc_dentry = proc_pid_unhash(p); | ||
147 | write_lock_irq(&tasklist_lock); | 141 | write_lock_irq(&tasklist_lock); |
148 | ptrace_unlink(p); | 142 | ptrace_unlink(p); |
149 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); | 143 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); |
@@ -172,8 +166,7 @@ repeat: | |||
172 | 166 | ||
173 | sched_exit(p); | 167 | sched_exit(p); |
174 | write_unlock_irq(&tasklist_lock); | 168 | write_unlock_irq(&tasklist_lock); |
175 | spin_unlock(&p->proc_lock); | 169 | proc_flush_task(p); |
176 | proc_pid_flush(proc_dentry); | ||
177 | release_thread(p); | 170 | release_thread(p); |
178 | call_rcu(&p->rcu, delayed_put_task_struct); | 171 | call_rcu(&p->rcu, delayed_put_task_struct); |
179 | 172 | ||
@@ -579,7 +572,7 @@ static void exit_mm(struct task_struct * tsk) | |||
579 | down_read(&mm->mmap_sem); | 572 | down_read(&mm->mmap_sem); |
580 | } | 573 | } |
581 | atomic_inc(&mm->mm_count); | 574 | atomic_inc(&mm->mm_count); |
582 | if (mm != tsk->active_mm) BUG(); | 575 | BUG_ON(mm != tsk->active_mm); |
583 | /* more a memory barrier than a real lock */ | 576 | /* more a memory barrier than a real lock */ |
584 | task_lock(tsk); | 577 | task_lock(tsk); |
585 | tsk->mm = NULL; | 578 | tsk->mm = NULL; |
@@ -895,11 +888,11 @@ fastcall NORET_TYPE void do_exit(long code) | |||
895 | if (group_dead) { | 888 | if (group_dead) { |
896 | hrtimer_cancel(&tsk->signal->real_timer); | 889 | hrtimer_cancel(&tsk->signal->real_timer); |
897 | exit_itimers(tsk->signal); | 890 | exit_itimers(tsk->signal); |
898 | acct_process(code); | ||
899 | } | 891 | } |
892 | acct_collect(code, group_dead); | ||
900 | if (unlikely(tsk->robust_list)) | 893 | if (unlikely(tsk->robust_list)) |
901 | exit_robust_list(tsk); | 894 | exit_robust_list(tsk); |
902 | #ifdef CONFIG_COMPAT | 895 | #if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) |
903 | if (unlikely(tsk->compat_robust_list)) | 896 | if (unlikely(tsk->compat_robust_list)) |
904 | compat_exit_robust_list(tsk); | 897 | compat_exit_robust_list(tsk); |
905 | #endif | 898 | #endif |
@@ -907,6 +900,8 @@ fastcall NORET_TYPE void do_exit(long code) | |||
907 | audit_free(tsk); | 900 | audit_free(tsk); |
908 | exit_mm(tsk); | 901 | exit_mm(tsk); |
909 | 902 | ||
903 | if (group_dead) | ||
904 | acct_process(); | ||
910 | exit_sem(tsk); | 905 | exit_sem(tsk); |
911 | __exit_files(tsk); | 906 | __exit_files(tsk); |
912 | __exit_fs(tsk); | 907 | __exit_fs(tsk); |
@@ -930,9 +925,18 @@ fastcall NORET_TYPE void do_exit(long code) | |||
930 | tsk->mempolicy = NULL; | 925 | tsk->mempolicy = NULL; |
931 | #endif | 926 | #endif |
932 | /* | 927 | /* |
928 | * This must happen late, after the PID is not | ||
929 | * hashed anymore: | ||
930 | */ | ||
931 | if (unlikely(!list_empty(&tsk->pi_state_list))) | ||
932 | exit_pi_state_list(tsk); | ||
933 | if (unlikely(current->pi_state_cache)) | ||
934 | kfree(current->pi_state_cache); | ||
935 | /* | ||
933 | * If DEBUG_MUTEXES is on, make sure we are holding no locks: | 936 | * If DEBUG_MUTEXES is on, make sure we are holding no locks: |
934 | */ | 937 | */ |
935 | mutex_debug_check_no_locks_held(tsk); | 938 | mutex_debug_check_no_locks_held(tsk); |
939 | rt_mutex_debug_check_no_locks_held(tsk); | ||
936 | 940 | ||
937 | if (tsk->io_context) | 941 | if (tsk->io_context) |
938 | exit_io_context(); | 942 | exit_io_context(); |
@@ -1530,8 +1534,7 @@ check_continued: | |||
1530 | if (options & __WNOTHREAD) | 1534 | if (options & __WNOTHREAD) |
1531 | break; | 1535 | break; |
1532 | tsk = next_thread(tsk); | 1536 | tsk = next_thread(tsk); |
1533 | if (tsk->signal != current->signal) | 1537 | BUG_ON(tsk->signal != current->signal); |
1534 | BUG(); | ||
1535 | } while (tsk != current); | 1538 | } while (tsk != current); |
1536 | 1539 | ||
1537 | read_unlock(&tasklist_lock); | 1540 | read_unlock(&tasklist_lock); |
diff --git a/kernel/fork.c b/kernel/fork.c index ac8100e3088a..9064bf9e131b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -11,7 +11,6 @@ | |||
11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' | 11 | * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/config.h> | ||
15 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
16 | #include <linux/init.h> | 15 | #include <linux/init.h> |
17 | #include <linux/unistd.h> | 16 | #include <linux/unistd.h> |
@@ -104,6 +103,7 @@ static kmem_cache_t *mm_cachep; | |||
104 | void free_task(struct task_struct *tsk) | 103 | void free_task(struct task_struct *tsk) |
105 | { | 104 | { |
106 | free_thread_info(tsk->thread_info); | 105 | free_thread_info(tsk->thread_info); |
106 | rt_mutex_debug_task_free(tsk); | ||
107 | free_task_struct(tsk); | 107 | free_task_struct(tsk); |
108 | } | 108 | } |
109 | EXPORT_SYMBOL(free_task); | 109 | EXPORT_SYMBOL(free_task); |
@@ -368,6 +368,8 @@ void fastcall __mmdrop(struct mm_struct *mm) | |||
368 | */ | 368 | */ |
369 | void mmput(struct mm_struct *mm) | 369 | void mmput(struct mm_struct *mm) |
370 | { | 370 | { |
371 | might_sleep(); | ||
372 | |||
371 | if (atomic_dec_and_test(&mm->mm_users)) { | 373 | if (atomic_dec_and_test(&mm->mm_users)) { |
372 | exit_aio(mm); | 374 | exit_aio(mm); |
373 | exit_mmap(mm); | 375 | exit_mmap(mm); |
@@ -623,6 +625,7 @@ out: | |||
623 | /* | 625 | /* |
624 | * Allocate a new files structure and copy contents from the | 626 | * Allocate a new files structure and copy contents from the |
625 | * passed in files structure. | 627 | * passed in files structure. |
628 | * errorp will be valid only when the returned files_struct is NULL. | ||
626 | */ | 629 | */ |
627 | static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | 630 | static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) |
628 | { | 631 | { |
@@ -631,6 +634,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
631 | int open_files, size, i, expand; | 634 | int open_files, size, i, expand; |
632 | struct fdtable *old_fdt, *new_fdt; | 635 | struct fdtable *old_fdt, *new_fdt; |
633 | 636 | ||
637 | *errorp = -ENOMEM; | ||
634 | newf = alloc_files(); | 638 | newf = alloc_files(); |
635 | if (!newf) | 639 | if (!newf) |
636 | goto out; | 640 | goto out; |
@@ -744,7 +748,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |||
744 | * break this. | 748 | * break this. |
745 | */ | 749 | */ |
746 | tsk->files = NULL; | 750 | tsk->files = NULL; |
747 | error = -ENOMEM; | ||
748 | newf = dup_fd(oldf, &error); | 751 | newf = dup_fd(oldf, &error); |
749 | if (!newf) | 752 | if (!newf) |
750 | goto out; | 753 | goto out; |
@@ -871,6 +874,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
871 | tsk->it_prof_expires = | 874 | tsk->it_prof_expires = |
872 | secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | 875 | secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); |
873 | } | 876 | } |
877 | acct_init_pacct(&sig->pacct); | ||
874 | 878 | ||
875 | return 0; | 879 | return 0; |
876 | } | 880 | } |
@@ -909,6 +913,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) | |||
909 | return current->pid; | 913 | return current->pid; |
910 | } | 914 | } |
911 | 915 | ||
916 | static inline void rt_mutex_init_task(struct task_struct *p) | ||
917 | { | ||
918 | #ifdef CONFIG_RT_MUTEXES | ||
919 | spin_lock_init(&p->pi_lock); | ||
920 | plist_head_init(&p->pi_waiters, &p->pi_lock); | ||
921 | p->pi_blocked_on = NULL; | ||
922 | # ifdef CONFIG_DEBUG_RT_MUTEXES | ||
923 | spin_lock_init(&p->held_list_lock); | ||
924 | INIT_LIST_HEAD(&p->held_list_head); | ||
925 | # endif | ||
926 | #endif | ||
927 | } | ||
928 | |||
912 | /* | 929 | /* |
913 | * This creates a new process as a copy of the old one, | 930 | * This creates a new process as a copy of the old one, |
914 | * but does not actually start it yet. | 931 | * but does not actually start it yet. |
@@ -989,13 +1006,10 @@ static task_t *copy_process(unsigned long clone_flags, | |||
989 | if (put_user(p->pid, parent_tidptr)) | 1006 | if (put_user(p->pid, parent_tidptr)) |
990 | goto bad_fork_cleanup; | 1007 | goto bad_fork_cleanup; |
991 | 1008 | ||
992 | p->proc_dentry = NULL; | ||
993 | |||
994 | INIT_LIST_HEAD(&p->children); | 1009 | INIT_LIST_HEAD(&p->children); |
995 | INIT_LIST_HEAD(&p->sibling); | 1010 | INIT_LIST_HEAD(&p->sibling); |
996 | p->vfork_done = NULL; | 1011 | p->vfork_done = NULL; |
997 | spin_lock_init(&p->alloc_lock); | 1012 | spin_lock_init(&p->alloc_lock); |
998 | spin_lock_init(&p->proc_lock); | ||
999 | 1013 | ||
1000 | clear_tsk_thread_flag(p, TIF_SIGPENDING); | 1014 | clear_tsk_thread_flag(p, TIF_SIGPENDING); |
1001 | init_sigpending(&p->pending); | 1015 | init_sigpending(&p->pending); |
@@ -1033,6 +1047,8 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1033 | mpol_fix_fork_child_flag(p); | 1047 | mpol_fix_fork_child_flag(p); |
1034 | #endif | 1048 | #endif |
1035 | 1049 | ||
1050 | rt_mutex_init_task(p); | ||
1051 | |||
1036 | #ifdef CONFIG_DEBUG_MUTEXES | 1052 | #ifdef CONFIG_DEBUG_MUTEXES |
1037 | p->blocked_on = NULL; /* not blocked yet */ | 1053 | p->blocked_on = NULL; /* not blocked yet */ |
1038 | #endif | 1054 | #endif |
@@ -1075,6 +1091,9 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1075 | #ifdef CONFIG_COMPAT | 1091 | #ifdef CONFIG_COMPAT |
1076 | p->compat_robust_list = NULL; | 1092 | p->compat_robust_list = NULL; |
1077 | #endif | 1093 | #endif |
1094 | INIT_LIST_HEAD(&p->pi_state_list); | ||
1095 | p->pi_state_cache = NULL; | ||
1096 | |||
1078 | /* | 1097 | /* |
1079 | * sigaltstack should be cleared when sharing the same VM | 1098 | * sigaltstack should be cleared when sharing the same VM |
1080 | */ | 1099 | */ |
@@ -1155,18 +1174,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1155 | } | 1174 | } |
1156 | 1175 | ||
1157 | if (clone_flags & CLONE_THREAD) { | 1176 | if (clone_flags & CLONE_THREAD) { |
1158 | /* | ||
1159 | * Important: if an exit-all has been started then | ||
1160 | * do not create this new thread - the whole thread | ||
1161 | * group is supposed to exit anyway. | ||
1162 | */ | ||
1163 | if (current->signal->flags & SIGNAL_GROUP_EXIT) { | ||
1164 | spin_unlock(¤t->sighand->siglock); | ||
1165 | write_unlock_irq(&tasklist_lock); | ||
1166 | retval = -EAGAIN; | ||
1167 | goto bad_fork_cleanup_namespace; | ||
1168 | } | ||
1169 | |||
1170 | p->group_leader = current->group_leader; | 1177 | p->group_leader = current->group_leader; |
1171 | list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); | 1178 | list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); |
1172 | 1179 | ||
diff --git a/kernel/futex.c b/kernel/futex.c index 5699c512057b..15caf93e4a43 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -12,6 +12,10 @@ | |||
12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved | 12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved |
13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. | 13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. |
14 | * | 14 | * |
15 | * PI-futex support started by Ingo Molnar and Thomas Gleixner | ||
16 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
17 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
18 | * | ||
15 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 19 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
16 | * enough at me, Linus for the original (flawed) idea, Matthew | 20 | * enough at me, Linus for the original (flawed) idea, Matthew |
17 | * Kirkwood for proof-of-concept implementation. | 21 | * Kirkwood for proof-of-concept implementation. |
@@ -46,6 +50,8 @@ | |||
46 | #include <linux/signal.h> | 50 | #include <linux/signal.h> |
47 | #include <asm/futex.h> | 51 | #include <asm/futex.h> |
48 | 52 | ||
53 | #include "rtmutex_common.h" | ||
54 | |||
49 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 55 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
50 | 56 | ||
51 | /* | 57 | /* |
@@ -63,7 +69,7 @@ union futex_key { | |||
63 | int offset; | 69 | int offset; |
64 | } shared; | 70 | } shared; |
65 | struct { | 71 | struct { |
66 | unsigned long uaddr; | 72 | unsigned long address; |
67 | struct mm_struct *mm; | 73 | struct mm_struct *mm; |
68 | int offset; | 74 | int offset; |
69 | } private; | 75 | } private; |
@@ -75,6 +81,27 @@ union futex_key { | |||
75 | }; | 81 | }; |
76 | 82 | ||
77 | /* | 83 | /* |
84 | * Priority Inheritance state: | ||
85 | */ | ||
86 | struct futex_pi_state { | ||
87 | /* | ||
88 | * list of 'owned' pi_state instances - these have to be | ||
89 | * cleaned up in do_exit() if the task exits prematurely: | ||
90 | */ | ||
91 | struct list_head list; | ||
92 | |||
93 | /* | ||
94 | * The PI object: | ||
95 | */ | ||
96 | struct rt_mutex pi_mutex; | ||
97 | |||
98 | struct task_struct *owner; | ||
99 | atomic_t refcount; | ||
100 | |||
101 | union futex_key key; | ||
102 | }; | ||
103 | |||
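For reference while reading the PI code below: each futex_pi_state shadows a 32-bit futex word in userspace. Its encoding, quoted here as a sketch from this series' futex.h additions (not part of this hunk), is the owner's TID plus two flag bits:

    #define FUTEX_WAITERS    0x80000000  /* at least one waiter is queued in the kernel */
    #define FUTEX_OWNER_DIED 0x40000000  /* owner exited while holding the lock (robust cleanup) */
    #define FUTEX_TID_MASK   0x3fffffff  /* TID of the current owner, compared against current->pid */

A value of 0 means unlocked, a bare TID means locked but uncontended, and TID | FUTEX_WAITERS means the unlock must go through the kernel slow path implemented below.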
104 | /* | ||
78 | * We use this hashed waitqueue instead of a normal wait_queue_t, so | 105 | * We use this hashed waitqueue instead of a normal wait_queue_t, so |
79 | * we can wake only the relevant ones (hashed queues may be shared). | 106 | * we can wake only the relevant ones (hashed queues may be shared). |
80 | * | 107 | * |
@@ -87,15 +114,19 @@ struct futex_q { | |||
87 | struct list_head list; | 114 | struct list_head list; |
88 | wait_queue_head_t waiters; | 115 | wait_queue_head_t waiters; |
89 | 116 | ||
90 | /* Which hash list lock to use. */ | 117 | /* Which hash list lock to use: */ |
91 | spinlock_t *lock_ptr; | 118 | spinlock_t *lock_ptr; |
92 | 119 | ||
93 | /* Key which the futex is hashed on. */ | 120 | /* Key which the futex is hashed on: */ |
94 | union futex_key key; | 121 | union futex_key key; |
95 | 122 | ||
96 | /* For fd, sigio sent using these. */ | 123 | /* For fd, sigio sent using these: */ |
97 | int fd; | 124 | int fd; |
98 | struct file *filp; | 125 | struct file *filp; |
126 | |||
127 | /* Optional priority inheritance state: */ | ||
128 | struct futex_pi_state *pi_state; | ||
129 | struct task_struct *task; | ||
99 | }; | 130 | }; |
100 | 131 | ||
101 | /* | 132 | /* |
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
144 | * | 175 | * |
145 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. | 176 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. |
146 | */ | 177 | */ |
147 | static int get_futex_key(unsigned long uaddr, union futex_key *key) | 178 | static int get_futex_key(u32 __user *uaddr, union futex_key *key) |
148 | { | 179 | { |
180 | unsigned long address = (unsigned long)uaddr; | ||
149 | struct mm_struct *mm = current->mm; | 181 | struct mm_struct *mm = current->mm; |
150 | struct vm_area_struct *vma; | 182 | struct vm_area_struct *vma; |
151 | struct page *page; | 183 | struct page *page; |
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
154 | /* | 186 | /* |
155 | * The futex address must be "naturally" aligned. | 187 | * The futex address must be "naturally" aligned. |
156 | */ | 188 | */ |
157 | key->both.offset = uaddr % PAGE_SIZE; | 189 | key->both.offset = address % PAGE_SIZE; |
158 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) | 190 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) |
159 | return -EINVAL; | 191 | return -EINVAL; |
160 | uaddr -= key->both.offset; | 192 | address -= key->both.offset; |
161 | 193 | ||
162 | /* | 194 | /* |
163 | * The futex is hashed differently depending on whether | 195 | * The futex is hashed differently depending on whether |
164 | * it's in a shared or private mapping. So check vma first. | 196 | * it's in a shared or private mapping. So check vma first. |
165 | */ | 197 | */ |
166 | vma = find_extend_vma(mm, uaddr); | 198 | vma = find_extend_vma(mm, address); |
167 | if (unlikely(!vma)) | 199 | if (unlikely(!vma)) |
168 | return -EFAULT; | 200 | return -EFAULT; |
169 | 201 | ||
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
184 | */ | 216 | */ |
185 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { | 217 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { |
186 | key->private.mm = mm; | 218 | key->private.mm = mm; |
187 | key->private.uaddr = uaddr; | 219 | key->private.address = address; |
188 | return 0; | 220 | return 0; |
189 | } | 221 | } |
190 | 222 | ||
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
194 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | 226 | key->shared.inode = vma->vm_file->f_dentry->d_inode; |
195 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ |
196 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { |
197 | key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) | 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) |
198 | + vma->vm_pgoff); | 230 | + vma->vm_pgoff); |
199 | return 0; | 231 | return 0; |
200 | } | 232 | } |
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
205 | * from swap. But that's a lot of code to duplicate here | 237 | * from swap. But that's a lot of code to duplicate here |
206 | * for a rare case, so we simply fetch the page. | 238 | * for a rare case, so we simply fetch the page. |
207 | */ | 239 | */ |
208 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); | 240 | err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); |
209 | if (err >= 0) { | 241 | if (err >= 0) { |
210 | key->shared.pgoff = | 242 | key->shared.pgoff = |
211 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 243 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -246,18 +278,244 @@ static void drop_key_refs(union futex_key *key) | |||
246 | } | 278 | } |
247 | } | 279 | } |
248 | 280 | ||
249 | static inline int get_futex_value_locked(int *dest, int __user *from) | 281 | static inline int get_futex_value_locked(u32 *dest, u32 __user *from) |
250 | { | 282 | { |
251 | int ret; | 283 | int ret; |
252 | 284 | ||
253 | inc_preempt_count(); | 285 | inc_preempt_count(); |
254 | ret = __copy_from_user_inatomic(dest, from, sizeof(int)); | 286 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); |
255 | dec_preempt_count(); | 287 | dec_preempt_count(); |
256 | 288 | ||
257 | return ret ? -EFAULT : 0; | 289 | return ret ? -EFAULT : 0; |
258 | } | 290 | } |
259 | 291 | ||
260 | /* | 292 | /* |
293 | * Fault handling. Called with current->mm->mmap_sem held. | ||
294 | */ | ||
295 | static int futex_handle_fault(unsigned long address, int attempt) | ||
296 | { | ||
297 | struct vm_area_struct * vma; | ||
298 | struct mm_struct *mm = current->mm; | ||
299 | |||
300 | if (attempt >= 2 || !(vma = find_vma(mm, address)) || | ||
301 | vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) | ||
302 | return -EFAULT; | ||
303 | |||
304 | switch (handle_mm_fault(mm, vma, address, 1)) { | ||
305 | case VM_FAULT_MINOR: | ||
306 | current->min_flt++; | ||
307 | break; | ||
308 | case VM_FAULT_MAJOR: | ||
309 | current->maj_flt++; | ||
310 | break; | ||
311 | default: | ||
312 | return -EFAULT; | ||
313 | } | ||
314 | return 0; | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * PI code: | ||
319 | */ | ||
320 | static int refill_pi_state_cache(void) | ||
321 | { | ||
322 | struct futex_pi_state *pi_state; | ||
323 | |||
324 | if (likely(current->pi_state_cache)) | ||
325 | return 0; | ||
326 | |||
327 | pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); | ||
328 | |||
329 | if (!pi_state) | ||
330 | return -ENOMEM; | ||
331 | |||
332 | memset(pi_state, 0, sizeof(*pi_state)); | ||
333 | INIT_LIST_HEAD(&pi_state->list); | ||
334 | /* pi_mutex gets initialized later */ | ||
335 | pi_state->owner = NULL; | ||
336 | atomic_set(&pi_state->refcount, 1); | ||
337 | |||
338 | current->pi_state_cache = pi_state; | ||
339 | |||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | static struct futex_pi_state * alloc_pi_state(void) | ||
344 | { | ||
345 | struct futex_pi_state *pi_state = current->pi_state_cache; | ||
346 | |||
347 | WARN_ON(!pi_state); | ||
348 | current->pi_state_cache = NULL; | ||
349 | |||
350 | return pi_state; | ||
351 | } | ||
352 | |||
353 | static void free_pi_state(struct futex_pi_state *pi_state) | ||
354 | { | ||
355 | if (!atomic_dec_and_test(&pi_state->refcount)) | ||
356 | return; | ||
357 | |||
358 | /* | ||
359 | * If pi_state->owner is NULL, the owner is most probably dying | ||
360 | * and has cleaned up the pi_state already | ||
361 | */ | ||
362 | if (pi_state->owner) { | ||
363 | spin_lock_irq(&pi_state->owner->pi_lock); | ||
364 | list_del_init(&pi_state->list); | ||
365 | spin_unlock_irq(&pi_state->owner->pi_lock); | ||
366 | |||
367 | rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); | ||
368 | } | ||
369 | |||
370 | if (current->pi_state_cache) | ||
371 | kfree(pi_state); | ||
372 | else { | ||
373 | /* | ||
374 | * pi_state->list is already empty. | ||
375 | * clear pi_state->owner. | ||
376 | * refcount is at 0 - put it back to 1. | ||
377 | */ | ||
378 | pi_state->owner = NULL; | ||
379 | atomic_set(&pi_state->refcount, 1); | ||
380 | current->pi_state_cache = pi_state; | ||
381 | } | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * Look up the task based on what TID userspace gave us. | ||
386 | * We don't trust it. | ||
387 | */ | ||
388 | static struct task_struct * futex_find_get_task(pid_t pid) | ||
389 | { | ||
390 | struct task_struct *p; | ||
391 | |||
392 | read_lock(&tasklist_lock); | ||
393 | p = find_task_by_pid(pid); | ||
394 | if (!p) | ||
395 | goto out_unlock; | ||
396 | if ((current->euid != p->euid) && (current->euid != p->uid)) { | ||
397 | p = NULL; | ||
398 | goto out_unlock; | ||
399 | } | ||
400 | if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { | ||
401 | p = NULL; | ||
402 | goto out_unlock; | ||
403 | } | ||
404 | get_task_struct(p); | ||
405 | out_unlock: | ||
406 | read_unlock(&tasklist_lock); | ||
407 | |||
408 | return p; | ||
409 | } | ||
410 | |||
411 | /* | ||
412 | * This task is holding PI mutexes at exit time => bad. | ||
413 | * Kernel cleans up PI-state, but userspace is likely hosed. | ||
414 | * (Robust-futex cleanup is separate and might save the day for userspace.) | ||
415 | */ | ||
416 | void exit_pi_state_list(struct task_struct *curr) | ||
417 | { | ||
418 | struct futex_hash_bucket *hb; | ||
419 | struct list_head *next, *head = &curr->pi_state_list; | ||
420 | struct futex_pi_state *pi_state; | ||
421 | union futex_key key; | ||
422 | |||
423 | /* | ||
424 | * We are a ZOMBIE and nobody can enqueue itself on | ||
425 | * pi_state_list anymore, but we have to be careful | ||
426 | * versus waiters unqueueing themselves | ||
427 | */ | ||
428 | spin_lock_irq(&curr->pi_lock); | ||
429 | while (!list_empty(head)) { | ||
430 | |||
431 | next = head->next; | ||
432 | pi_state = list_entry(next, struct futex_pi_state, list); | ||
433 | key = pi_state->key; | ||
434 | spin_unlock_irq(&curr->pi_lock); | ||
435 | |||
436 | hb = hash_futex(&key); | ||
437 | spin_lock(&hb->lock); | ||
438 | |||
439 | spin_lock_irq(&curr->pi_lock); | ||
440 | if (head->next != next) { | ||
441 | spin_unlock(&hb->lock); | ||
442 | continue; | ||
443 | } | ||
444 | |||
445 | list_del_init(&pi_state->list); | ||
446 | |||
447 | WARN_ON(pi_state->owner != curr); | ||
448 | |||
449 | pi_state->owner = NULL; | ||
450 | spin_unlock_irq(&curr->pi_lock); | ||
451 | |||
452 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
453 | |||
454 | spin_unlock(&hb->lock); | ||
455 | |||
456 | spin_lock_irq(&curr->pi_lock); | ||
457 | } | ||
458 | spin_unlock_irq(&curr->pi_lock); | ||
459 | } | ||
460 | |||
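The drop/retake dance above encodes the lock ordering used throughout the PI code: the hash-bucket lock is the outer lock and the per-task pi_lock the inner one, so pi_lock has to be released before hb->lock is taken and the list position re-checked afterwards (the head->next != next test) in case a waiter unqueued itself in the window. Roughly:

    /* Ordering sketch, not additional code:
     *   spin_lock(&hb->lock);              outer: serializes the futex hash chain
     *     spin_lock_irq(&task->pi_lock);   inner: serializes the task's PI lists
     * Never the other way around - hence the unlock/relock/recheck above.
     */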
461 | static int | ||
462 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) | ||
463 | { | ||
464 | struct futex_pi_state *pi_state = NULL; | ||
465 | struct futex_q *this, *next; | ||
466 | struct list_head *head; | ||
467 | struct task_struct *p; | ||
468 | pid_t pid; | ||
469 | |||
470 | head = &hb->chain; | ||
471 | |||
472 | list_for_each_entry_safe(this, next, head, list) { | ||
473 | if (match_futex (&this->key, &me->key)) { | ||
474 | /* | ||
475 | * Another waiter already exists - bump up | ||
476 | * the refcount and return its pi_state: | ||
477 | */ | ||
478 | pi_state = this->pi_state; | ||
479 | atomic_inc(&pi_state->refcount); | ||
480 | me->pi_state = pi_state; | ||
481 | |||
482 | return 0; | ||
483 | } | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * We are the first waiter - try to look up the real owner and | ||
488 | * attach the new pi_state to it: | ||
489 | */ | ||
490 | pid = uval & FUTEX_TID_MASK; | ||
491 | p = futex_find_get_task(pid); | ||
492 | if (!p) | ||
493 | return -ESRCH; | ||
494 | |||
495 | pi_state = alloc_pi_state(); | ||
496 | |||
497 | /* | ||
498 | * Initialize the pi_mutex in locked state and make 'p' | ||
499 | * the owner of it: | ||
500 | */ | ||
501 | rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); | ||
502 | |||
503 | /* Store the key for possible exit cleanups: */ | ||
504 | pi_state->key = me->key; | ||
505 | |||
506 | spin_lock_irq(&p->pi_lock); | ||
507 | list_add(&pi_state->list, &p->pi_state_list); | ||
508 | pi_state->owner = p; | ||
509 | spin_unlock_irq(&p->pi_lock); | ||
510 | |||
511 | put_task_struct(p); | ||
512 | |||
513 | me->pi_state = pi_state; | ||
514 | |||
515 | return 0; | ||
516 | } | ||
517 | |||
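A worked example with a made-up value may help: if the futex word reads 0x800004d2, then FUTEX_WAITERS is set, FUTEX_OWNER_DIED is clear, and uval & FUTEX_TID_MASK is 0x4d2 = 1234. The first waiter therefore calls futex_find_get_task(1234), proxy-locks the fresh pi_mutex on that task's behalf, and hangs the new pi_state off its pi_state_list so that exit_pi_state_list() can still find it if task 1234 dies.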
518 | /* | ||
261 | * The hash bucket lock must be held when this is called. | 519 | * The hash bucket lock must be held when this is called. |
262 | * Afterwards, the futex_q must not be accessed. | 520 | * Afterwards, the futex_q must not be accessed. |
263 | */ | 521 | */ |
@@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q) | |||
284 | q->lock_ptr = NULL; | 542 | q->lock_ptr = NULL; |
285 | } | 543 | } |
286 | 544 | ||
545 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | ||
546 | { | ||
547 | struct task_struct *new_owner; | ||
548 | struct futex_pi_state *pi_state = this->pi_state; | ||
549 | u32 curval, newval; | ||
550 | |||
551 | if (!pi_state) | ||
552 | return -EINVAL; | ||
553 | |||
554 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | ||
555 | |||
556 | /* | ||
557 | * This happens when we have stolen the lock and the original | ||
558 | * pending owner did not enqueue itself back on the rt_mutex. | ||
559 | * That's not a tragedy. That way we know that a lock waiter | ||
560 | * is in flight. We make the futex_q waiter the pending owner. | ||
561 | */ | ||
562 | if (!new_owner) | ||
563 | new_owner = this->task; | ||
564 | |||
565 | /* | ||
566 | * We pass it to the next owner. (The WAITERS bit is always | ||
567 | * kept enabled while there is PI state around. We must also | ||
568 | * preserve the owner died bit.) | ||
569 | */ | ||
570 | newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; | ||
571 | |||
572 | inc_preempt_count(); | ||
573 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
574 | dec_preempt_count(); | ||
575 | |||
576 | if (curval == -EFAULT) | ||
577 | return -EFAULT; | ||
578 | if (curval != uval) | ||
579 | return -EINVAL; | ||
580 | |||
581 | list_del_init(&pi_state->owner->pi_state_list); | ||
582 | list_add(&pi_state->list, &new_owner->pi_state_list); | ||
583 | pi_state->owner = new_owner; | ||
584 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
585 | |||
586 | return 0; | ||
587 | } | ||
588 | |||
589 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | ||
590 | { | ||
591 | u32 oldval; | ||
592 | |||
593 | /* | ||
594 | * There is no waiter, so we unlock the futex. The owner died | ||
595 | * bit has not to be preserved here. We are the owner: | ||
596 | */ | ||
597 | inc_preempt_count(); | ||
598 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); | ||
599 | dec_preempt_count(); | ||
600 | |||
601 | if (oldval == -EFAULT) | ||
602 | return oldval; | ||
603 | if (oldval != uval) | ||
604 | return -EAGAIN; | ||
605 | |||
606 | return 0; | ||
607 | } | ||
608 | |||
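The two helpers above are the kernel half of the unlock protocol; userspace only enters the kernel when the word has been marked. A minimal caller-side sketch - not glibc's actual code - assuming the GCC __sync builtins, the FUTEX_UNLOCK_PI constant from this series' futex.h, and that tid is the caller's kernel TID:

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static void pi_unlock(unsigned int *uaddr, unsigned int tid)
    {
            /* Fast path: TID -> 0 while nobody is queued in the kernel. */
            if (__sync_bool_compare_and_swap(uaddr, tid, 0))
                    return;
            /* FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set: let the kernel run
             * wake_futex_pi()/unlock_futex_pi() above and hand the lock over. */
            syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
    }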
287 | /* | 609 | /* |
288 | * Wake up all waiters hashed on the physical page that is mapped | 610 | * Wake up all waiters hashed on the physical page that is mapped |
289 | * to this virtual address: | 611 | * to this virtual address: |
290 | */ | 612 | */ |
291 | static int futex_wake(unsigned long uaddr, int nr_wake) | 613 | static int futex_wake(u32 __user *uaddr, int nr_wake) |
292 | { | 614 | { |
293 | union futex_key key; | 615 | struct futex_hash_bucket *hb; |
294 | struct futex_hash_bucket *bh; | ||
295 | struct list_head *head; | ||
296 | struct futex_q *this, *next; | 616 | struct futex_q *this, *next; |
617 | struct list_head *head; | ||
618 | union futex_key key; | ||
297 | int ret; | 619 | int ret; |
298 | 620 | ||
299 | down_read(&current->mm->mmap_sem); | 621 | down_read(&current->mm->mmap_sem); |
@@ -302,19 +624,23 @@ static int futex_wake(unsigned long uaddr, int nr_wake) | |||
302 | if (unlikely(ret != 0)) | 624 | if (unlikely(ret != 0)) |
303 | goto out; | 625 | goto out; |
304 | 626 | ||
305 | bh = hash_futex(&key); | 627 | hb = hash_futex(&key); |
306 | spin_lock(&bh->lock); | 628 | spin_lock(&hb->lock); |
307 | head = &bh->chain; | 629 | head = &hb->chain; |
308 | 630 | ||
309 | list_for_each_entry_safe(this, next, head, list) { | 631 | list_for_each_entry_safe(this, next, head, list) { |
310 | if (match_futex (&this->key, &key)) { | 632 | if (match_futex (&this->key, &key)) { |
633 | if (this->pi_state) { | ||
634 | ret = -EINVAL; | ||
635 | break; | ||
636 | } | ||
311 | wake_futex(this); | 637 | wake_futex(this); |
312 | if (++ret >= nr_wake) | 638 | if (++ret >= nr_wake) |
313 | break; | 639 | break; |
314 | } | 640 | } |
315 | } | 641 | } |
316 | 642 | ||
317 | spin_unlock(&bh->lock); | 643 | spin_unlock(&hb->lock); |
318 | out: | 644 | out: |
319 | up_read(&current->mm->mmap_sem); | 645 | up_read(&current->mm->mmap_sem); |
320 | return ret; | 646 | return ret; |
@@ -324,10 +650,12 @@ out: | |||
324 | * Wake up all waiters hashed on the physical page that is mapped | 650 | * Wake up all waiters hashed on the physical page that is mapped |
325 | * to this virtual address: | 651 | * to this virtual address: |
326 | */ | 652 | */ |
327 | static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) | 653 | static int |
654 | futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, | ||
655 | int nr_wake, int nr_wake2, int op) | ||
328 | { | 656 | { |
329 | union futex_key key1, key2; | 657 | union futex_key key1, key2; |
330 | struct futex_hash_bucket *bh1, *bh2; | 658 | struct futex_hash_bucket *hb1, *hb2; |
331 | struct list_head *head; | 659 | struct list_head *head; |
332 | struct futex_q *this, *next; | 660 | struct futex_q *this, *next; |
333 | int ret, op_ret, attempt = 0; | 661 | int ret, op_ret, attempt = 0; |
@@ -342,27 +670,29 @@ retryfull: | |||
342 | if (unlikely(ret != 0)) | 670 | if (unlikely(ret != 0)) |
343 | goto out; | 671 | goto out; |
344 | 672 | ||
345 | bh1 = hash_futex(&key1); | 673 | hb1 = hash_futex(&key1); |
346 | bh2 = hash_futex(&key2); | 674 | hb2 = hash_futex(&key2); |
347 | 675 | ||
348 | retry: | 676 | retry: |
349 | if (bh1 < bh2) | 677 | if (hb1 < hb2) |
350 | spin_lock(&bh1->lock); | 678 | spin_lock(&hb1->lock); |
351 | spin_lock(&bh2->lock); | 679 | spin_lock(&hb2->lock); |
352 | if (bh1 > bh2) | 680 | if (hb1 > hb2) |
353 | spin_lock(&bh1->lock); | 681 | spin_lock(&hb1->lock); |
354 | 682 | ||
355 | op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); | 683 | op_ret = futex_atomic_op_inuser(op, uaddr2); |
356 | if (unlikely(op_ret < 0)) { | 684 | if (unlikely(op_ret < 0)) { |
357 | int dummy; | 685 | u32 dummy; |
358 | 686 | ||
359 | spin_unlock(&bh1->lock); | 687 | spin_unlock(&hb1->lock); |
360 | if (bh1 != bh2) | 688 | if (hb1 != hb2) |
361 | spin_unlock(&bh2->lock); | 689 | spin_unlock(&hb2->lock); |
362 | 690 | ||
363 | #ifndef CONFIG_MMU | 691 | #ifndef CONFIG_MMU |
364 | /* we don't get EFAULT from MMU faults if we don't have an MMU, | 692 | /* |
365 | * but we might get them from range checking */ | 693 | * we don't get EFAULT from MMU faults if we don't have an MMU, |
694 | * but we might get them from range checking | ||
695 | */ | ||
366 | ret = op_ret; | 696 | ret = op_ret; |
367 | goto out; | 697 | goto out; |
368 | #endif | 698 | #endif |
@@ -372,47 +702,34 @@ retry: | |||
372 | goto out; | 702 | goto out; |
373 | } | 703 | } |
374 | 704 | ||
375 | /* futex_atomic_op_inuser needs to both read and write | 705 | /* |
706 | * futex_atomic_op_inuser needs to both read and write | ||
376 | * *(int __user *)uaddr2, but we can't modify it | 707 | * *(int __user *)uaddr2, but we can't modify it |
377 | * non-atomically. Therefore, if get_user below is not | 708 | * non-atomically. Therefore, if get_user below is not |
378 | * enough, we need to handle the fault ourselves, while | 709 | * enough, we need to handle the fault ourselves, while |
379 | * still holding the mmap_sem. */ | 710 | * still holding the mmap_sem. |
711 | */ | ||
380 | if (attempt++) { | 712 | if (attempt++) { |
381 | struct vm_area_struct * vma; | 713 | if (futex_handle_fault((unsigned long)uaddr2, |
382 | struct mm_struct *mm = current->mm; | 714 | attempt)) |
383 | |||
384 | ret = -EFAULT; | ||
385 | if (attempt >= 2 || | ||
386 | !(vma = find_vma(mm, uaddr2)) || | ||
387 | vma->vm_start > uaddr2 || | ||
388 | !(vma->vm_flags & VM_WRITE)) | ||
389 | goto out; | ||
390 | |||
391 | switch (handle_mm_fault(mm, vma, uaddr2, 1)) { | ||
392 | case VM_FAULT_MINOR: | ||
393 | current->min_flt++; | ||
394 | break; | ||
395 | case VM_FAULT_MAJOR: | ||
396 | current->maj_flt++; | ||
397 | break; | ||
398 | default: | ||
399 | goto out; | 715 | goto out; |
400 | } | ||
401 | goto retry; | 716 | goto retry; |
402 | } | 717 | } |
403 | 718 | ||
404 | /* If we would have faulted, release mmap_sem, | 719 | /* |
405 | * fault it in and start all over again. */ | 720 | * If we would have faulted, release mmap_sem, |
721 | * fault it in and start all over again. | ||
722 | */ | ||
406 | up_read(&current->mm->mmap_sem); | 723 | up_read(&current->mm->mmap_sem); |
407 | 724 | ||
408 | ret = get_user(dummy, (int __user *)uaddr2); | 725 | ret = get_user(dummy, uaddr2); |
409 | if (ret) | 726 | if (ret) |
410 | return ret; | 727 | return ret; |
411 | 728 | ||
412 | goto retryfull; | 729 | goto retryfull; |
413 | } | 730 | } |
414 | 731 | ||
415 | head = &bh1->chain; | 732 | head = &hb1->chain; |
416 | 733 | ||
417 | list_for_each_entry_safe(this, next, head, list) { | 734 | list_for_each_entry_safe(this, next, head, list) { |
418 | if (match_futex (&this->key, &key1)) { | 735 | if (match_futex (&this->key, &key1)) { |
@@ -423,7 +740,7 @@ retry: | |||
423 | } | 740 | } |
424 | 741 | ||
425 | if (op_ret > 0) { | 742 | if (op_ret > 0) { |
426 | head = &bh2->chain; | 743 | head = &hb2->chain; |
427 | 744 | ||
428 | op_ret = 0; | 745 | op_ret = 0; |
429 | list_for_each_entry_safe(this, next, head, list) { | 746 | list_for_each_entry_safe(this, next, head, list) { |
@@ -436,9 +753,9 @@ retry: | |||
436 | ret += op_ret; | 753 | ret += op_ret; |
437 | } | 754 | } |
438 | 755 | ||
439 | spin_unlock(&bh1->lock); | 756 | spin_unlock(&hb1->lock); |
440 | if (bh1 != bh2) | 757 | if (hb1 != hb2) |
441 | spin_unlock(&bh2->lock); | 758 | spin_unlock(&hb2->lock); |
442 | out: | 759 | out: |
443 | up_read(&current->mm->mmap_sem); | 760 | up_read(&current->mm->mmap_sem); |
444 | return ret; | 761 | return ret; |
@@ -448,11 +765,11 @@ out: | |||
448 | * Requeue all waiters hashed on one physical page to another | 765 | * Requeue all waiters hashed on one physical page to another |
449 | * physical page. | 766 | * physical page. |
450 | */ | 767 | */ |
451 | static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | 768 | static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, |
452 | int nr_wake, int nr_requeue, int *valp) | 769 | int nr_wake, int nr_requeue, u32 *cmpval) |
453 | { | 770 | { |
454 | union futex_key key1, key2; | 771 | union futex_key key1, key2; |
455 | struct futex_hash_bucket *bh1, *bh2; | 772 | struct futex_hash_bucket *hb1, *hb2; |
456 | struct list_head *head1; | 773 | struct list_head *head1; |
457 | struct futex_q *this, *next; | 774 | struct futex_q *this, *next; |
458 | int ret, drop_count = 0; | 775 | int ret, drop_count = 0; |
@@ -467,68 +784,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | |||
467 | if (unlikely(ret != 0)) | 784 | if (unlikely(ret != 0)) |
468 | goto out; | 785 | goto out; |
469 | 786 | ||
470 | bh1 = hash_futex(&key1); | 787 | hb1 = hash_futex(&key1); |
471 | bh2 = hash_futex(&key2); | 788 | hb2 = hash_futex(&key2); |
472 | 789 | ||
473 | if (bh1 < bh2) | 790 | if (hb1 < hb2) |
474 | spin_lock(&bh1->lock); | 791 | spin_lock(&hb1->lock); |
475 | spin_lock(&bh2->lock); | 792 | spin_lock(&hb2->lock); |
476 | if (bh1 > bh2) | 793 | if (hb1 > hb2) |
477 | spin_lock(&bh1->lock); | 794 | spin_lock(&hb1->lock); |
478 | 795 | ||
479 | if (likely(valp != NULL)) { | 796 | if (likely(cmpval != NULL)) { |
480 | int curval; | 797 | u32 curval; |
481 | 798 | ||
482 | ret = get_futex_value_locked(&curval, (int __user *)uaddr1); | 799 | ret = get_futex_value_locked(&curval, uaddr1); |
483 | 800 | ||
484 | if (unlikely(ret)) { | 801 | if (unlikely(ret)) { |
485 | spin_unlock(&bh1->lock); | 802 | spin_unlock(&hb1->lock); |
486 | if (bh1 != bh2) | 803 | if (hb1 != hb2) |
487 | spin_unlock(&bh2->lock); | 804 | spin_unlock(&hb2->lock); |
488 | 805 | ||
489 | /* If we would have faulted, release mmap_sem, fault | 806 | /* |
807 | * If we would have faulted, release mmap_sem, fault | ||
490 | * it in and start all over again. | 808 | * it in and start all over again. |
491 | */ | 809 | */ |
492 | up_read(&current->mm->mmap_sem); | 810 | up_read(&current->mm->mmap_sem); |
493 | 811 | ||
494 | ret = get_user(curval, (int __user *)uaddr1); | 812 | ret = get_user(curval, uaddr1); |
495 | 813 | ||
496 | if (!ret) | 814 | if (!ret) |
497 | goto retry; | 815 | goto retry; |
498 | 816 | ||
499 | return ret; | 817 | return ret; |
500 | } | 818 | } |
501 | if (curval != *valp) { | 819 | if (curval != *cmpval) { |
502 | ret = -EAGAIN; | 820 | ret = -EAGAIN; |
503 | goto out_unlock; | 821 | goto out_unlock; |
504 | } | 822 | } |
505 | } | 823 | } |
506 | 824 | ||
507 | head1 = &bh1->chain; | 825 | head1 = &hb1->chain; |
508 | list_for_each_entry_safe(this, next, head1, list) { | 826 | list_for_each_entry_safe(this, next, head1, list) { |
509 | if (!match_futex (&this->key, &key1)) | 827 | if (!match_futex (&this->key, &key1)) |
510 | continue; | 828 | continue; |
511 | if (++ret <= nr_wake) { | 829 | if (++ret <= nr_wake) { |
512 | wake_futex(this); | 830 | wake_futex(this); |
513 | } else { | 831 | } else { |
514 | list_move_tail(&this->list, &bh2->chain); | 832 | /* |
515 | this->lock_ptr = &bh2->lock; | 833 | * If key1 and key2 hash to the same bucket, no need to |
834 | * requeue. | ||
835 | */ | ||
836 | if (likely(head1 != &hb2->chain)) { | ||
837 | list_move_tail(&this->list, &hb2->chain); | ||
838 | this->lock_ptr = &hb2->lock; | ||
839 | } | ||
516 | this->key = key2; | 840 | this->key = key2; |
517 | get_key_refs(&key2); | 841 | get_key_refs(&key2); |
518 | drop_count++; | 842 | drop_count++; |
519 | 843 | ||
520 | if (ret - nr_wake >= nr_requeue) | 844 | if (ret - nr_wake >= nr_requeue) |
521 | break; | 845 | break; |
522 | /* Make sure to stop if key1 == key2 */ | ||
523 | if (head1 == &bh2->chain && head1 != &next->list) | ||
524 | head1 = &this->list; | ||
525 | } | 846 | } |
526 | } | 847 | } |
527 | 848 | ||
528 | out_unlock: | 849 | out_unlock: |
529 | spin_unlock(&bh1->lock); | 850 | spin_unlock(&hb1->lock); |
530 | if (bh1 != bh2) | 851 | if (hb1 != hb2) |
531 | spin_unlock(&bh2->lock); | 852 | spin_unlock(&hb2->lock); |
532 | 853 | ||
533 | /* drop_key_refs() must be called outside the spinlocks. */ | 854 | /* drop_key_refs() must be called outside the spinlocks. */ |
534 | while (--drop_count >= 0) | 855 | while (--drop_count >= 0) |
@@ -543,7 +864,7 @@ out: | |||
543 | static inline struct futex_hash_bucket * | 864 | static inline struct futex_hash_bucket * |
544 | queue_lock(struct futex_q *q, int fd, struct file *filp) | 865 | queue_lock(struct futex_q *q, int fd, struct file *filp) |
545 | { | 866 | { |
546 | struct futex_hash_bucket *bh; | 867 | struct futex_hash_bucket *hb; |
547 | 868 | ||
548 | q->fd = fd; | 869 | q->fd = fd; |
549 | q->filp = filp; | 870 | q->filp = filp; |
@@ -551,23 +872,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) | |||
551 | init_waitqueue_head(&q->waiters); | 872 | init_waitqueue_head(&q->waiters); |
552 | 873 | ||
553 | get_key_refs(&q->key); | 874 | get_key_refs(&q->key); |
554 | bh = hash_futex(&q->key); | 875 | hb = hash_futex(&q->key); |
555 | q->lock_ptr = &bh->lock; | 876 | q->lock_ptr = &hb->lock; |
556 | 877 | ||
557 | spin_lock(&bh->lock); | 878 | spin_lock(&hb->lock); |
558 | return bh; | 879 | return hb; |
559 | } | 880 | } |
560 | 881 | ||
561 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) | 882 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) |
562 | { | 883 | { |
563 | list_add_tail(&q->list, &bh->chain); | 884 | list_add_tail(&q->list, &hb->chain); |
564 | spin_unlock(&bh->lock); | 885 | q->task = current; |
886 | spin_unlock(&hb->lock); | ||
565 | } | 887 | } |
566 | 888 | ||
567 | static inline void | 889 | static inline void |
568 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | 890 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
569 | { | 891 | { |
570 | spin_unlock(&bh->lock); | 892 | spin_unlock(&hb->lock); |
571 | drop_key_refs(&q->key); | 893 | drop_key_refs(&q->key); |
572 | } | 894 | } |
573 | 895 | ||
@@ -579,16 +901,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | |||
579 | /* The key must be already stored in q->key. */ | 901 | /* The key must be already stored in q->key. */ |
580 | static void queue_me(struct futex_q *q, int fd, struct file *filp) | 902 | static void queue_me(struct futex_q *q, int fd, struct file *filp) |
581 | { | 903 | { |
582 | struct futex_hash_bucket *bh; | 904 | struct futex_hash_bucket *hb; |
583 | bh = queue_lock(q, fd, filp); | 905 | |
584 | __queue_me(q, bh); | 906 | hb = queue_lock(q, fd, filp); |
907 | __queue_me(q, hb); | ||
585 | } | 908 | } |
586 | 909 | ||
587 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ | 910 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ |
588 | static int unqueue_me(struct futex_q *q) | 911 | static int unqueue_me(struct futex_q *q) |
589 | { | 912 | { |
590 | int ret = 0; | ||
591 | spinlock_t *lock_ptr; | 913 | spinlock_t *lock_ptr; |
914 | int ret = 0; | ||
592 | 915 | ||
593 | /* In the common case we don't take the spinlock, which is nice. */ | 916 | /* In the common case we don't take the spinlock, which is nice. */ |
594 | retry: | 917 | retry: |
@@ -614,6 +937,9 @@ static int unqueue_me(struct futex_q *q) | |||
614 | } | 937 | } |
615 | WARN_ON(list_empty(&q->list)); | 938 | WARN_ON(list_empty(&q->list)); |
616 | list_del(&q->list); | 939 | list_del(&q->list); |
940 | |||
941 | BUG_ON(q->pi_state); | ||
942 | |||
617 | spin_unlock(lock_ptr); | 943 | spin_unlock(lock_ptr); |
618 | ret = 1; | 944 | ret = 1; |
619 | } | 945 | } |
@@ -622,21 +948,42 @@ static int unqueue_me(struct futex_q *q) | |||
622 | return ret; | 948 | return ret; |
623 | } | 949 | } |
624 | 950 | ||
625 | static int futex_wait(unsigned long uaddr, int val, unsigned long time) | 951 | /* |
952 | * PI futexes can not be requeued and must remove themself from the | ||
953 | * hash bucket. The hash bucket lock is held on entry and dropped here. | ||
954 | */ | ||
955 | static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) | ||
626 | { | 956 | { |
627 | DECLARE_WAITQUEUE(wait, current); | 957 | WARN_ON(list_empty(&q->list)); |
628 | int ret, curval; | 958 | list_del(&q->list); |
959 | |||
960 | BUG_ON(!q->pi_state); | ||
961 | free_pi_state(q->pi_state); | ||
962 | q->pi_state = NULL; | ||
963 | |||
964 | spin_unlock(&hb->lock); | ||
965 | |||
966 | drop_key_refs(&q->key); | ||
967 | } | ||
968 | |||
969 | static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) | ||
970 | { | ||
971 | struct task_struct *curr = current; | ||
972 | DECLARE_WAITQUEUE(wait, curr); | ||
973 | struct futex_hash_bucket *hb; | ||
629 | struct futex_q q; | 974 | struct futex_q q; |
630 | struct futex_hash_bucket *bh; | 975 | u32 uval; |
976 | int ret; | ||
631 | 977 | ||
978 | q.pi_state = NULL; | ||
632 | retry: | 979 | retry: |
633 | down_read(&current->mm->mmap_sem); | 980 | down_read(&curr->mm->mmap_sem); |
634 | 981 | ||
635 | ret = get_futex_key(uaddr, &q.key); | 982 | ret = get_futex_key(uaddr, &q.key); |
636 | if (unlikely(ret != 0)) | 983 | if (unlikely(ret != 0)) |
637 | goto out_release_sem; | 984 | goto out_release_sem; |
638 | 985 | ||
639 | bh = queue_lock(&q, -1, NULL); | 986 | hb = queue_lock(&q, -1, NULL); |
640 | 987 | ||
641 | /* | 988 | /* |
642 | * Access the page AFTER the futex is queued. | 989 | * Access the page AFTER the futex is queued. |
@@ -658,37 +1005,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
658 | * We hold the mmap semaphore, so the mapping cannot have changed | 1005 | * We hold the mmap semaphore, so the mapping cannot have changed |
659 | * since we looked it up in get_futex_key. | 1006 | * since we looked it up in get_futex_key. |
660 | */ | 1007 | */ |
661 | 1008 | ret = get_futex_value_locked(&uval, uaddr); | |
662 | ret = get_futex_value_locked(&curval, (int __user *)uaddr); | ||
663 | 1009 | ||
664 | if (unlikely(ret)) { | 1010 | if (unlikely(ret)) { |
665 | queue_unlock(&q, bh); | 1011 | queue_unlock(&q, hb); |
666 | 1012 | ||
667 | /* If we would have faulted, release mmap_sem, fault it in and | 1013 | /* |
1014 | * If we would have faulted, release mmap_sem, fault it in and | ||
668 | * start all over again. | 1015 | * start all over again. |
669 | */ | 1016 | */ |
670 | up_read(&current->mm->mmap_sem); | 1017 | up_read(&curr->mm->mmap_sem); |
671 | 1018 | ||
672 | ret = get_user(curval, (int __user *)uaddr); | 1019 | ret = get_user(uval, uaddr); |
673 | 1020 | ||
674 | if (!ret) | 1021 | if (!ret) |
675 | goto retry; | 1022 | goto retry; |
676 | return ret; | 1023 | return ret; |
677 | } | 1024 | } |
678 | if (curval != val) { | 1025 | ret = -EWOULDBLOCK; |
679 | ret = -EWOULDBLOCK; | 1026 | if (uval != val) |
680 | queue_unlock(&q, bh); | 1027 | goto out_unlock_release_sem; |
681 | goto out_release_sem; | ||
682 | } | ||
683 | 1028 | ||
684 | /* Only actually queue if *uaddr contained val. */ | 1029 | /* Only actually queue if *uaddr contained val. */ |
685 | __queue_me(&q, bh); | 1030 | __queue_me(&q, hb); |
686 | 1031 | ||
687 | /* | 1032 | /* |
688 | * Now the futex is queued and we have checked the data, we | 1033 | * Now the futex is queued and we have checked the data, we |
689 | * don't want to hold mmap_sem while we sleep. | 1034 | * don't want to hold mmap_sem while we sleep. |
690 | */ | 1035 | */ |
691 | up_read(&current->mm->mmap_sem); | 1036 | up_read(&curr->mm->mmap_sem); |
692 | 1037 | ||
693 | /* | 1038 | /* |
694 | * There might have been scheduling since the queue_me(), as we | 1039 | * There might have been scheduling since the queue_me(), as we |
@@ -720,12 +1065,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
720 | return 0; | 1065 | return 0; |
721 | if (time == 0) | 1066 | if (time == 0) |
722 | return -ETIMEDOUT; | 1067 | return -ETIMEDOUT; |
723 | /* We expect signal_pending(current), but another thread may | 1068 | /* |
724 | * have handled it for us already. */ | 1069 | * We expect signal_pending(current), but another thread may |
1070 | * have handled it for us already. | ||
1071 | */ | ||
725 | return -EINTR; | 1072 | return -EINTR; |
726 | 1073 | ||
1074 | out_unlock_release_sem: | ||
1075 | queue_unlock(&q, hb); | ||
1076 | |||
727 | out_release_sem: | 1077 | out_release_sem: |
1078 | up_read(&curr->mm->mmap_sem); | ||
1079 | return ret; | ||
1080 | } | ||
1081 | |||
1082 | /* | ||
1083 | * Userspace tried a 0 -> TID atomic transition of the futex value | ||
1084 | * and failed. The kernel side here does the whole locking operation: | ||
1085 | * if there are waiters then it will block, it does PI, etc. (Due to | ||
1086 | * races the kernel might see a 0 value of the futex too.) | ||
1087 | */ | ||
1088 | static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, | ||
1089 | struct hrtimer_sleeper *to) | ||
1090 | { | ||
1091 | struct task_struct *curr = current; | ||
1092 | struct futex_hash_bucket *hb; | ||
1093 | u32 uval, newval, curval; | ||
1094 | struct futex_q q; | ||
1095 | int ret, attempt = 0; | ||
1096 | |||
1097 | if (refill_pi_state_cache()) | ||
1098 | return -ENOMEM; | ||
1099 | |||
1100 | q.pi_state = NULL; | ||
1101 | retry: | ||
1102 | down_read(&curr->mm->mmap_sem); | ||
1103 | |||
1104 | ret = get_futex_key(uaddr, &q.key); | ||
1105 | if (unlikely(ret != 0)) | ||
1106 | goto out_release_sem; | ||
1107 | |||
1108 | hb = queue_lock(&q, -1, NULL); | ||
1109 | |||
1110 | retry_locked: | ||
1111 | /* | ||
1112 | * To avoid races, we attempt to take the lock here again | ||
1113 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
1114 | * the locks. It will most likely not succeed. | ||
1115 | */ | ||
1116 | newval = current->pid; | ||
1117 | |||
1118 | inc_preempt_count(); | ||
1119 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); | ||
1120 | dec_preempt_count(); | ||
1121 | |||
1122 | if (unlikely(curval == -EFAULT)) | ||
1123 | goto uaddr_faulted; | ||
1124 | |||
1125 | /* We own the lock already */ | ||
1126 | if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { | ||
1127 | if (!detect && 0) | ||
1128 | force_sig(SIGKILL, current); | ||
1129 | ret = -EDEADLK; | ||
1130 | goto out_unlock_release_sem; | ||
1131 | } | ||
1132 | |||
1133 | /* | ||
1134 | * Surprise - we got the lock. Just return | ||
1135 | * to userspace: | ||
1136 | */ | ||
1137 | if (unlikely(!curval)) | ||
1138 | goto out_unlock_release_sem; | ||
1139 | |||
1140 | uval = curval; | ||
1141 | newval = uval | FUTEX_WAITERS; | ||
1142 | |||
1143 | inc_preempt_count(); | ||
1144 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
1145 | dec_preempt_count(); | ||
1146 | |||
1147 | if (unlikely(curval == -EFAULT)) | ||
1148 | goto uaddr_faulted; | ||
1149 | if (unlikely(curval != uval)) | ||
1150 | goto retry_locked; | ||
1151 | |||
1152 | /* | ||
1153 | * We don't have the lock. Look up the PI state (or create it if | ||
1154 | * we are the first waiter): | ||
1155 | */ | ||
1156 | ret = lookup_pi_state(uval, hb, &q); | ||
1157 | |||
1158 | if (unlikely(ret)) { | ||
1159 | /* | ||
1160 | * There were no waiters and the owner task lookup | ||
1161 | * failed. When the OWNER_DIED bit is set, then we | ||
1162 | * know that this is a robust futex and we actually | ||
1163 | * take the lock. This is safe as we are protected by | ||
1164 | * the hash bucket lock. We also set the waiters bit | ||
1165 | * unconditionally here, to simplify glibc handling of | ||
1166 | * multiple tasks racing to acquire the lock and | ||
1167 | * clean up the problems which were left by the dead | ||
1168 | * owner. | ||
1169 | */ | ||
1170 | if (curval & FUTEX_OWNER_DIED) { | ||
1171 | uval = newval; | ||
1172 | newval = current->pid | | ||
1173 | FUTEX_OWNER_DIED | FUTEX_WAITERS; | ||
1174 | |||
1175 | inc_preempt_count(); | ||
1176 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
1177 | uval, newval); | ||
1178 | dec_preempt_count(); | ||
1179 | |||
1180 | if (unlikely(curval == -EFAULT)) | ||
1181 | goto uaddr_faulted; | ||
1182 | if (unlikely(curval != uval)) | ||
1183 | goto retry_locked; | ||
1184 | ret = 0; | ||
1185 | } | ||
1186 | goto out_unlock_release_sem; | ||
1187 | } | ||
1188 | |||
1189 | /* | ||
1190 | * Only actually queue now that the atomic ops are done: | ||
1191 | */ | ||
1192 | __queue_me(&q, hb); | ||
1193 | |||
1194 | /* | ||
1195 | * Now the futex is queued and we have checked the data, we | ||
1196 | * don't want to hold mmap_sem while we sleep. | ||
1197 | */ | ||
1198 | up_read(&curr->mm->mmap_sem); | ||
1199 | |||
1200 | WARN_ON(!q.pi_state); | ||
1201 | /* | ||
1202 | * Block on the PI mutex: | ||
1203 | */ | ||
1204 | if (!trylock) | ||
1205 | ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); | ||
1206 | else { | ||
1207 | ret = rt_mutex_trylock(&q.pi_state->pi_mutex); | ||
1208 | /* Fixup the trylock return value: */ | ||
1209 | ret = ret ? 0 : -EWOULDBLOCK; | ||
1210 | } | ||
1211 | |||
1212 | down_read(&curr->mm->mmap_sem); | ||
1213 | spin_lock(q.lock_ptr); | ||
1214 | |||
1215 | /* | ||
1216 | * Got the lock. We might not be the anticipated owner if we | ||
1217 | * did a lock-steal - fix up the PI-state in that case. | ||
1218 | */ | ||
1219 | if (!ret && q.pi_state->owner != curr) { | ||
1220 | u32 newtid = current->pid | FUTEX_WAITERS; | ||
1221 | |||
1222 | /* Owner died? */ | ||
1223 | if (q.pi_state->owner != NULL) { | ||
1224 | spin_lock_irq(&q.pi_state->owner->pi_lock); | ||
1225 | list_del_init(&q.pi_state->list); | ||
1226 | spin_unlock_irq(&q.pi_state->owner->pi_lock); | ||
1227 | } else | ||
1228 | newtid |= FUTEX_OWNER_DIED; | ||
1229 | |||
1230 | q.pi_state->owner = current; | ||
1231 | |||
1232 | spin_lock_irq(&current->pi_lock); | ||
1233 | list_add(&q.pi_state->list, &current->pi_state_list); | ||
1234 | spin_unlock_irq(&current->pi_lock); | ||
1235 | |||
1236 | /* Unqueue and drop the lock */ | ||
1237 | unqueue_me_pi(&q, hb); | ||
1238 | up_read(&curr->mm->mmap_sem); | ||
1239 | /* | ||
1240 | * We own it, so we have to replace the pending owner | ||
1241 | * TID. This must be atomic as we have to preserve the | ||
1242 | * owner died bit here. | ||
1243 | */ | ||
1244 | ret = get_user(uval, uaddr); | ||
1245 | while (!ret) { | ||
1246 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | ||
1247 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
1248 | uval, newval); | ||
1249 | if (curval == -EFAULT) | ||
1250 | ret = -EFAULT; | ||
1251 | if (curval == uval) | ||
1252 | break; | ||
1253 | uval = curval; | ||
1254 | } | ||
1255 | } else { | ||
1256 | /* | ||
1257 | * Catch the rare case, where the lock was released | ||
1258 | * when we were on the way back before we locked | ||
1259 | * the hash bucket. | ||
1260 | */ | ||
1261 | if (ret && q.pi_state->owner == curr) { | ||
1262 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
1263 | ret = 0; | ||
1264 | } | ||
1265 | /* Unqueue and drop the lock */ | ||
1266 | unqueue_me_pi(&q, hb); | ||
1267 | up_read(&curr->mm->mmap_sem); | ||
1268 | } | ||
1269 | |||
1270 | if (!detect && ret == -EDEADLK && 0) | ||
1271 | force_sig(SIGKILL, current); | ||
1272 | |||
1273 | return ret; | ||
1274 | |||
1275 | out_unlock_release_sem: | ||
1276 | queue_unlock(&q, hb); | ||
1277 | |||
1278 | out_release_sem: | ||
1279 | up_read(&curr->mm->mmap_sem); | ||
1280 | return ret; | ||
1281 | |||
1282 | uaddr_faulted: | ||
1283 | /* | ||
1284 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
1285 | * non-atomically. Therefore, if get_user below is not | ||
1286 | * enough, we need to handle the fault ourselves, while | ||
1287 | * still holding the mmap_sem. | ||
1288 | */ | ||
1289 | if (attempt++) { | ||
1290 | if (futex_handle_fault((unsigned long)uaddr, attempt)) | ||
1291 | goto out_unlock_release_sem; | ||
1292 | |||
1293 | goto retry_locked; | ||
1294 | } | ||
1295 | |||
1296 | queue_unlock(&q, hb); | ||
1297 | up_read(&curr->mm->mmap_sem); | ||
1298 | |||
1299 | ret = get_user(uval, uaddr); | ||
1300 | if (!ret && (uval != -EFAULT)) | ||
1301 | goto retry; | ||
1302 | |||
1303 | return ret; | ||
1304 | } | ||
1305 | |||
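do_futex_lock_pi() above is only the contended half of the story. The "userspace 0 -> TID atomic transition" its comment refers to looks roughly like this on the caller side (same assumptions as the unlock sketch earlier; val=0 requests no deadlock detection, utime=NULL means no timeout):

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int pi_lock(unsigned int *uaddr, unsigned int tid)
    {
            /* Fast path: take a free lock without entering the kernel. */
            if (__sync_bool_compare_and_swap(uaddr, 0, tid))
                    return 0;
            /* Contended (or the owner died): the kernel sets FUTEX_WAITERS,
             * queues us and blocks on the rt_mutex, boosting the owner. */
            return syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
    }

On success the futex word holds the caller's TID, possibly with FUTEX_WAITERS and FUTEX_OWNER_DIED still set, as the lock-steal fixup above arranges.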
1306 | /* | ||
1307 | * Restart handler | ||
1308 | */ | ||
1309 | static long futex_lock_pi_restart(struct restart_block *restart) | ||
1310 | { | ||
1311 | struct hrtimer_sleeper timeout, *to = NULL; | ||
1312 | int ret; | ||
1313 | |||
1314 | restart->fn = do_no_restart_syscall; | ||
1315 | |||
1316 | if (restart->arg2 || restart->arg3) { | ||
1317 | to = &timeout; | ||
1318 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | ||
1319 | hrtimer_init_sleeper(to, current); | ||
1320 | to->timer.expires.tv64 = ((u64)restart->arg3 << 32) | | ||
1321 | (u64) restart->arg2; | ||
1322 | } | ||
1323 | |||
1324 | pr_debug("lock_pi restart: %p, %d\n", | ||
1325 | (u32 __user *)restart->arg0, current->pid); | ||
1326 | |||
1327 | ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, | ||
1328 | 0, to); | ||
1329 | |||
1330 | if (ret != -EINTR) | ||
1331 | return ret; | ||
1332 | |||
1333 | restart->fn = futex_lock_pi_restart; | ||
1334 | |||
1335 | /* The other values are filled in */ | ||
1336 | return -ERESTART_RESTARTBLOCK; | ||
1337 | } | ||
1338 | |||
1339 | /* | ||
1340 | * Called from the syscall entry below. | ||
1341 | */ | ||
1342 | static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | ||
1343 | long nsec, int trylock) | ||
1344 | { | ||
1345 | struct hrtimer_sleeper timeout, *to = NULL; | ||
1346 | struct restart_block *restart; | ||
1347 | int ret; | ||
1348 | |||
1349 | if (sec != MAX_SCHEDULE_TIMEOUT) { | ||
1350 | to = &timeout; | ||
1351 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | ||
1352 | hrtimer_init_sleeper(to, current); | ||
1353 | to->timer.expires = ktime_set(sec, nsec); | ||
1354 | } | ||
1355 | |||
1356 | ret = do_futex_lock_pi(uaddr, detect, trylock, to); | ||
1357 | |||
1358 | if (ret != -EINTR) | ||
1359 | return ret; | ||
1360 | |||
1361 | pr_debug("lock_pi interrupted: %p, %d\n", uaddr, current->pid); | ||
1362 | |||
1363 | restart = &current_thread_info()->restart_block; | ||
1364 | restart->fn = futex_lock_pi_restart; | ||
1365 | restart->arg0 = (unsigned long) uaddr; | ||
1366 | restart->arg1 = detect; | ||
1367 | if (to) { | ||
1368 | restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; | ||
1369 | restart->arg3 = to->timer.expires.tv64 >> 32; | ||
1370 | } else | ||
1371 | restart->arg2 = restart->arg3 = 0; | ||
1372 | |||
1373 | return -ERESTART_RESTARTBLOCK; | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1377 | * Userspace attempted a TID -> 0 atomic transition, and failed. | ||
1378 | * This is the in-kernel slowpath: we look up the PI state (if any), | ||
1379 | * and do the rt-mutex unlock. | ||
1380 | */ | ||
1381 | static int futex_unlock_pi(u32 __user *uaddr) | ||
1382 | { | ||
1383 | struct futex_hash_bucket *hb; | ||
1384 | struct futex_q *this, *next; | ||
1385 | u32 uval; | ||
1386 | struct list_head *head; | ||
1387 | union futex_key key; | ||
1388 | int ret, attempt = 0; | ||
1389 | |||
1390 | retry: | ||
1391 | if (get_user(uval, uaddr)) | ||
1392 | return -EFAULT; | ||
1393 | /* | ||
1394 | * We release only a lock we actually own: | ||
1395 | */ | ||
1396 | if ((uval & FUTEX_TID_MASK) != current->pid) | ||
1397 | return -EPERM; | ||
1398 | /* | ||
1399 | * First take all the futex related locks: | ||
1400 | */ | ||
1401 | down_read(&current->mm->mmap_sem); | ||
1402 | |||
1403 | ret = get_futex_key(uaddr, &key); | ||
1404 | if (unlikely(ret != 0)) | ||
1405 | goto out; | ||
1406 | |||
1407 | hb = hash_futex(&key); | ||
1408 | spin_lock(&hb->lock); | ||
1409 | |||
1410 | retry_locked: | ||
1411 | /* | ||
1412 | * To avoid races, try to do the TID -> 0 atomic transition | ||
1413 | * again. If it succeeds then we can return without waking | ||
1414 | * anyone else up: | ||
1415 | */ | ||
1416 | inc_preempt_count(); | ||
1417 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); | ||
1418 | dec_preempt_count(); | ||
1419 | |||
1420 | if (unlikely(uval == -EFAULT)) | ||
1421 | goto pi_faulted; | ||
1422 | /* | ||
1423 | * Rare case: we managed to release the lock atomically, | ||
1424 | * no need to wake anyone else up: | ||
1425 | */ | ||
1426 | if (unlikely(uval == current->pid)) | ||
1427 | goto out_unlock; | ||
1428 | |||
1429 | /* | ||
1430 | * Ok, other tasks may need to be woken up - check waiters | ||
1431 | * and do the wakeup if necessary: | ||
1432 | */ | ||
1433 | head = &hb->chain; | ||
1434 | |||
1435 | list_for_each_entry_safe(this, next, head, list) { | ||
1436 | if (!match_futex (&this->key, &key)) | ||
1437 | continue; | ||
1438 | ret = wake_futex_pi(uaddr, uval, this); | ||
1439 | /* | ||
1440 | * The atomic access to the futex value | ||
1441 | * generated a pagefault, so retry the | ||
1442 | * user-access and the wakeup: | ||
1443 | */ | ||
1444 | if (ret == -EFAULT) | ||
1445 | goto pi_faulted; | ||
1446 | goto out_unlock; | ||
1447 | } | ||
1448 | /* | ||
1449 | * No waiters - kernel unlocks the futex: | ||
1450 | */ | ||
1451 | ret = unlock_futex_pi(uaddr, uval); | ||
1452 | if (ret == -EFAULT) | ||
1453 | goto pi_faulted; | ||
1454 | |||
1455 | out_unlock: | ||
1456 | spin_unlock(&hb->lock); | ||
1457 | out: | ||
728 | up_read(&current->mm->mmap_sem); | 1458 | up_read(&current->mm->mmap_sem); |
1459 | |||
1460 | return ret; | ||
1461 | |||
1462 | pi_faulted: | ||
1463 | /* | ||
1464 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
1465 | * non-atomically. Therefore, if get_user below is not | ||
1466 | * enough, we need to handle the fault ourselves, while | ||
1467 | * still holding the mmap_sem. | ||
1468 | */ | ||
1469 | if (attempt++) { | ||
1470 | if (futex_handle_fault((unsigned long)uaddr, attempt)) | ||
1471 | goto out_unlock; | ||
1472 | |||
1473 | goto retry_locked; | ||
1474 | } | ||
1475 | |||
1476 | spin_unlock(&hb->lock); | ||
1477 | up_read(&current->mm->mmap_sem); | ||
1478 | |||
1479 | ret = get_user(uval, uaddr); | ||
1480 | if (!ret && (uval != -EFAULT)) | ||
1481 | goto retry; | ||
1482 | |||
729 | return ret; | 1483 | return ret; |
730 | } | 1484 | } |
731 | 1485 | ||
@@ -735,6 +1489,7 @@ static int futex_close(struct inode *inode, struct file *filp) | |||
735 | 1489 | ||
736 | unqueue_me(q); | 1490 | unqueue_me(q); |
737 | kfree(q); | 1491 | kfree(q); |
1492 | |||
738 | return 0; | 1493 | return 0; |
739 | } | 1494 | } |
740 | 1495 | ||
@@ -766,7 +1521,7 @@ static struct file_operations futex_fops = { | |||
766 | * Signal allows caller to avoid the race which would occur if they | 1521 | * Signal allows caller to avoid the race which would occur if they |
767 | * set the sigio stuff up afterwards. | 1522 | * set the sigio stuff up afterwards. |
768 | */ | 1523 | */ |
769 | static int futex_fd(unsigned long uaddr, int signal) | 1524 | static int futex_fd(u32 __user *uaddr, int signal) |
770 | { | 1525 | { |
771 | struct futex_q *q; | 1526 | struct futex_q *q; |
772 | struct file *filp; | 1527 | struct file *filp; |
@@ -803,6 +1558,7 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
803 | err = -ENOMEM; | 1558 | err = -ENOMEM; |
804 | goto error; | 1559 | goto error; |
805 | } | 1560 | } |
1561 | q->pi_state = NULL; | ||
806 | 1562 | ||
807 | down_read(&current->mm->mmap_sem); | 1563 | down_read(&current->mm->mmap_sem); |
808 | err = get_futex_key(uaddr, &q->key); | 1564 | err = get_futex_key(uaddr, &q->key); |
@@ -840,7 +1596,7 @@ error: | |||
840 | * Implementation: user-space maintains a per-thread list of locks it | 1596 | * Implementation: user-space maintains a per-thread list of locks it |
841 | * is holding. Upon do_exit(), the kernel carefully walks this list, | 1597 | * is holding. Upon do_exit(), the kernel carefully walks this list, |
842 | * and marks all locks that are owned by this thread with the | 1598 | * and marks all locks that are owned by this thread with the |
843 | * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is | 1599 | * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is |
844 | * always manipulated with the lock held, so the list is private and | 1600 | * always manipulated with the lock held, so the list is private and |
845 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' | 1601 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' |
846 | * field, to allow the kernel to clean up if the thread dies after | 1602 | * field, to allow the kernel to clean up if the thread dies after |
@@ -915,7 +1671,7 @@ err_unlock: | |||
915 | */ | 1671 | */ |
916 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) | 1672 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) |
917 | { | 1673 | { |
918 | u32 uval; | 1674 | u32 uval, nval; |
919 | 1675 | ||
920 | retry: | 1676 | retry: |
921 | if (get_user(uval, uaddr)) | 1677 | if (get_user(uval, uaddr)) |
@@ -932,12 +1688,16 @@ retry: | |||
932 | * thread-death.) The rest of the cleanup is done in | 1688 | * thread-death.) The rest of the cleanup is done in |
933 | * userspace. | 1689 | * userspace. |
934 | */ | 1690 | */ |
935 | if (futex_atomic_cmpxchg_inatomic(uaddr, uval, | 1691 | nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, |
936 | uval | FUTEX_OWNER_DIED) != uval) | 1692 | uval | FUTEX_OWNER_DIED); |
1693 | if (nval == -EFAULT) | ||
1694 | return -1; | ||
1695 | |||
1696 | if (nval != uval) | ||
937 | goto retry; | 1697 | goto retry; |
938 | 1698 | ||
939 | if (uval & FUTEX_WAITERS) | 1699 | if (uval & FUTEX_WAITERS) |
940 | futex_wake((unsigned long)uaddr, 1); | 1700 | futex_wake(uaddr, 1); |
941 | } | 1701 | } |
942 | return 0; | 1702 | return 0; |
943 | } | 1703 | } |
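handle_futex_death() only tags the word and wakes one waiter; making the protected data usable again is userspace's job. A hedged sketch of what the next acquirer of a robust PI lock might do, reusing pi_lock() from the earlier sketch - recover_state() is a hypothetical application hook, not anything defined by this patch:

    if (pi_lock(&lock_word, my_tid) == 0 &&
        (lock_word & FUTEX_OWNER_DIED)) {
            /* The previous owner died mid-critical-section, so the data it
             * protected may be half-updated. Recover it before use; how and
             * when the flag is cleared again is left to the userspace library. */
            recover_state();
    }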
@@ -978,7 +1738,7 @@ void exit_robust_list(struct task_struct *curr) | |||
978 | while (entry != &head->list) { | 1738 | while (entry != &head->list) { |
979 | /* | 1739 | /* |
980 | * A pending lock might already be on the list, so | 1740 | * A pending lock might already be on the list, so |
981 | * dont process it twice: | 1741 | * don't process it twice: |
982 | */ | 1742 | */ |
983 | if (entry != pending) | 1743 | if (entry != pending) |
984 | if (handle_futex_death((void *)entry + futex_offset, | 1744 | if (handle_futex_death((void *)entry + futex_offset, |
@@ -999,8 +1759,8 @@ void exit_robust_list(struct task_struct *curr) | |||
999 | } | 1759 | } |
1000 | } | 1760 | } |
1001 | 1761 | ||
1002 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | 1762 | long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, |
1003 | unsigned long uaddr2, int val2, int val3) | 1763 | u32 __user *uaddr2, u32 val2, u32 val3) |
1004 | { | 1764 | { |
1005 | int ret; | 1765 | int ret; |
1006 | 1766 | ||
@@ -1024,6 +1784,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
1024 | case FUTEX_WAKE_OP: | 1784 | case FUTEX_WAKE_OP: |
1025 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); | 1785 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); |
1026 | break; | 1786 | break; |
1787 | case FUTEX_LOCK_PI: | ||
1788 | ret = futex_lock_pi(uaddr, val, timeout, val2, 0); | ||
1789 | break; | ||
1790 | case FUTEX_UNLOCK_PI: | ||
1791 | ret = futex_unlock_pi(uaddr); | ||
1792 | break; | ||
1793 | case FUTEX_TRYLOCK_PI: | ||
1794 | ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); | ||
1795 | break; | ||
1027 | default: | 1796 | default: |
1028 | ret = -ENOSYS; | 1797 | ret = -ENOSYS; |
1029 | } | 1798 | } |
@@ -1031,36 +1800,40 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
1031 | } | 1800 | } |
1032 | 1801 | ||
1033 | 1802 | ||
1034 | asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, | 1803 | asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, |
1035 | struct timespec __user *utime, u32 __user *uaddr2, | 1804 | struct timespec __user *utime, u32 __user *uaddr2, |
1036 | int val3) | 1805 | u32 val3) |
1037 | { | 1806 | { |
1038 | struct timespec t; | 1807 | struct timespec t; |
1039 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 1808 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
1040 | int val2 = 0; | 1809 | u32 val2 = 0; |
1041 | 1810 | ||
1042 | if (utime && (op == FUTEX_WAIT)) { | 1811 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { |
1043 | if (copy_from_user(&t, utime, sizeof(t)) != 0) | 1812 | if (copy_from_user(&t, utime, sizeof(t)) != 0) |
1044 | return -EFAULT; | 1813 | return -EFAULT; |
1045 | if (!timespec_valid(&t)) | 1814 | if (!timespec_valid(&t)) |
1046 | return -EINVAL; | 1815 | return -EINVAL; |
1047 | timeout = timespec_to_jiffies(&t) + 1; | 1816 | if (op == FUTEX_WAIT) |
1817 | timeout = timespec_to_jiffies(&t) + 1; | ||
1818 | else { | ||
1819 | timeout = t.tv_sec; | ||
1820 | val2 = t.tv_nsec; | ||
1821 | } | ||
1048 | } | 1822 | } |
1049 | /* | 1823 | /* |
1050 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. | 1824 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. |
1051 | */ | 1825 | */ |
1052 | if (op >= FUTEX_REQUEUE) | 1826 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) |
1053 | val2 = (int) (unsigned long) utime; | 1827 | val2 = (u32) (unsigned long) utime; |
1054 | 1828 | ||
1055 | return do_futex((unsigned long)uaddr, op, val, timeout, | 1829 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); |
1056 | (unsigned long)uaddr2, val2, val3); | ||
1057 | } | 1830 | } |
1058 | 1831 | ||
1059 | static struct super_block * | 1832 | static int futexfs_get_sb(struct file_system_type *fs_type, |
1060 | futexfs_get_sb(struct file_system_type *fs_type, | 1833 | int flags, const char *dev_name, void *data, |
1061 | int flags, const char *dev_name, void *data) | 1834 | struct vfsmount *mnt) |
1062 | { | 1835 | { |
1063 | return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); | 1836 | return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); |
1064 | } | 1837 | } |
1065 | 1838 | ||
1066 | static struct file_system_type futex_fs_type = { | 1839 | static struct file_system_type futex_fs_type = { |
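The sys_futex() hunk above keeps the relative jiffies conversion for FUTEX_WAIT but, for FUTEX_LOCK_PI, forwards tv_sec in 'timeout' and tv_nsec in 'val2'. From user space the new ops are reached through the raw syscall; a hedged sketch, assuming the opcodes from `<linux/futex.h>` (glibc has no futex() wrapper, and the helper names are illustrative).

```c
#include <linux/futex.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

static long sys_futex_raw(uint32_t *uaddr, int op, uint32_t val,
			  const struct timespec *timeout)
{
	/* arg order matches sys_futex(): uaddr, op, val, utime, uaddr2, val3 */
	return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
}

static long pi_lock(uint32_t *futex, const struct timespec *deadline)
{
	/* 'deadline' is handed through as sec/nsec, not converted to jiffies */
	return sys_futex_raw(futex, FUTEX_LOCK_PI, 0, deadline);
}

static long pi_unlock(uint32_t *futex)
{
	return sys_futex_raw(futex, FUTEX_UNLOCK_PI, 0, NULL);
}
```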
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 1ab6a0ea3d14..d1d92b441fb7 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, | |||
129 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 129 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
130 | int val2 = 0; | 130 | int val2 = 0; |
131 | 131 | ||
132 | if (utime && (op == FUTEX_WAIT)) { | 132 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { |
133 | if (get_compat_timespec(&t, utime)) | 133 | if (get_compat_timespec(&t, utime)) |
134 | return -EFAULT; | 134 | return -EFAULT; |
135 | if (!timespec_valid(&t)) | 135 | if (!timespec_valid(&t)) |
136 | return -EINVAL; | 136 | return -EINVAL; |
137 | timeout = timespec_to_jiffies(&t) + 1; | 137 | if (op == FUTEX_WAIT) |
138 | timeout = timespec_to_jiffies(&t) + 1; | ||
139 | else { | ||
140 | timeout = t.tv_sec; | ||
141 | val2 = t.tv_nsec; | ||
142 | } | ||
138 | } | 143 | } |
139 | if (op >= FUTEX_REQUEUE) | 144 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) |
140 | val2 = (int) (unsigned long) utime; | 145 | val2 = (int) (unsigned long) utime; |
141 | 146 | ||
142 | return do_futex((unsigned long)uaddr, op, val, timeout, | 147 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); |
143 | (unsigned long)uaddr2, val2, val3); | ||
144 | } | 148 | } |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 01fa2ae98a85..8d3dc29ef41a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = | |||
98 | 98 | ||
99 | /** | 99 | /** |
100 | * ktime_get_ts - get the monotonic clock in timespec format | 100 | * ktime_get_ts - get the monotonic clock in timespec format |
101 | * | ||
102 | * @ts: pointer to timespec variable | 101 | * @ts: pointer to timespec variable |
103 | * | 102 | * |
104 | * The function calculates the monotonic clock from the realtime | 103 | * The function calculates the monotonic clock from the realtime |
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
238 | # ifndef CONFIG_KTIME_SCALAR | 237 | # ifndef CONFIG_KTIME_SCALAR |
239 | /** | 238 | /** |
240 | * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable | 239 | * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable |
241 | * | ||
242 | * @kt: addend | 240 | * @kt: addend |
243 | * @nsec: the scalar nsec value to add | 241 | * @nsec: the scalar nsec value to add |
244 | * | 242 | * |
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | |||
299 | 297 | ||
300 | /** | 298 | /** |
301 | * hrtimer_forward - forward the timer expiry | 299 | * hrtimer_forward - forward the timer expiry |
302 | * | ||
303 | * @timer: hrtimer to forward | 300 | * @timer: hrtimer to forward |
304 | * @now: forward past this time | 301 | * @now: forward past this time |
305 | * @interval: the interval to forward | 302 | * @interval: the interval to forward |
@@ -393,7 +390,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
393 | if (base->first == &timer->node) | 390 | if (base->first == &timer->node) |
394 | base->first = rb_next(&timer->node); | 391 | base->first = rb_next(&timer->node); |
395 | rb_erase(&timer->node, &base->active); | 392 | rb_erase(&timer->node, &base->active); |
396 | timer->node.rb_parent = HRTIMER_INACTIVE; | 393 | rb_set_parent(&timer->node, &timer->node); |
397 | } | 394 | } |
398 | 395 | ||
399 | /* | 396 | /* |
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | |||
411 | 408 | ||
412 | /** | 409 | /** |
413 | * hrtimer_start - (re)start an relative timer on the current CPU | 410 | * hrtimer_start - (re)start an relative timer on the current CPU |
414 | * | ||
415 | * @timer: the timer to be added | 411 | * @timer: the timer to be added |
416 | * @tim: expiry time | 412 | * @tim: expiry time |
417 | * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) | 413 | * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) |
@@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); | |||
460 | 456 | ||
461 | /** | 457 | /** |
462 | * hrtimer_try_to_cancel - try to deactivate a timer | 458 | * hrtimer_try_to_cancel - try to deactivate a timer |
463 | * | ||
464 | * @timer: hrtimer to stop | 459 | * @timer: hrtimer to stop |
465 | * | 460 | * |
466 | * Returns: | 461 | * Returns: |
467 | * 0 when the timer was not active | 462 | * 0 when the timer was not active |
468 | * 1 when the timer was active | 463 | * 1 when the timer was active |
469 | * -1 when the timer is currently executing the callback function and | 464 | * -1 when the timer is currently executing the callback function and |
470 | * can not be stopped | 465 | * cannot be stopped |
471 | */ | 466 | */ |
472 | int hrtimer_try_to_cancel(struct hrtimer *timer) | 467 | int hrtimer_try_to_cancel(struct hrtimer *timer) |
473 | { | 468 | { |
@@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); | |||
489 | 484 | ||
490 | /** | 485 | /** |
491 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. | 486 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. |
492 | * | ||
493 | * @timer: the timer to be cancelled | 487 | * @timer: the timer to be cancelled |
494 | * | 488 | * |
495 | * Returns: | 489 | * Returns: |
@@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); | |||
510 | 504 | ||
511 | /** | 505 | /** |
512 | * hrtimer_get_remaining - get remaining time for the timer | 506 | * hrtimer_get_remaining - get remaining time for the timer |
513 | * | ||
514 | * @timer: the timer to read | 507 | * @timer: the timer to read |
515 | */ | 508 | */ |
516 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | 509 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) |
@@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void) | |||
564 | 557 | ||
565 | /** | 558 | /** |
566 | * hrtimer_init - initialize a timer to the given clock | 559 | * hrtimer_init - initialize a timer to the given clock |
567 | * | ||
568 | * @timer: the timer to be initialized | 560 | * @timer: the timer to be initialized |
569 | * @clock_id: the clock to be used | 561 | * @clock_id: the clock to be used |
570 | * @mode: timer mode abs/rel | 562 | * @mode: timer mode abs/rel |
@@ -576,19 +568,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
576 | 568 | ||
577 | memset(timer, 0, sizeof(struct hrtimer)); | 569 | memset(timer, 0, sizeof(struct hrtimer)); |
578 | 570 | ||
579 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | 571 | bases = __raw_get_cpu_var(hrtimer_bases); |
580 | 572 | ||
581 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) | 573 | if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) |
582 | clock_id = CLOCK_MONOTONIC; | 574 | clock_id = CLOCK_MONOTONIC; |
583 | 575 | ||
584 | timer->base = &bases[clock_id]; | 576 | timer->base = &bases[clock_id]; |
585 | timer->node.rb_parent = HRTIMER_INACTIVE; | 577 | rb_set_parent(&timer->node, &timer->node); |
586 | } | 578 | } |
587 | EXPORT_SYMBOL_GPL(hrtimer_init); | 579 | EXPORT_SYMBOL_GPL(hrtimer_init); |
588 | 580 | ||
589 | /** | 581 | /** |
590 | * hrtimer_get_res - get the timer resolution for a clock | 582 | * hrtimer_get_res - get the timer resolution for a clock |
591 | * | ||
592 | * @which_clock: which clock to query | 583 | * @which_clock: which clock to query |
593 | * @tp: pointer to timespec variable to store the resolution | 584 | * @tp: pointer to timespec variable to store the resolution |
594 | * | 585 | * |
@@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
599 | { | 590 | { |
600 | struct hrtimer_base *bases; | 591 | struct hrtimer_base *bases; |
601 | 592 | ||
602 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | 593 | bases = __raw_get_cpu_var(hrtimer_bases); |
603 | *tp = ktime_to_timespec(bases[which_clock].resolution); | 594 | *tp = ktime_to_timespec(bases[which_clock].resolution); |
604 | 595 | ||
605 | return 0; | 596 | return 0; |
@@ -842,7 +833,7 @@ static void migrate_hrtimers(int cpu) | |||
842 | } | 833 | } |
843 | #endif /* CONFIG_HOTPLUG_CPU */ | 834 | #endif /* CONFIG_HOTPLUG_CPU */ |
844 | 835 | ||
845 | static int hrtimer_cpu_notify(struct notifier_block *self, | 836 | static int __devinit hrtimer_cpu_notify(struct notifier_block *self, |
846 | unsigned long action, void *hcpu) | 837 | unsigned long action, void *hcpu) |
847 | { | 838 | { |
848 | long cpu = (long)hcpu; | 839 | long cpu = (long)hcpu; |
@@ -866,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self, | |||
866 | return NOTIFY_OK; | 857 | return NOTIFY_OK; |
867 | } | 858 | } |
868 | 859 | ||
869 | static struct notifier_block hrtimers_nb = { | 860 | static struct notifier_block __devinitdata hrtimers_nb = { |
870 | .notifier_call = hrtimer_cpu_notify, | 861 | .notifier_call = hrtimer_cpu_notify, |
871 | }; | 862 | }; |
872 | 863 | ||
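The hrtimer hunks above are mostly kernel-doc and per-CPU accessor cleanups; for orientation, a sketch of the interface they document, using the HRTIMER_REL mode named in hrtimer_start()'s kernel-doc. The my_* names are invented, and the int-returning callback is an assumption about this kernel generation's signature.

```c
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer my_timer;

static int my_timer_fn(struct hrtimer *timer)
{
	/* fires once, 500 ms after my_timer_setup() */
	return HRTIMER_NORESTART;
}

static void my_timer_setup(void)
{
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_REL);
	my_timer.function = my_timer_fn;
	hrtimer_start(&my_timer, ktime_set(0, 500 * 1000 * 1000), HRTIMER_REL);
}

static void my_timer_teardown(void)
{
	/* hrtimer_cancel() waits for a running callback;
	 * hrtimer_try_to_cancel() would return -1 in that case instead */
	hrtimer_cancel(&my_timer);
}
```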
diff --git a/kernel/intermodule.c b/kernel/intermodule.c deleted file mode 100644 index 55b1e5b85db9..000000000000 --- a/kernel/intermodule.c +++ /dev/null | |||
@@ -1,184 +0,0 @@ | |||
1 | /* Deprecated, do not use. Moved from module.c to here. --RR */ | ||
2 | |||
3 | /* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */ | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/kmod.h> | ||
6 | #include <linux/spinlock.h> | ||
7 | #include <linux/list.h> | ||
8 | #include <linux/slab.h> | ||
9 | |||
10 | /* inter_module functions are always available, even when the kernel is | ||
11 | * compiled without modules. Consumers of inter_module_xxx routines | ||
12 | * will always work, even when both are built into the kernel, this | ||
13 | * approach removes lots of #ifdefs in mainline code. | ||
14 | */ | ||
15 | |||
16 | static struct list_head ime_list = LIST_HEAD_INIT(ime_list); | ||
17 | static DEFINE_SPINLOCK(ime_lock); | ||
18 | static int kmalloc_failed; | ||
19 | |||
20 | struct inter_module_entry { | ||
21 | struct list_head list; | ||
22 | const char *im_name; | ||
23 | struct module *owner; | ||
24 | const void *userdata; | ||
25 | }; | ||
26 | |||
27 | /** | ||
28 | * inter_module_register - register a new set of inter module data. | ||
29 | * @im_name: an arbitrary string to identify the data, must be unique | ||
30 | * @owner: module that is registering the data, always use THIS_MODULE | ||
31 | * @userdata: pointer to arbitrary userdata to be registered | ||
32 | * | ||
33 | * Description: Check that the im_name has not already been registered, | ||
34 | * complain if it has. For new data, add it to the inter_module_entry | ||
35 | * list. | ||
36 | */ | ||
37 | void inter_module_register(const char *im_name, struct module *owner, const void *userdata) | ||
38 | { | ||
39 | struct list_head *tmp; | ||
40 | struct inter_module_entry *ime, *ime_new; | ||
41 | |||
42 | if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { | ||
43 | /* Overloaded kernel, not fatal */ | ||
44 | printk(KERN_ERR | ||
45 | "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", | ||
46 | im_name); | ||
47 | kmalloc_failed = 1; | ||
48 | return; | ||
49 | } | ||
50 | ime_new->im_name = im_name; | ||
51 | ime_new->owner = owner; | ||
52 | ime_new->userdata = userdata; | ||
53 | |||
54 | spin_lock(&ime_lock); | ||
55 | list_for_each(tmp, &ime_list) { | ||
56 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
57 | if (strcmp(ime->im_name, im_name) == 0) { | ||
58 | spin_unlock(&ime_lock); | ||
59 | kfree(ime_new); | ||
60 | /* Program logic error, fatal */ | ||
61 | printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name); | ||
62 | BUG(); | ||
63 | } | ||
64 | } | ||
65 | list_add(&(ime_new->list), &ime_list); | ||
66 | spin_unlock(&ime_lock); | ||
67 | } | ||
68 | |||
69 | /** | ||
70 | * inter_module_unregister - unregister a set of inter module data. | ||
71 | * @im_name: an arbitrary string to identify the data, must be unique | ||
72 | * | ||
73 | * Description: Check that the im_name has been registered, complain if | ||
74 | * it has not. For existing data, remove it from the | ||
75 | * inter_module_entry list. | ||
76 | */ | ||
77 | void inter_module_unregister(const char *im_name) | ||
78 | { | ||
79 | struct list_head *tmp; | ||
80 | struct inter_module_entry *ime; | ||
81 | |||
82 | spin_lock(&ime_lock); | ||
83 | list_for_each(tmp, &ime_list) { | ||
84 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
85 | if (strcmp(ime->im_name, im_name) == 0) { | ||
86 | list_del(&(ime->list)); | ||
87 | spin_unlock(&ime_lock); | ||
88 | kfree(ime); | ||
89 | return; | ||
90 | } | ||
91 | } | ||
92 | spin_unlock(&ime_lock); | ||
93 | if (kmalloc_failed) { | ||
94 | printk(KERN_ERR | ||
95 | "inter_module_unregister: no entry for '%s', " | ||
96 | "probably caused by previous kmalloc failure\n", | ||
97 | im_name); | ||
98 | return; | ||
99 | } | ||
100 | else { | ||
101 | /* Program logic error, fatal */ | ||
102 | printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name); | ||
103 | BUG(); | ||
104 | } | ||
105 | } | ||
106 | |||
107 | /** | ||
108 | * inter_module_get - return arbitrary userdata from another module. | ||
109 | * @im_name: an arbitrary string to identify the data, must be unique | ||
110 | * | ||
111 | * Description: If the im_name has not been registered, return NULL. | ||
112 | * Try to increment the use count on the owning module, if that fails | ||
113 | * then return NULL. Otherwise return the userdata. | ||
114 | */ | ||
115 | static const void *inter_module_get(const char *im_name) | ||
116 | { | ||
117 | struct list_head *tmp; | ||
118 | struct inter_module_entry *ime; | ||
119 | const void *result = NULL; | ||
120 | |||
121 | spin_lock(&ime_lock); | ||
122 | list_for_each(tmp, &ime_list) { | ||
123 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
124 | if (strcmp(ime->im_name, im_name) == 0) { | ||
125 | if (try_module_get(ime->owner)) | ||
126 | result = ime->userdata; | ||
127 | break; | ||
128 | } | ||
129 | } | ||
130 | spin_unlock(&ime_lock); | ||
131 | return(result); | ||
132 | } | ||
133 | |||
134 | /** | ||
135 | * inter_module_get_request - im get with automatic request_module. | ||
136 | * @im_name: an arbitrary string to identify the data, must be unique | ||
137 | * @modname: module that is expected to register im_name | ||
138 | * | ||
139 | * Description: If inter_module_get fails, do request_module then retry. | ||
140 | */ | ||
141 | const void *inter_module_get_request(const char *im_name, const char *modname) | ||
142 | { | ||
143 | const void *result = inter_module_get(im_name); | ||
144 | if (!result) { | ||
145 | request_module("%s", modname); | ||
146 | result = inter_module_get(im_name); | ||
147 | } | ||
148 | return(result); | ||
149 | } | ||
150 | |||
151 | /** | ||
152 | * inter_module_put - release use of data from another module. | ||
153 | * @im_name: an arbitrary string to identify the data, must be unique | ||
154 | * | ||
155 | * Description: If the im_name has not been registered, complain, | ||
156 | * otherwise decrement the use count on the owning module. | ||
157 | */ | ||
158 | void inter_module_put(const char *im_name) | ||
159 | { | ||
160 | struct list_head *tmp; | ||
161 | struct inter_module_entry *ime; | ||
162 | |||
163 | spin_lock(&ime_lock); | ||
164 | list_for_each(tmp, &ime_list) { | ||
165 | ime = list_entry(tmp, struct inter_module_entry, list); | ||
166 | if (strcmp(ime->im_name, im_name) == 0) { | ||
167 | if (ime->owner) | ||
168 | module_put(ime->owner); | ||
169 | spin_unlock(&ime_lock); | ||
170 | return; | ||
171 | } | ||
172 | } | ||
173 | spin_unlock(&ime_lock); | ||
174 | printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name); | ||
175 | BUG(); | ||
176 | } | ||
177 | |||
178 | EXPORT_SYMBOL(inter_module_register); | ||
179 | EXPORT_SYMBOL(inter_module_unregister); | ||
180 | EXPORT_SYMBOL(inter_module_get_request); | ||
181 | EXPORT_SYMBOL(inter_module_put); | ||
182 | |||
183 | MODULE_LICENSE("GPL"); | ||
184 | |||
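For reference, the pattern the deleted interface supported: one module published a pointer under a string key, another looked it up and pinned the provider's refcount. The struct, key and module names below are invented, and the declarations are assumed to have lived in `<linux/module.h>` on kernels that still carried this code.

```c
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>

struct my_ops {
	int (*do_thing)(void);
};

/* provider side */
static struct my_ops my_driver_ops;

static int __init provider_init(void)
{
	inter_module_register("my_ops", THIS_MODULE, &my_driver_ops);
	return 0;
}

static void __exit provider_exit(void)
{
	inter_module_unregister("my_ops");
}

/* consumer side */
static int __init consumer_init(void)
{
	const struct my_ops *ops;

	/* loads "provider_module" on demand, then takes a module reference */
	ops = inter_module_get_request("my_ops", "provider_module");
	if (!ops)
		return -ENODEV;
	/* ... use ops->do_thing() ... */
	inter_module_put("my_ops");
	return 0;
}
```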
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 9f77f50d8143..1dab0ac3f797 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | 1 | ||
2 | obj-y := handle.o manage.o spurious.o | 2 | obj-y := handle.o manage.o spurious.o resend.o chip.o |
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
4 | obj-$(CONFIG_PROC_FS) += proc.o | 4 | obj-$(CONFIG_PROC_FS) += proc.o |
5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 3467097ca61a..533068cfb607 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
@@ -11,12 +11,14 @@ | |||
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | #include <linux/delay.h> | 12 | #include <linux/delay.h> |
13 | 13 | ||
14 | #include "internals.h" | ||
15 | |||
14 | /* | 16 | /* |
15 | * Autodetection depends on the fact that any interrupt that | 17 | * Autodetection depends on the fact that any interrupt that |
16 | * comes in on to an unassigned handler will get stuck with | 18 | * comes in on to an unassigned handler will get stuck with |
17 | * "IRQ_WAITING" cleared and the interrupt disabled. | 19 | * "IRQ_WAITING" cleared and the interrupt disabled. |
18 | */ | 20 | */ |
19 | static DECLARE_MUTEX(probe_sem); | 21 | static DEFINE_MUTEX(probing_active); |
20 | 22 | ||
21 | /** | 23 | /** |
22 | * probe_irq_on - begin an interrupt autodetect | 24 | * probe_irq_on - begin an interrupt autodetect |
@@ -27,11 +29,11 @@ static DECLARE_MUTEX(probe_sem); | |||
27 | */ | 29 | */ |
28 | unsigned long probe_irq_on(void) | 30 | unsigned long probe_irq_on(void) |
29 | { | 31 | { |
30 | unsigned long val; | 32 | struct irq_desc *desc; |
31 | irq_desc_t *desc; | 33 | unsigned long mask; |
32 | unsigned int i; | 34 | unsigned int i; |
33 | 35 | ||
34 | down(&probe_sem); | 36 | mutex_lock(&probing_active); |
35 | /* | 37 | /* |
36 | * something may have generated an irq long ago and we want to | 38 | * something may have generated an irq long ago and we want to |
37 | * flush such a longstanding irq before considering it as spurious. | 39 | * flush such a longstanding irq before considering it as spurious. |
@@ -40,8 +42,21 @@ unsigned long probe_irq_on(void) | |||
40 | desc = irq_desc + i; | 42 | desc = irq_desc + i; |
41 | 43 | ||
42 | spin_lock_irq(&desc->lock); | 44 | spin_lock_irq(&desc->lock); |
43 | if (!irq_desc[i].action) | 45 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { |
44 | irq_desc[i].handler->startup(i); | 46 | /* |
47 | * An old-style architecture might still have | ||
48 | * the handle_bad_irq handler there: | ||
49 | */ | ||
50 | compat_irq_chip_set_default_handler(desc); | ||
51 | |||
52 | /* | ||
53 | * Some chips need to know about probing in | ||
54 | * progress: | ||
55 | */ | ||
56 | if (desc->chip->set_type) | ||
57 | desc->chip->set_type(i, IRQ_TYPE_PROBE); | ||
58 | desc->chip->startup(i); | ||
59 | } | ||
45 | spin_unlock_irq(&desc->lock); | 60 | spin_unlock_irq(&desc->lock); |
46 | } | 61 | } |
47 | 62 | ||
@@ -57,9 +72,9 @@ unsigned long probe_irq_on(void) | |||
57 | desc = irq_desc + i; | 72 | desc = irq_desc + i; |
58 | 73 | ||
59 | spin_lock_irq(&desc->lock); | 74 | spin_lock_irq(&desc->lock); |
60 | if (!desc->action) { | 75 | if (!desc->action && !(desc->status & IRQ_NOPROBE)) { |
61 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; | 76 | desc->status |= IRQ_AUTODETECT | IRQ_WAITING; |
62 | if (desc->handler->startup(i)) | 77 | if (desc->chip->startup(i)) |
63 | desc->status |= IRQ_PENDING; | 78 | desc->status |= IRQ_PENDING; |
64 | } | 79 | } |
65 | spin_unlock_irq(&desc->lock); | 80 | spin_unlock_irq(&desc->lock); |
@@ -73,11 +88,11 @@ unsigned long probe_irq_on(void) | |||
73 | /* | 88 | /* |
74 | * Now filter out any obviously spurious interrupts | 89 | * Now filter out any obviously spurious interrupts |
75 | */ | 90 | */ |
76 | val = 0; | 91 | mask = 0; |
77 | for (i = 0; i < NR_IRQS; i++) { | 92 | for (i = 0; i < NR_IRQS; i++) { |
78 | irq_desc_t *desc = irq_desc + i; | ||
79 | unsigned int status; | 93 | unsigned int status; |
80 | 94 | ||
95 | desc = irq_desc + i; | ||
81 | spin_lock_irq(&desc->lock); | 96 | spin_lock_irq(&desc->lock); |
82 | status = desc->status; | 97 | status = desc->status; |
83 | 98 | ||
@@ -85,17 +100,16 @@ unsigned long probe_irq_on(void) | |||
85 | /* It triggered already - consider it spurious. */ | 100 | /* It triggered already - consider it spurious. */ |
86 | if (!(status & IRQ_WAITING)) { | 101 | if (!(status & IRQ_WAITING)) { |
87 | desc->status = status & ~IRQ_AUTODETECT; | 102 | desc->status = status & ~IRQ_AUTODETECT; |
88 | desc->handler->shutdown(i); | 103 | desc->chip->shutdown(i); |
89 | } else | 104 | } else |
90 | if (i < 32) | 105 | if (i < 32) |
91 | val |= 1 << i; | 106 | mask |= 1 << i; |
92 | } | 107 | } |
93 | spin_unlock_irq(&desc->lock); | 108 | spin_unlock_irq(&desc->lock); |
94 | } | 109 | } |
95 | 110 | ||
96 | return val; | 111 | return mask; |
97 | } | 112 | } |
98 | |||
99 | EXPORT_SYMBOL(probe_irq_on); | 113 | EXPORT_SYMBOL(probe_irq_on); |
100 | 114 | ||
101 | /** | 115 | /** |
@@ -117,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val) | |||
117 | 131 | ||
118 | mask = 0; | 132 | mask = 0; |
119 | for (i = 0; i < NR_IRQS; i++) { | 133 | for (i = 0; i < NR_IRQS; i++) { |
120 | irq_desc_t *desc = irq_desc + i; | 134 | struct irq_desc *desc = irq_desc + i; |
121 | unsigned int status; | 135 | unsigned int status; |
122 | 136 | ||
123 | spin_lock_irq(&desc->lock); | 137 | spin_lock_irq(&desc->lock); |
@@ -128,11 +142,11 @@ unsigned int probe_irq_mask(unsigned long val) | |||
128 | mask |= 1 << i; | 142 | mask |= 1 << i; |
129 | 143 | ||
130 | desc->status = status & ~IRQ_AUTODETECT; | 144 | desc->status = status & ~IRQ_AUTODETECT; |
131 | desc->handler->shutdown(i); | 145 | desc->chip->shutdown(i); |
132 | } | 146 | } |
133 | spin_unlock_irq(&desc->lock); | 147 | spin_unlock_irq(&desc->lock); |
134 | } | 148 | } |
135 | up(&probe_sem); | 149 | mutex_unlock(&probing_active); |
136 | 150 | ||
137 | return mask & val; | 151 | return mask & val; |
138 | } | 152 | } |
@@ -160,7 +174,7 @@ int probe_irq_off(unsigned long val) | |||
160 | int i, irq_found = 0, nr_irqs = 0; | 174 | int i, irq_found = 0, nr_irqs = 0; |
161 | 175 | ||
162 | for (i = 0; i < NR_IRQS; i++) { | 176 | for (i = 0; i < NR_IRQS; i++) { |
163 | irq_desc_t *desc = irq_desc + i; | 177 | struct irq_desc *desc = irq_desc + i; |
164 | unsigned int status; | 178 | unsigned int status; |
165 | 179 | ||
166 | spin_lock_irq(&desc->lock); | 180 | spin_lock_irq(&desc->lock); |
@@ -173,16 +187,16 @@ int probe_irq_off(unsigned long val) | |||
173 | nr_irqs++; | 187 | nr_irqs++; |
174 | } | 188 | } |
175 | desc->status = status & ~IRQ_AUTODETECT; | 189 | desc->status = status & ~IRQ_AUTODETECT; |
176 | desc->handler->shutdown(i); | 190 | desc->chip->shutdown(i); |
177 | } | 191 | } |
178 | spin_unlock_irq(&desc->lock); | 192 | spin_unlock_irq(&desc->lock); |
179 | } | 193 | } |
180 | up(&probe_sem); | 194 | mutex_unlock(&probing_active); |
181 | 195 | ||
182 | if (nr_irqs > 1) | 196 | if (nr_irqs > 1) |
183 | irq_found = -irq_found; | 197 | irq_found = -irq_found; |
198 | |||
184 | return irq_found; | 199 | return irq_found; |
185 | } | 200 | } |
186 | |||
187 | EXPORT_SYMBOL(probe_irq_off); | 201 | EXPORT_SYMBOL(probe_irq_off); |
188 | 202 | ||
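The probing interface converted to a mutex above is driven by legacy drivers roughly as follows; trigger_device_interrupt() is a hypothetical stand-in for whatever register poke makes the hardware raise its line.

```c
#include <linux/interrupt.h>
#include <linux/delay.h>

static void trigger_device_interrupt(void)
{
	/* hypothetical: program the device so it asserts its IRQ */
}

static int my_probe_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();		/* arm autodetection on unused lines */

	trigger_device_interrupt();
	mdelay(10);			/* give the interrupt time to arrive */

	irq = probe_irq_off(mask);	/* >0: found; 0: none; <0: several fired */
	return irq;
}
```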
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c new file mode 100644 index 000000000000..54105bdfe20d --- /dev/null +++ b/kernel/irq/chip.c | |||
@@ -0,0 +1,534 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/chip.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
5 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
6 | * | ||
7 | * This file contains the core interrupt handling code, for irq-chip | ||
8 | * based architectures. | ||
9 | * | ||
10 | * Detailed information is available in Documentation/DocBook/genericirq | ||
11 | */ | ||
12 | |||
13 | #include <linux/irq.h> | ||
14 | #include <linux/module.h> | ||
15 | #include <linux/interrupt.h> | ||
16 | #include <linux/kernel_stat.h> | ||
17 | |||
18 | #include "internals.h" | ||
19 | |||
20 | /** | ||
21 | * set_irq_chip - set the irq chip for an irq | ||
22 | * @irq: irq number | ||
23 | * @chip: pointer to irq chip description structure | ||
24 | */ | ||
25 | int set_irq_chip(unsigned int irq, struct irq_chip *chip) | ||
26 | { | ||
27 | struct irq_desc *desc; | ||
28 | unsigned long flags; | ||
29 | |||
30 | if (irq >= NR_IRQS) { | ||
31 | printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); | ||
32 | WARN_ON(1); | ||
33 | return -EINVAL; | ||
34 | } | ||
35 | |||
36 | if (!chip) | ||
37 | chip = &no_irq_chip; | ||
38 | |||
39 | desc = irq_desc + irq; | ||
40 | spin_lock_irqsave(&desc->lock, flags); | ||
41 | irq_chip_set_defaults(chip); | ||
42 | desc->chip = chip; | ||
43 | /* | ||
44 | * For compatibility only: | ||
45 | */ | ||
46 | desc->chip = chip; | ||
47 | spin_unlock_irqrestore(&desc->lock, flags); | ||
48 | |||
49 | return 0; | ||
50 | } | ||
51 | EXPORT_SYMBOL(set_irq_chip); | ||
52 | |||
53 | /** | ||
54 | * set_irq_type - set the irq type for an irq | ||
55 | * @irq: irq number | ||
56 | * @type: interrupt type - see include/linux/interrupt.h | ||
57 | */ | ||
58 | int set_irq_type(unsigned int irq, unsigned int type) | ||
59 | { | ||
60 | struct irq_desc *desc; | ||
61 | unsigned long flags; | ||
62 | int ret = -ENXIO; | ||
63 | |||
64 | if (irq >= NR_IRQS) { | ||
65 | printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); | ||
66 | return -ENODEV; | ||
67 | } | ||
68 | |||
69 | desc = irq_desc + irq; | ||
70 | if (desc->chip->set_type) { | ||
71 | spin_lock_irqsave(&desc->lock, flags); | ||
72 | ret = desc->chip->set_type(irq, type); | ||
73 | spin_unlock_irqrestore(&desc->lock, flags); | ||
74 | } | ||
75 | return ret; | ||
76 | } | ||
77 | EXPORT_SYMBOL(set_irq_type); | ||
78 | |||
79 | /** | ||
80 | * set_irq_data - set irq type data for an irq | ||
81 | * @irq: Interrupt number | ||
82 | * @data: Pointer to interrupt specific data | ||
83 | * | ||
84 | * Set the hardware irq controller data for an irq | ||
85 | */ | ||
86 | int set_irq_data(unsigned int irq, void *data) | ||
87 | { | ||
88 | struct irq_desc *desc; | ||
89 | unsigned long flags; | ||
90 | |||
91 | if (irq >= NR_IRQS) { | ||
92 | printk(KERN_ERR | ||
93 | "Trying to install controller data for IRQ%d\n", irq); | ||
94 | return -EINVAL; | ||
95 | } | ||
96 | |||
97 | desc = irq_desc + irq; | ||
98 | spin_lock_irqsave(&desc->lock, flags); | ||
99 | desc->handler_data = data; | ||
100 | spin_unlock_irqrestore(&desc->lock, flags); | ||
101 | return 0; | ||
102 | } | ||
103 | EXPORT_SYMBOL(set_irq_data); | ||
104 | |||
105 | /** | ||
106 | * set_irq_chip_data - set irq chip data for an irq | ||
107 | * @irq: Interrupt number | ||
108 | * @data: Pointer to chip specific data | ||
109 | * | ||
110 | * Set the hardware irq chip data for an irq | ||
111 | */ | ||
112 | int set_irq_chip_data(unsigned int irq, void *data) | ||
113 | { | ||
114 | struct irq_desc *desc = irq_desc + irq; | ||
115 | unsigned long flags; | ||
116 | |||
117 | if (irq >= NR_IRQS || !desc->chip) { | ||
118 | printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); | ||
119 | return -EINVAL; | ||
120 | } | ||
121 | |||
122 | spin_lock_irqsave(&desc->lock, flags); | ||
123 | desc->chip_data = data; | ||
124 | spin_unlock_irqrestore(&desc->lock, flags); | ||
125 | |||
126 | return 0; | ||
127 | } | ||
128 | EXPORT_SYMBOL(set_irq_chip_data); | ||
129 | |||
130 | /* | ||
131 | * default enable function | ||
132 | */ | ||
133 | static void default_enable(unsigned int irq) | ||
134 | { | ||
135 | struct irq_desc *desc = irq_desc + irq; | ||
136 | |||
137 | desc->chip->unmask(irq); | ||
138 | desc->status &= ~IRQ_MASKED; | ||
139 | } | ||
140 | |||
141 | /* | ||
142 | * default disable function | ||
143 | */ | ||
144 | static void default_disable(unsigned int irq) | ||
145 | { | ||
146 | struct irq_desc *desc = irq_desc + irq; | ||
147 | |||
148 | if (!(desc->status & IRQ_DELAYED_DISABLE)) | ||
149 | irq_desc[irq].chip->mask(irq); | ||
150 | } | ||
151 | |||
152 | /* | ||
153 | * default startup function | ||
154 | */ | ||
155 | static unsigned int default_startup(unsigned int irq) | ||
156 | { | ||
157 | irq_desc[irq].chip->enable(irq); | ||
158 | |||
159 | return 0; | ||
160 | } | ||
161 | |||
162 | /* | ||
163 | * Fixup enable/disable function pointers | ||
164 | */ | ||
165 | void irq_chip_set_defaults(struct irq_chip *chip) | ||
166 | { | ||
167 | if (!chip->enable) | ||
168 | chip->enable = default_enable; | ||
169 | if (!chip->disable) | ||
170 | chip->disable = default_disable; | ||
171 | if (!chip->startup) | ||
172 | chip->startup = default_startup; | ||
173 | if (!chip->shutdown) | ||
174 | chip->shutdown = chip->disable; | ||
175 | if (!chip->name) | ||
176 | chip->name = chip->typename; | ||
177 | } | ||
178 | |||
179 | static inline void mask_ack_irq(struct irq_desc *desc, int irq) | ||
180 | { | ||
181 | if (desc->chip->mask_ack) | ||
182 | desc->chip->mask_ack(irq); | ||
183 | else { | ||
184 | desc->chip->mask(irq); | ||
185 | desc->chip->ack(irq); | ||
186 | } | ||
187 | } | ||
188 | |||
189 | /** | ||
190 | * handle_simple_irq - Simple and software-decoded IRQs. | ||
191 | * @irq: the interrupt number | ||
192 | * @desc: the interrupt description structure for this irq | ||
193 | * @regs: pointer to a register structure | ||
194 | * | ||
195 | * Simple interrupts are either sent from a demultiplexing interrupt | ||
196 | * handler or come from hardware, where no interrupt hardware control | ||
197 | * is necessary. | ||
198 | * | ||
199 | * Note: The caller is expected to handle the ack, clear, mask and | ||
200 | * unmask issues if necessary. | ||
201 | */ | ||
202 | void fastcall | ||
203 | handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
204 | { | ||
205 | struct irqaction *action; | ||
206 | irqreturn_t action_ret; | ||
207 | const unsigned int cpu = smp_processor_id(); | ||
208 | |||
209 | spin_lock(&desc->lock); | ||
210 | |||
211 | if (unlikely(desc->status & IRQ_INPROGRESS)) | ||
212 | goto out_unlock; | ||
213 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
214 | kstat_cpu(cpu).irqs[irq]++; | ||
215 | |||
216 | action = desc->action; | ||
217 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | ||
218 | goto out_unlock; | ||
219 | |||
220 | desc->status |= IRQ_INPROGRESS; | ||
221 | spin_unlock(&desc->lock); | ||
222 | |||
223 | action_ret = handle_IRQ_event(irq, regs, action); | ||
224 | if (!noirqdebug) | ||
225 | note_interrupt(irq, desc, action_ret, regs); | ||
226 | |||
227 | spin_lock(&desc->lock); | ||
228 | desc->status &= ~IRQ_INPROGRESS; | ||
229 | out_unlock: | ||
230 | spin_unlock(&desc->lock); | ||
231 | } | ||
232 | |||
233 | /** | ||
234 | * handle_level_irq - Level type irq handler | ||
235 | * @irq: the interrupt number | ||
236 | * @desc: the interrupt description structure for this irq | ||
237 | * @regs: pointer to a register structure | ||
238 | * | ||
239 | * Level type interrupts are active as long as the hardware line has | ||
240 | * the active level. This may require masking the interrupt and unmasking | ||
241 | * it after the associated handler has acknowledged the device, so that | ||
242 | * the interrupt line goes back to inactive. | ||
243 | */ | ||
244 | void fastcall | ||
245 | handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
246 | { | ||
247 | unsigned int cpu = smp_processor_id(); | ||
248 | struct irqaction *action; | ||
249 | irqreturn_t action_ret; | ||
250 | |||
251 | spin_lock(&desc->lock); | ||
252 | mask_ack_irq(desc, irq); | ||
253 | |||
254 | if (unlikely(desc->status & IRQ_INPROGRESS)) | ||
255 | goto out; | ||
256 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
257 | kstat_cpu(cpu).irqs[irq]++; | ||
258 | |||
259 | /* | ||
260 | * If it's disabled or no action is available | ||
261 | * keep it masked and get out of here | ||
262 | */ | ||
263 | action = desc->action; | ||
264 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) | ||
265 | goto out; | ||
266 | |||
267 | desc->status |= IRQ_INPROGRESS; | ||
268 | spin_unlock(&desc->lock); | ||
269 | |||
270 | action_ret = handle_IRQ_event(irq, regs, action); | ||
271 | if (!noirqdebug) | ||
272 | note_interrupt(irq, desc, action_ret, regs); | ||
273 | |||
274 | spin_lock(&desc->lock); | ||
275 | desc->status &= ~IRQ_INPROGRESS; | ||
276 | out: | ||
277 | if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) | ||
278 | desc->chip->unmask(irq); | ||
279 | spin_unlock(&desc->lock); | ||
280 | } | ||
281 | |||
282 | /** | ||
283 | * handle_fasteoi_irq - irq handler for transparent controllers | ||
284 | * @irq: the interrupt number | ||
285 | * @desc: the interrupt description structure for this irq | ||
286 | * @regs: pointer to a register structure | ||
287 | * | ||
288 | * Only a single callback will be issued to the chip: an ->eoi() | ||
289 | * call when the interrupt has been serviced. This enables support | ||
290 | * for modern forms of interrupt handlers, which handle the flow | ||
291 | * details in hardware, transparently. | ||
292 | */ | ||
293 | void fastcall | ||
294 | handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc, | ||
295 | struct pt_regs *regs) | ||
296 | { | ||
297 | unsigned int cpu = smp_processor_id(); | ||
298 | struct irqaction *action; | ||
299 | irqreturn_t action_ret; | ||
300 | |||
301 | spin_lock(&desc->lock); | ||
302 | |||
303 | if (unlikely(desc->status & IRQ_INPROGRESS)) | ||
304 | goto out; | ||
305 | |||
306 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
307 | kstat_cpu(cpu).irqs[irq]++; | ||
308 | |||
309 | /* | ||
310 | * If it's disabled or no action is available | ||
311 | * keep it masked and get out of here | ||
312 | */ | ||
313 | action = desc->action; | ||
314 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | ||
315 | desc->status |= IRQ_PENDING; | ||
316 | goto out; | ||
317 | } | ||
318 | |||
319 | desc->status |= IRQ_INPROGRESS; | ||
320 | desc->status &= ~IRQ_PENDING; | ||
321 | spin_unlock(&desc->lock); | ||
322 | |||
323 | action_ret = handle_IRQ_event(irq, regs, action); | ||
324 | if (!noirqdebug) | ||
325 | note_interrupt(irq, desc, action_ret, regs); | ||
326 | |||
327 | spin_lock(&desc->lock); | ||
328 | desc->status &= ~IRQ_INPROGRESS; | ||
329 | out: | ||
330 | desc->chip->eoi(irq); | ||
331 | |||
332 | spin_unlock(&desc->lock); | ||
333 | } | ||
334 | |||
335 | /** | ||
336 | * handle_edge_irq - edge type IRQ handler | ||
337 | * @irq: the interrupt number | ||
338 | * @desc: the interrupt description structure for this irq | ||
339 | * @regs: pointer to a register structure | ||
340 | * | ||
341 | * The interrupt occurs on the falling and/or rising edge of a hardware | ||
342 | * signal. The occurrence is latched into the irq controller hardware | ||
343 | * and must be acked in order to be reenabled. After the ack another | ||
344 | * interrupt can happen on the same source even before the first one | ||
345 | * is handled by the associated event handler. If this happens it | ||
346 | * might be necessary to disable (mask) the interrupt, depending on the | ||
347 | * controller hardware. This requires reenabling the interrupt inside | ||
348 | * the loop which handles the interrupts that arrived while the | ||
349 | * handler was running. Once all pending interrupts are handled, the | ||
350 | * loop is left. | ||
351 | */ | ||
352 | void fastcall | ||
353 | handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
354 | { | ||
355 | const unsigned int cpu = smp_processor_id(); | ||
356 | |||
357 | spin_lock(&desc->lock); | ||
358 | |||
359 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
360 | |||
361 | /* | ||
362 | * If we're currently running this IRQ, or it's disabled, | ||
363 | * we shouldn't process the IRQ. Mark it pending, handle | ||
364 | * the necessary masking and go out | ||
365 | */ | ||
366 | if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || | ||
367 | !desc->action)) { | ||
368 | desc->status |= (IRQ_PENDING | IRQ_MASKED); | ||
369 | mask_ack_irq(desc, irq); | ||
370 | goto out_unlock; | ||
371 | } | ||
372 | |||
373 | kstat_cpu(cpu).irqs[irq]++; | ||
374 | |||
375 | /* Start handling the irq */ | ||
376 | desc->chip->ack(irq); | ||
377 | |||
378 | /* Mark the IRQ currently in progress.*/ | ||
379 | desc->status |= IRQ_INPROGRESS; | ||
380 | |||
381 | do { | ||
382 | struct irqaction *action = desc->action; | ||
383 | irqreturn_t action_ret; | ||
384 | |||
385 | if (unlikely(!action)) { | ||
386 | desc->chip->mask(irq); | ||
387 | goto out_unlock; | ||
388 | } | ||
389 | |||
390 | /* | ||
391 | * When another irq arrived while we were handling | ||
392 | * one, we could have masked the irq. | ||
393 | * Re-enable it, if it was not disabled in the meantime. | ||
394 | */ | ||
395 | if (unlikely((desc->status & | ||
396 | (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == | ||
397 | (IRQ_PENDING | IRQ_MASKED))) { | ||
398 | desc->chip->unmask(irq); | ||
399 | desc->status &= ~IRQ_MASKED; | ||
400 | } | ||
401 | |||
402 | desc->status &= ~IRQ_PENDING; | ||
403 | spin_unlock(&desc->lock); | ||
404 | action_ret = handle_IRQ_event(irq, regs, action); | ||
405 | if (!noirqdebug) | ||
406 | note_interrupt(irq, desc, action_ret, regs); | ||
407 | spin_lock(&desc->lock); | ||
408 | |||
409 | } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); | ||
410 | |||
411 | desc->status &= ~IRQ_INPROGRESS; | ||
412 | out_unlock: | ||
413 | spin_unlock(&desc->lock); | ||
414 | } | ||
415 | |||
416 | #ifdef CONFIG_SMP | ||
417 | /** | ||
418 | * handle_percpu_irq - Per CPU local irq handler | ||
419 | * @irq: the interrupt number | ||
420 | * @desc: the interrupt description structure for this irq | ||
421 | * @regs: pointer to a register structure | ||
422 | * | ||
423 | * Per CPU interrupts on SMP machines without locking requirements | ||
424 | */ | ||
425 | void fastcall | ||
426 | handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
427 | { | ||
428 | irqreturn_t action_ret; | ||
429 | |||
430 | kstat_this_cpu.irqs[irq]++; | ||
431 | |||
432 | if (desc->chip->ack) | ||
433 | desc->chip->ack(irq); | ||
434 | |||
435 | action_ret = handle_IRQ_event(irq, regs, desc->action); | ||
436 | if (!noirqdebug) | ||
437 | note_interrupt(irq, desc, action_ret, regs); | ||
438 | |||
439 | if (desc->chip->eoi) | ||
440 | desc->chip->eoi(irq); | ||
441 | } | ||
442 | |||
443 | #endif /* CONFIG_SMP */ | ||
444 | |||
445 | void | ||
446 | __set_irq_handler(unsigned int irq, | ||
447 | void fastcall (*handle)(unsigned int, irq_desc_t *, | ||
448 | struct pt_regs *), | ||
449 | int is_chained) | ||
450 | { | ||
451 | struct irq_desc *desc; | ||
452 | unsigned long flags; | ||
453 | |||
454 | if (irq >= NR_IRQS) { | ||
455 | printk(KERN_ERR | ||
456 | "Trying to install type control for IRQ%d\n", irq); | ||
457 | return; | ||
458 | } | ||
459 | |||
460 | desc = irq_desc + irq; | ||
461 | |||
462 | if (!handle) | ||
463 | handle = handle_bad_irq; | ||
464 | |||
465 | if (desc->chip == &no_irq_chip) { | ||
466 | printk(KERN_WARNING "Trying to install %sinterrupt handler " | ||
467 | "for IRQ%d\n", is_chained ? "chained " : " ", irq); | ||
468 | /* | ||
469 | * Some ARM implementations install a handler for really dumb | ||
470 | * interrupt hardware without setting an irq_chip. This worked | ||
471 | * with the ARM no_irq_chip but the check in setup_irq would | ||
472 | * prevent us to setup the interrupt at all. Switch it to | ||
473 | * dummy_irq_chip for easy transition. | ||
474 | */ | ||
475 | desc->chip = &dummy_irq_chip; | ||
476 | } | ||
477 | |||
478 | spin_lock_irqsave(&desc->lock, flags); | ||
479 | |||
480 | /* Uninstall? */ | ||
481 | if (handle == handle_bad_irq) { | ||
482 | if (desc->chip != &no_irq_chip) { | ||
483 | desc->chip->mask(irq); | ||
484 | desc->chip->ack(irq); | ||
485 | } | ||
486 | desc->status |= IRQ_DISABLED; | ||
487 | desc->depth = 1; | ||
488 | } | ||
489 | desc->handle_irq = handle; | ||
490 | |||
491 | if (handle != handle_bad_irq && is_chained) { | ||
492 | desc->status &= ~IRQ_DISABLED; | ||
493 | desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; | ||
494 | desc->depth = 0; | ||
495 | desc->chip->unmask(irq); | ||
496 | } | ||
497 | spin_unlock_irqrestore(&desc->lock, flags); | ||
498 | } | ||
499 | |||
500 | void | ||
501 | set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, | ||
502 | void fastcall (*handle)(unsigned int, | ||
503 | struct irq_desc *, | ||
504 | struct pt_regs *)) | ||
505 | { | ||
506 | set_irq_chip(irq, chip); | ||
507 | __set_irq_handler(irq, handle, 0); | ||
508 | } | ||
509 | |||
510 | /* | ||
511 | * Get a descriptive string for the highlevel handler, for | ||
512 | * /proc/interrupts output: | ||
513 | */ | ||
514 | const char * | ||
515 | handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *, | ||
516 | struct pt_regs *)) | ||
517 | { | ||
518 | if (handle == handle_level_irq) | ||
519 | return "level "; | ||
520 | if (handle == handle_fasteoi_irq) | ||
521 | return "fasteoi"; | ||
522 | if (handle == handle_edge_irq) | ||
523 | return "edge "; | ||
524 | if (handle == handle_simple_irq) | ||
525 | return "simple "; | ||
526 | #ifdef CONFIG_SMP | ||
527 | if (handle == handle_percpu_irq) | ||
528 | return "percpu "; | ||
529 | #endif | ||
530 | if (handle == handle_bad_irq) | ||
531 | return "bad "; | ||
532 | |||
533 | return NULL; | ||
534 | } | ||
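A sketch of how platform code plugs into the chip layer introduced above: supply mask/unmask/ack, let irq_chip_set_defaults() provide enable/disable/startup/shutdown, and pick a flow handler per interrupt type. The MYPIC chip and my_* accessors are invented for illustration.

```c
#include <linux/irq.h>
#include <linux/init.h>

static void my_mask(unsigned int irq)	{ /* write the controller's mask bit */ }
static void my_unmask(unsigned int irq)	{ /* clear the controller's mask bit */ }
static void my_ack(unsigned int irq)	{ /* clear the latched status bit */ }

static struct irq_chip my_irq_chip = {
	.name	= "MYPIC",
	.mask	= my_mask,
	.unmask	= my_unmask,
	.ack	= my_ack,
	/* enable/disable/startup/shutdown are filled in by the defaults */
};

static void __init my_init_irqs(void)
{
	int irq;

	for (irq = 0; irq < 16; irq++)
		/* level-triggered flow; edge sources would pass handle_edge_irq */
		set_irq_chip_and_handler(irq, &my_irq_chip, handle_level_irq);
}
```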
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 51df337b37db..aeb6e391276c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -1,9 +1,13 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/irq/handle.c | 2 | * linux/kernel/irq/handle.c |
3 | * | 3 | * |
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | 4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar |
5 | * Copyright (C) 2005-2006, Thomas Gleixner, Russell King | ||
5 | * | 6 | * |
6 | * This file contains the core interrupt handling code. | 7 | * This file contains the core interrupt handling code. |
8 | * | ||
9 | * Detailed information is available in Documentation/DocBook/genericirq | ||
10 | * | ||
7 | */ | 11 | */ |
8 | 12 | ||
9 | #include <linux/irq.h> | 13 | #include <linux/irq.h> |
@@ -14,11 +18,22 @@ | |||
14 | 18 | ||
15 | #include "internals.h" | 19 | #include "internals.h" |
16 | 20 | ||
21 | /** | ||
22 | * handle_bad_irq - handle spurious and unhandled irqs | ||
23 | */ | ||
24 | void fastcall | ||
25 | handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) | ||
26 | { | ||
27 | print_irq_desc(irq, desc); | ||
28 | kstat_this_cpu.irqs[irq]++; | ||
29 | ack_bad_irq(irq); | ||
30 | } | ||
31 | |||
17 | /* | 32 | /* |
18 | * Linux has a controller-independent interrupt architecture. | 33 | * Linux has a controller-independent interrupt architecture. |
19 | * Every controller has a 'controller-template', that is used | 34 | * Every controller has a 'controller-template', that is used |
20 | * by the main code to do the right thing. Each driver-visible | 35 | * by the main code to do the right thing. Each driver-visible |
21 | * interrupt source is transparently wired to the apropriate | 36 | * interrupt source is transparently wired to the appropriate |
22 | * controller. Thus drivers need not be aware of the | 37 | * controller. Thus drivers need not be aware of the |
23 | * interrupt-controller. | 38 | * interrupt-controller. |
24 | * | 39 | * |
@@ -28,41 +43,68 @@ | |||
28 | * | 43 | * |
29 | * Controller mappings for all interrupt sources: | 44 | * Controller mappings for all interrupt sources: |
30 | */ | 45 | */ |
31 | irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { | 46 | struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { |
32 | [0 ... NR_IRQS-1] = { | 47 | [0 ... NR_IRQS-1] = { |
33 | .status = IRQ_DISABLED, | 48 | .status = IRQ_DISABLED, |
34 | .handler = &no_irq_type, | 49 | .chip = &no_irq_chip, |
35 | .lock = SPIN_LOCK_UNLOCKED | 50 | .handle_irq = handle_bad_irq, |
51 | .depth = 1, | ||
52 | .lock = SPIN_LOCK_UNLOCKED, | ||
53 | #ifdef CONFIG_SMP | ||
54 | .affinity = CPU_MASK_ALL | ||
55 | #endif | ||
36 | } | 56 | } |
37 | }; | 57 | }; |
38 | 58 | ||
39 | /* | 59 | /* |
40 | * Generic 'no controller' code | 60 | * What should we do if we get a hw irq event on an illegal vector? |
41 | */ | 62 | * Each architecture has to answer this itself. | ||
41 | */ | 62 | */ |
42 | static void end_none(unsigned int irq) { } | 63 | static void ack_bad(unsigned int irq) |
43 | static void enable_none(unsigned int irq) { } | ||
44 | static void disable_none(unsigned int irq) { } | ||
45 | static void shutdown_none(unsigned int irq) { } | ||
46 | static unsigned int startup_none(unsigned int irq) { return 0; } | ||
47 | |||
48 | static void ack_none(unsigned int irq) | ||
49 | { | 64 | { |
50 | /* | 65 | print_irq_desc(irq, irq_desc + irq); |
51 | * 'what should we do if we get a hw irq event on an illegal vector'. | ||
52 | * each architecture has to answer this themself. | ||
53 | */ | ||
54 | ack_bad_irq(irq); | 66 | ack_bad_irq(irq); |
55 | } | 67 | } |
56 | 68 | ||
57 | struct hw_interrupt_type no_irq_type = { | 69 | /* |
58 | .typename = "none", | 70 | * NOP functions |
59 | .startup = startup_none, | 71 | */ |
60 | .shutdown = shutdown_none, | 72 | static void noop(unsigned int irq) |
61 | .enable = enable_none, | 73 | { |
62 | .disable = disable_none, | 74 | } |
63 | .ack = ack_none, | 75 | |
64 | .end = end_none, | 76 | static unsigned int noop_ret(unsigned int irq) |
65 | .set_affinity = NULL | 77 | { |
78 | return 0; | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * Generic no controller implementation | ||
83 | */ | ||
84 | struct irq_chip no_irq_chip = { | ||
85 | .name = "none", | ||
86 | .startup = noop_ret, | ||
87 | .shutdown = noop, | ||
88 | .enable = noop, | ||
89 | .disable = noop, | ||
90 | .ack = ack_bad, | ||
91 | .end = noop, | ||
92 | }; | ||
93 | |||
94 | /* | ||
95 | * Generic dummy implementation which can be used for | ||
96 | * real dumb interrupt sources | ||
97 | */ | ||
98 | struct irq_chip dummy_irq_chip = { | ||
99 | .name = "dummy", | ||
100 | .startup = noop_ret, | ||
101 | .shutdown = noop, | ||
102 | .enable = noop, | ||
103 | .disable = noop, | ||
104 | .ack = noop, | ||
105 | .mask = noop, | ||
106 | .unmask = noop, | ||
107 | .end = noop, | ||
66 | }; | 108 | }; |
67 | 109 | ||
68 | /* | 110 | /* |
@@ -73,15 +115,23 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) | |||
73 | return IRQ_NONE; | 115 | return IRQ_NONE; |
74 | } | 116 | } |
75 | 117 | ||
76 | /* | 118 | /** |
77 | * Have got an event to handle: | 119 | * handle_IRQ_event - irq action chain handler |
120 | * @irq: the interrupt number | ||
121 | * @regs: pointer to a register structure | ||
122 | * @action: the interrupt action chain for this irq | ||
123 | * | ||
124 | * Handles the action chain of an irq event | ||
78 | */ | 125 | */ |
79 | fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, | 126 | irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, |
80 | struct irqaction *action) | 127 | struct irqaction *action) |
81 | { | 128 | { |
82 | int ret, retval = 0, status = 0; | 129 | irqreturn_t ret, retval = IRQ_NONE; |
130 | unsigned int status = 0; | ||
131 | |||
132 | handle_dynamic_tick(action); | ||
83 | 133 | ||
84 | if (!(action->flags & SA_INTERRUPT)) | 134 | if (!(action->flags & IRQF_DISABLED)) |
85 | local_irq_enable(); | 135 | local_irq_enable(); |
86 | 136 | ||
87 | do { | 137 | do { |
@@ -92,22 +142,29 @@ fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, | |||
92 | action = action->next; | 142 | action = action->next; |
93 | } while (action); | 143 | } while (action); |
94 | 144 | ||
95 | if (status & SA_SAMPLE_RANDOM) | 145 | if (status & IRQF_SAMPLE_RANDOM) |
96 | add_interrupt_randomness(irq); | 146 | add_interrupt_randomness(irq); |
97 | local_irq_disable(); | 147 | local_irq_disable(); |
98 | 148 | ||
99 | return retval; | 149 | return retval; |
100 | } | 150 | } |
101 | 151 | ||
102 | /* | 152 | /** |
103 | * do_IRQ handles all normal device IRQ's (the special | 153 | * __do_IRQ - original all in one highlevel IRQ handler |
154 | * @irq: the interrupt number | ||
155 | * @regs: pointer to a register structure | ||
156 | * | ||
157 | * __do_IRQ handles all normal device IRQ's (the special | ||
104 | * SMP cross-CPU interrupts have their own specific | 158 | * SMP cross-CPU interrupts have their own specific |
105 | * handlers). | 159 | * handlers). |
160 | * | ||
161 | * This is the original x86 implementation which is used for every | ||
162 | * interrupt type. | ||
106 | */ | 163 | */ |
107 | fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | 164 | fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) |
108 | { | 165 | { |
109 | irq_desc_t *desc = irq_desc + irq; | 166 | struct irq_desc *desc = irq_desc + irq; |
110 | struct irqaction * action; | 167 | struct irqaction *action; |
111 | unsigned int status; | 168 | unsigned int status; |
112 | 169 | ||
113 | kstat_this_cpu.irqs[irq]++; | 170 | kstat_this_cpu.irqs[irq]++; |
@@ -117,16 +174,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | |||
117 | /* | 174 | /* |
118 | * No locking required for CPU-local interrupts: | 175 | * No locking required for CPU-local interrupts: |
119 | */ | 176 | */ |
120 | if (desc->handler->ack) | 177 | if (desc->chip->ack) |
121 | desc->handler->ack(irq); | 178 | desc->chip->ack(irq); |
122 | action_ret = handle_IRQ_event(irq, regs, desc->action); | 179 | action_ret = handle_IRQ_event(irq, regs, desc->action); |
123 | desc->handler->end(irq); | 180 | desc->chip->end(irq); |
124 | return 1; | 181 | return 1; |
125 | } | 182 | } |
126 | 183 | ||
127 | spin_lock(&desc->lock); | 184 | spin_lock(&desc->lock); |
128 | if (desc->handler->ack) | 185 | if (desc->chip->ack) |
129 | desc->handler->ack(irq); | 186 | desc->chip->ack(irq); |
130 | /* | 187 | /* |
131 | * REPLAY is when Linux resends an IRQ that was dropped earlier | 188 | * REPLAY is when Linux resends an IRQ that was dropped earlier |
132 | * WAITING is used by probe to mark irqs that are being tested | 189 | * WAITING is used by probe to mark irqs that are being tested |
@@ -186,7 +243,7 @@ out: | |||
186 | * The ->end() handler has to deal with interrupts which got | 243 | * The ->end() handler has to deal with interrupts which got |
187 | * disabled while the handler was running. | 244 | * disabled while the handler was running. |
188 | */ | 245 | */ |
189 | desc->handler->end(irq); | 246 | desc->chip->end(irq); |
190 | spin_unlock(&desc->lock); | 247 | spin_unlock(&desc->lock); |
191 | 248 | ||
192 | return 1; | 249 | return 1; |
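handle_IRQ_event() above walks the action chain installed by request_irq(); with the IRQF_* names that replace SA_INTERRUPT and SA_SAMPLE_RANDOM in this hunk, a driver-side registration looks roughly like the sketch below. The handler, IRQ number and device pointer are illustrative.

```c
#include <linux/interrupt.h>

static irqreturn_t my_device_isr(int irq, void *dev_id, struct pt_regs *regs)
{
	/* return IRQ_NONE when the device did not raise this interrupt,
	 * so note_interrupt() can spot spurious sources */
	return IRQ_HANDLED;
}

static int my_attach(void *my_dev)
{
	/* IRQF_DISABLED runs the handler with local irqs off;
	 * IRQF_SHARED allows several actions to chain on one line;
	 * IRQF_SAMPLE_RANDOM (see above) would feed the entropy pool */
	return request_irq(7, my_device_isr, IRQF_DISABLED | IRQF_SHARED,
			   "mydev", my_dev);
}
```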
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 46feba630266..08a849a22447 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -4,6 +4,12 @@ | |||
4 | 4 | ||
5 | extern int noirqdebug; | 5 | extern int noirqdebug; |
6 | 6 | ||
7 | /* Set default functions for irq_chip structures: */ | ||
8 | extern void irq_chip_set_defaults(struct irq_chip *chip); | ||
9 | |||
10 | /* Set default handler: */ | ||
11 | extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); | ||
12 | |||
7 | #ifdef CONFIG_PROC_FS | 13 | #ifdef CONFIG_PROC_FS |
8 | extern void register_irq_proc(unsigned int irq); | 14 | extern void register_irq_proc(unsigned int irq); |
9 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); | 15 | extern void register_handler_proc(unsigned int irq, struct irqaction *action); |
@@ -16,3 +22,43 @@ static inline void unregister_handler_proc(unsigned int irq, | |||
16 | struct irqaction *action) { } | 22 | struct irqaction *action) { } |
17 | #endif | 23 | #endif |
18 | 24 | ||
25 | /* | ||
26 | * Debugging printout: | ||
27 | */ | ||
28 | |||
29 | #include <linux/kallsyms.h> | ||
30 | |||
31 | #define P(f) if (desc->status & f) printk("%14s set\n", #f) | ||
32 | |||
33 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
34 | { | ||
35 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | ||
36 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | ||
37 | printk("->handle_irq(): %p, ", desc->handle_irq); | ||
38 | print_symbol("%s\n", (unsigned long)desc->handle_irq); | ||
39 | printk("->chip(): %p, ", desc->chip); | ||
40 | print_symbol("%s\n", (unsigned long)desc->chip); | ||
41 | printk("->action(): %p\n", desc->action); | ||
42 | if (desc->action) { | ||
43 | printk("->action->handler(): %p, ", desc->action->handler); | ||
44 | print_symbol("%s\n", (unsigned long)desc->action->handler); | ||
45 | } | ||
46 | |||
47 | P(IRQ_INPROGRESS); | ||
48 | P(IRQ_DISABLED); | ||
49 | P(IRQ_PENDING); | ||
50 | P(IRQ_REPLAY); | ||
51 | P(IRQ_AUTODETECT); | ||
52 | P(IRQ_WAITING); | ||
53 | P(IRQ_LEVEL); | ||
54 | P(IRQ_MASKED); | ||
55 | #ifdef CONFIG_IRQ_PER_CPU | ||
56 | P(IRQ_PER_CPU); | ||
57 | #endif | ||
58 | P(IRQ_NOPROBE); | ||
59 | P(IRQ_NOREQUEST); | ||
60 | P(IRQ_NOAUTOEN); | ||
61 | } | ||
62 | |||
63 | #undef P | ||
64 | |||
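The P() helper above relies on the preprocessor's stringizing operator (#f) so each status flag prints under its own name. A standalone user-space demonstration of the same trick; the flag values below are illustrative, not the kernel's:

#include <stdio.h>

#define IRQ_DISABLED 0x002              /* illustrative values only */
#define IRQ_PENDING  0x004
#define IRQ_WAITING  0x020

#define P(f) if (status & f) printf("%14s set\n", #f)

static void print_status(unsigned int status)
{
        P(IRQ_DISABLED);
        P(IRQ_PENDING);
        P(IRQ_WAITING);
}

#undef P

int main(void)
{
        print_status(IRQ_DISABLED | IRQ_WAITING);
        return 0;
}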
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1279e3499534..c911c6ec4dd6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -1,12 +1,12 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/irq/manage.c | 2 | * linux/kernel/irq/manage.c |
3 | * | 3 | * |
4 | * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar | 4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar |
5 | * Copyright (C) 2005-2006 Thomas Gleixner | ||
5 | * | 6 | * |
6 | * This file contains driver APIs to the irq subsystem. | 7 | * This file contains driver APIs to the irq subsystem. |
7 | */ | 8 | */ |
8 | 9 | ||
9 | #include <linux/config.h> | ||
10 | #include <linux/irq.h> | 10 | #include <linux/irq.h> |
11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
12 | #include <linux/random.h> | 12 | #include <linux/random.h> |
@@ -16,12 +16,6 @@ | |||
16 | 16 | ||
17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
18 | 18 | ||
19 | cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; | ||
20 | |||
21 | #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) | ||
22 | cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; | ||
23 | #endif | ||
24 | |||
25 | /** | 19 | /** |
26 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | 20 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) |
27 | * @irq: interrupt number to wait for | 21 | * @irq: interrupt number to wait for |
@@ -42,7 +36,6 @@ void synchronize_irq(unsigned int irq) | |||
42 | while (desc->status & IRQ_INPROGRESS) | 36 | while (desc->status & IRQ_INPROGRESS) |
43 | cpu_relax(); | 37 | cpu_relax(); |
44 | } | 38 | } |
45 | |||
46 | EXPORT_SYMBOL(synchronize_irq); | 39 | EXPORT_SYMBOL(synchronize_irq); |
47 | 40 | ||
48 | #endif | 41 | #endif |
@@ -60,7 +53,7 @@ EXPORT_SYMBOL(synchronize_irq); | |||
60 | */ | 53 | */ |
61 | void disable_irq_nosync(unsigned int irq) | 54 | void disable_irq_nosync(unsigned int irq) |
62 | { | 55 | { |
63 | irq_desc_t *desc = irq_desc + irq; | 56 | struct irq_desc *desc = irq_desc + irq; |
64 | unsigned long flags; | 57 | unsigned long flags; |
65 | 58 | ||
66 | if (irq >= NR_IRQS) | 59 | if (irq >= NR_IRQS) |
@@ -69,11 +62,10 @@ void disable_irq_nosync(unsigned int irq) | |||
69 | spin_lock_irqsave(&desc->lock, flags); | 62 | spin_lock_irqsave(&desc->lock, flags); |
70 | if (!desc->depth++) { | 63 | if (!desc->depth++) { |
71 | desc->status |= IRQ_DISABLED; | 64 | desc->status |= IRQ_DISABLED; |
72 | desc->handler->disable(irq); | 65 | desc->chip->disable(irq); |
73 | } | 66 | } |
74 | spin_unlock_irqrestore(&desc->lock, flags); | 67 | spin_unlock_irqrestore(&desc->lock, flags); |
75 | } | 68 | } |
76 | |||
77 | EXPORT_SYMBOL(disable_irq_nosync); | 69 | EXPORT_SYMBOL(disable_irq_nosync); |
78 | 70 | ||
79 | /** | 71 | /** |
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(disable_irq_nosync); | |||
90 | */ | 82 | */ |
91 | void disable_irq(unsigned int irq) | 83 | void disable_irq(unsigned int irq) |
92 | { | 84 | { |
93 | irq_desc_t *desc = irq_desc + irq; | 85 | struct irq_desc *desc = irq_desc + irq; |
94 | 86 | ||
95 | if (irq >= NR_IRQS) | 87 | if (irq >= NR_IRQS) |
96 | return; | 88 | return; |
@@ -99,7 +91,6 @@ void disable_irq(unsigned int irq) | |||
99 | if (desc->action) | 91 | if (desc->action) |
100 | synchronize_irq(irq); | 92 | synchronize_irq(irq); |
101 | } | 93 | } |
102 | |||
103 | EXPORT_SYMBOL(disable_irq); | 94 | EXPORT_SYMBOL(disable_irq); |
104 | 95 | ||
105 | /** | 96 | /** |
@@ -114,7 +105,7 @@ EXPORT_SYMBOL(disable_irq); | |||
114 | */ | 105 | */ |
115 | void enable_irq(unsigned int irq) | 106 | void enable_irq(unsigned int irq) |
116 | { | 107 | { |
117 | irq_desc_t *desc = irq_desc + irq; | 108 | struct irq_desc *desc = irq_desc + irq; |
118 | unsigned long flags; | 109 | unsigned long flags; |
119 | 110 | ||
120 | if (irq >= NR_IRQS) | 111 | if (irq >= NR_IRQS) |
@@ -123,17 +114,15 @@ void enable_irq(unsigned int irq) | |||
123 | spin_lock_irqsave(&desc->lock, flags); | 114 | spin_lock_irqsave(&desc->lock, flags); |
124 | switch (desc->depth) { | 115 | switch (desc->depth) { |
125 | case 0: | 116 | case 0: |
117 | printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); | ||
126 | WARN_ON(1); | 118 | WARN_ON(1); |
127 | break; | 119 | break; |
128 | case 1: { | 120 | case 1: { |
129 | unsigned int status = desc->status & ~IRQ_DISABLED; | 121 | unsigned int status = desc->status & ~IRQ_DISABLED; |
130 | 122 | ||
131 | desc->status = status; | 123 | /* Prevent probing on this irq: */ |
132 | if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | 124 | desc->status = status | IRQ_NOPROBE; |
133 | desc->status = status | IRQ_REPLAY; | 125 | check_irq_resend(desc, irq); |
134 | hw_resend_irq(desc->handler,irq); | ||
135 | } | ||
136 | desc->handler->enable(irq); | ||
137 | /* fall-through */ | 126 | /* fall-through */ |
138 | } | 127 | } |
139 | default: | 128 | default: |
@@ -141,9 +130,29 @@ void enable_irq(unsigned int irq) | |||
141 | } | 130 | } |
142 | spin_unlock_irqrestore(&desc->lock, flags); | 131 | spin_unlock_irqrestore(&desc->lock, flags); |
143 | } | 132 | } |
144 | |||
145 | EXPORT_SYMBOL(enable_irq); | 133 | EXPORT_SYMBOL(enable_irq); |
146 | 134 | ||
135 | /** | ||
136 | * set_irq_wake - control irq power management wakeup | ||
137 | * @irq: interrupt to control | ||
138 | * @on: enable/disable power management wakeup | ||
139 | * | ||
140 | * Enable/disable power management wakeup mode | ||
141 | */ | ||
142 | int set_irq_wake(unsigned int irq, unsigned int on) | ||
143 | { | ||
144 | struct irq_desc *desc = irq_desc + irq; | ||
145 | unsigned long flags; | ||
146 | int ret = -ENXIO; | ||
147 | |||
148 | spin_lock_irqsave(&desc->lock, flags); | ||
149 | if (desc->chip->set_wake) | ||
150 | ret = desc->chip->set_wake(irq, on); | ||
151 | spin_unlock_irqrestore(&desc->lock, flags); | ||
152 | return ret; | ||
153 | } | ||
154 | EXPORT_SYMBOL(set_irq_wake); | ||
155 | |||
147 | /* | 156 | /* |
148 | * Internal function that tells the architecture code whether a | 157 | * Internal function that tells the architecture code whether a |
149 | * particular irq has been exclusively allocated or is available | 158 | * particular irq has been exclusively allocated or is available |
@@ -153,22 +162,33 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) | |||
153 | { | 162 | { |
154 | struct irqaction *action; | 163 | struct irqaction *action; |
155 | 164 | ||
156 | if (irq >= NR_IRQS) | 165 | if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) |
157 | return 0; | 166 | return 0; |
158 | 167 | ||
159 | action = irq_desc[irq].action; | 168 | action = irq_desc[irq].action; |
160 | if (action) | 169 | if (action) |
161 | if (irqflags & action->flags & SA_SHIRQ) | 170 | if (irqflags & action->flags & IRQF_SHARED) |
162 | action = NULL; | 171 | action = NULL; |
163 | 172 | ||
164 | return !action; | 173 | return !action; |
165 | } | 174 | } |
166 | 175 | ||
176 | void compat_irq_chip_set_default_handler(struct irq_desc *desc) | ||
177 | { | ||
178 | /* | ||
179 | * If the architecture still has not overridden | ||
180 | * the flow handler then zap the default. This | ||
181 | * should catch incorrect flow-type setting. | ||
182 | */ | ||
183 | if (desc->handle_irq == &handle_bad_irq) | ||
184 | desc->handle_irq = NULL; | ||
185 | } | ||
186 | |||
167 | /* | 187 | /* |
168 | * Internal function to register an irqaction - typically used to | 188 | * Internal function to register an irqaction - typically used to |
169 | * allocate special interrupts that are part of the architecture. | 189 | * allocate special interrupts that are part of the architecture. |
170 | */ | 190 | */ |
171 | int setup_irq(unsigned int irq, struct irqaction * new) | 191 | int setup_irq(unsigned int irq, struct irqaction *new) |
172 | { | 192 | { |
173 | struct irq_desc *desc = irq_desc + irq; | 193 | struct irq_desc *desc = irq_desc + irq; |
174 | struct irqaction *old, **p; | 194 | struct irqaction *old, **p; |
@@ -178,14 +198,14 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
178 | if (irq >= NR_IRQS) | 198 | if (irq >= NR_IRQS) |
179 | return -EINVAL; | 199 | return -EINVAL; |
180 | 200 | ||
181 | if (desc->handler == &no_irq_type) | 201 | if (desc->chip == &no_irq_chip) |
182 | return -ENOSYS; | 202 | return -ENOSYS; |
183 | /* | 203 | /* |
184 | * Some drivers like serial.c use request_irq() heavily, | 204 | * Some drivers like serial.c use request_irq() heavily, |
185 | * so we have to be careful not to interfere with a | 205 | * so we have to be careful not to interfere with a |
186 | * running system. | 206 | * running system. |
187 | */ | 207 | */ |
188 | if (new->flags & SA_SAMPLE_RANDOM) { | 208 | if (new->flags & IRQF_SAMPLE_RANDOM) { |
189 | /* | 209 | /* |
190 | * This function might sleep, we want to call it first, | 210 | * This function might sleep, we want to call it first, |
191 | * outside of the atomic block. | 211 | * outside of the atomic block. |
@@ -200,16 +220,24 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
200 | /* | 220 | /* |
201 | * The following block of code has to be executed atomically | 221 | * The following block of code has to be executed atomically |
202 | */ | 222 | */ |
203 | spin_lock_irqsave(&desc->lock,flags); | 223 | spin_lock_irqsave(&desc->lock, flags); |
204 | p = &desc->action; | 224 | p = &desc->action; |
205 | if ((old = *p) != NULL) { | 225 | old = *p; |
206 | /* Can't share interrupts unless both agree to */ | 226 | if (old) { |
207 | if (!(old->flags & new->flags & SA_SHIRQ)) | 227 | /* |
228 | * Can't share interrupts unless both agree to and are | ||
229 | * the same type (level, edge, polarity). So both flag | ||
230 | * fields must have IRQF_SHARED set and the bits which | ||
231 | * set the trigger type must match. | ||
232 | */ | ||
233 | if (!((old->flags & new->flags) & IRQF_SHARED) || | ||
234 | ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) | ||
208 | goto mismatch; | 235 | goto mismatch; |
209 | 236 | ||
210 | #if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) | 237 | #if defined(CONFIG_IRQ_PER_CPU) |
211 | /* All handlers must agree on per-cpuness */ | 238 | /* All handlers must agree on per-cpuness */ |
212 | if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) | 239 | if ((old->flags & IRQF_PERCPU) != |
240 | (new->flags & IRQF_PERCPU)) | ||
213 | goto mismatch; | 241 | goto mismatch; |
214 | #endif | 242 | #endif |
215 | 243 | ||
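The new sharing test rejects a second handler unless both requests set IRQF_SHARED and their trigger bits agree; the XOR against IRQF_TRIGGER_MASK is what catches an edge/level or polarity mismatch. A small user-space sketch of the same logic, using stand-in flag values rather than the kernel's:

#include <stdio.h>

/* illustrative stand-ins for the IRQF_* flag values */
#define IRQF_TRIGGER_RISING  0x01
#define IRQF_TRIGGER_FALLING 0x02
#define IRQF_TRIGGER_MASK    0x0f
#define IRQF_SHARED          0x80

static int can_share(unsigned long old, unsigned long new)
{
        if (!((old & new) & IRQF_SHARED))       /* both must ask for sharing */
                return 0;
        if ((old ^ new) & IRQF_TRIGGER_MASK)    /* trigger types must match  */
                return 0;
        return 1;
}

int main(void)
{
        printf("%d\n", can_share(IRQF_SHARED | IRQF_TRIGGER_RISING,
                                 IRQF_SHARED | IRQF_TRIGGER_RISING));   /* 1 */
        printf("%d\n", can_share(IRQF_SHARED | IRQF_TRIGGER_RISING,
                                 IRQF_SHARED | IRQF_TRIGGER_FALLING));  /* 0 */
        return 0;
}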
@@ -222,20 +250,45 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
222 | } | 250 | } |
223 | 251 | ||
224 | *p = new; | 252 | *p = new; |
225 | #if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) | 253 | #if defined(CONFIG_IRQ_PER_CPU) |
226 | if (new->flags & SA_PERCPU_IRQ) | 254 | if (new->flags & IRQF_PERCPU) |
227 | desc->status |= IRQ_PER_CPU; | 255 | desc->status |= IRQ_PER_CPU; |
228 | #endif | 256 | #endif |
229 | if (!shared) { | 257 | if (!shared) { |
230 | desc->depth = 0; | 258 | irq_chip_set_defaults(desc->chip); |
231 | desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | | 259 | |
232 | IRQ_WAITING | IRQ_INPROGRESS); | 260 | /* Setup the type (level, edge polarity) if configured: */ |
233 | if (desc->handler->startup) | 261 | if (new->flags & IRQF_TRIGGER_MASK) { |
234 | desc->handler->startup(irq); | 262 | if (desc->chip && desc->chip->set_type) |
235 | else | 263 | desc->chip->set_type(irq, |
236 | desc->handler->enable(irq); | 264 | new->flags & IRQF_TRIGGER_MASK); |
265 | else | ||
266 | /* | ||
267 | * IRQF_TRIGGER_* but the PIC does not support | ||
268 | * multiple flow-types? | ||
269 | */ | ||
270 | printk(KERN_WARNING "No IRQF_TRIGGER set_type " | ||
271 | "function for IRQ %d (%s)\n", irq, | ||
272 | desc->chip ? desc->chip->name : | ||
273 | "unknown"); | ||
274 | } else | ||
275 | compat_irq_chip_set_default_handler(desc); | ||
276 | |||
277 | desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | | ||
278 | IRQ_INPROGRESS); | ||
279 | |||
280 | if (!(desc->status & IRQ_NOAUTOEN)) { | ||
281 | desc->depth = 0; | ||
282 | desc->status &= ~IRQ_DISABLED; | ||
283 | if (desc->chip->startup) | ||
284 | desc->chip->startup(irq); | ||
285 | else | ||
286 | desc->chip->enable(irq); | ||
287 | } else | ||
288 | /* Undo nested disables: */ | ||
289 | desc->depth = 1; | ||
237 | } | 290 | } |
238 | spin_unlock_irqrestore(&desc->lock,flags); | 291 | spin_unlock_irqrestore(&desc->lock, flags); |
239 | 292 | ||
240 | new->irq = irq; | 293 | new->irq = irq; |
241 | register_irq_proc(irq); | 294 | register_irq_proc(irq); |
@@ -246,8 +299,8 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
246 | 299 | ||
247 | mismatch: | 300 | mismatch: |
248 | spin_unlock_irqrestore(&desc->lock, flags); | 301 | spin_unlock_irqrestore(&desc->lock, flags); |
249 | if (!(new->flags & SA_PROBEIRQ)) { | 302 | if (!(new->flags & IRQF_PROBE_SHARED)) { |
250 | printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); | 303 | printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); |
251 | dump_stack(); | 304 | dump_stack(); |
252 | } | 305 | } |
253 | return -EBUSY; | 306 | return -EBUSY; |
@@ -278,10 +331,10 @@ void free_irq(unsigned int irq, void *dev_id) | |||
278 | return; | 331 | return; |
279 | 332 | ||
280 | desc = irq_desc + irq; | 333 | desc = irq_desc + irq; |
281 | spin_lock_irqsave(&desc->lock,flags); | 334 | spin_lock_irqsave(&desc->lock, flags); |
282 | p = &desc->action; | 335 | p = &desc->action; |
283 | for (;;) { | 336 | for (;;) { |
284 | struct irqaction * action = *p; | 337 | struct irqaction *action = *p; |
285 | 338 | ||
286 | if (action) { | 339 | if (action) { |
287 | struct irqaction **pp = p; | 340 | struct irqaction **pp = p; |
@@ -295,18 +348,18 @@ void free_irq(unsigned int irq, void *dev_id) | |||
295 | 348 | ||
296 | /* Currently used only by UML, might disappear one day.*/ | 349 | /* Currently used only by UML, might disappear one day.*/ |
297 | #ifdef CONFIG_IRQ_RELEASE_METHOD | 350 | #ifdef CONFIG_IRQ_RELEASE_METHOD |
298 | if (desc->handler->release) | 351 | if (desc->chip->release) |
299 | desc->handler->release(irq, dev_id); | 352 | desc->chip->release(irq, dev_id); |
300 | #endif | 353 | #endif |
301 | 354 | ||
302 | if (!desc->action) { | 355 | if (!desc->action) { |
303 | desc->status |= IRQ_DISABLED; | 356 | desc->status |= IRQ_DISABLED; |
304 | if (desc->handler->shutdown) | 357 | if (desc->chip->shutdown) |
305 | desc->handler->shutdown(irq); | 358 | desc->chip->shutdown(irq); |
306 | else | 359 | else |
307 | desc->handler->disable(irq); | 360 | desc->chip->disable(irq); |
308 | } | 361 | } |
309 | spin_unlock_irqrestore(&desc->lock,flags); | 362 | spin_unlock_irqrestore(&desc->lock, flags); |
310 | unregister_handler_proc(irq, action); | 363 | unregister_handler_proc(irq, action); |
311 | 364 | ||
312 | /* Make sure it's not being used on another CPU */ | 365 | /* Make sure it's not being used on another CPU */ |
@@ -314,12 +367,11 @@ void free_irq(unsigned int irq, void *dev_id) | |||
314 | kfree(action); | 367 | kfree(action); |
315 | return; | 368 | return; |
316 | } | 369 | } |
317 | printk(KERN_ERR "Trying to free free IRQ%d\n",irq); | 370 | printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); |
318 | spin_unlock_irqrestore(&desc->lock,flags); | 371 | spin_unlock_irqrestore(&desc->lock, flags); |
319 | return; | 372 | return; |
320 | } | 373 | } |
321 | } | 374 | } |
322 | |||
323 | EXPORT_SYMBOL(free_irq); | 375 | EXPORT_SYMBOL(free_irq); |
324 | 376 | ||
325 | /** | 377 | /** |
@@ -346,16 +398,16 @@ EXPORT_SYMBOL(free_irq); | |||
346 | * | 398 | * |
347 | * Flags: | 399 | * Flags: |
348 | * | 400 | * |
349 | * SA_SHIRQ Interrupt is shared | 401 | * IRQF_SHARED Interrupt is shared |
350 | * SA_INTERRUPT Disable local interrupts while processing | 402 | * IRQF_DISABLED Disable local interrupts while processing |
351 | * SA_SAMPLE_RANDOM The interrupt can be used for entropy | 403 | * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy |
352 | * | 404 | * |
353 | */ | 405 | */ |
354 | int request_irq(unsigned int irq, | 406 | int request_irq(unsigned int irq, |
355 | irqreturn_t (*handler)(int, void *, struct pt_regs *), | 407 | irqreturn_t (*handler)(int, void *, struct pt_regs *), |
356 | unsigned long irqflags, const char * devname, void *dev_id) | 408 | unsigned long irqflags, const char *devname, void *dev_id) |
357 | { | 409 | { |
358 | struct irqaction * action; | 410 | struct irqaction *action; |
359 | int retval; | 411 | int retval; |
360 | 412 | ||
361 | /* | 413 | /* |
@@ -364,10 +416,12 @@ int request_irq(unsigned int irq, | |||
364 | * which interrupt is which (messes up the interrupt freeing | 416 | * which interrupt is which (messes up the interrupt freeing |
365 | * logic etc). | 417 | * logic etc). |
366 | */ | 418 | */ |
367 | if ((irqflags & SA_SHIRQ) && !dev_id) | 419 | if ((irqflags & IRQF_SHARED) && !dev_id) |
368 | return -EINVAL; | 420 | return -EINVAL; |
369 | if (irq >= NR_IRQS) | 421 | if (irq >= NR_IRQS) |
370 | return -EINVAL; | 422 | return -EINVAL; |
423 | if (irq_desc[irq].status & IRQ_NOREQUEST) | ||
424 | return -EINVAL; | ||
371 | if (!handler) | 425 | if (!handler) |
372 | return -EINVAL; | 426 | return -EINVAL; |
373 | 427 | ||
@@ -390,6 +444,5 @@ int request_irq(unsigned int irq, | |||
390 | 444 | ||
391 | return retval; | 445 | return retval; |
392 | } | 446 | } |
393 | |||
394 | EXPORT_SYMBOL(request_irq); | 447 | EXPORT_SYMBOL(request_irq); |
395 | 448 | ||
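Putting the new flag names together, a hedged driver-style sketch of requesting a shared line and opting in to wakeup; the device, its name and the "is it ours" check are hypothetical, and the handler keeps the 2.6.17-era struct pt_regs argument shown in the prototype above:

#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/irq.h>

struct foo_dev {
        int irq;                        /* hypothetical device context */
};

static irqreturn_t foo_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
        struct foo_dev *foo = dev_id;

        /* A real driver must check its own status register here and
         * return IRQ_NONE when the shared line was raised by someone else. */
        (void)foo;
        return IRQ_HANDLED;
}

static int foo_setup_irq(struct foo_dev *foo)
{
        int err;

        err = request_irq(foo->irq, foo_interrupt,
                          IRQF_SHARED | IRQF_SAMPLE_RANDOM, "foo", foo);
        if (err)
                return err;

        /* Optional: allow the device to wake the system. This returns
         * -ENXIO when the chip has no ->set_wake, so treat it as advisory. */
        if (set_irq_wake(foo->irq, 1))
                printk(KERN_INFO "foo: IRQ %d cannot wake the system\n", foo->irq);

        return 0;
}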
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 134f9f2e0e39..a57ebe9fa6f6 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c | |||
@@ -3,19 +3,19 @@ | |||
3 | 3 | ||
4 | void set_pending_irq(unsigned int irq, cpumask_t mask) | 4 | void set_pending_irq(unsigned int irq, cpumask_t mask) |
5 | { | 5 | { |
6 | irq_desc_t *desc = irq_desc + irq; | 6 | struct irq_desc *desc = irq_desc + irq; |
7 | unsigned long flags; | 7 | unsigned long flags; |
8 | 8 | ||
9 | spin_lock_irqsave(&desc->lock, flags); | 9 | spin_lock_irqsave(&desc->lock, flags); |
10 | desc->move_irq = 1; | 10 | desc->move_irq = 1; |
11 | pending_irq_cpumask[irq] = mask; | 11 | irq_desc[irq].pending_mask = mask; |
12 | spin_unlock_irqrestore(&desc->lock, flags); | 12 | spin_unlock_irqrestore(&desc->lock, flags); |
13 | } | 13 | } |
14 | 14 | ||
15 | void move_native_irq(int irq) | 15 | void move_native_irq(int irq) |
16 | { | 16 | { |
17 | struct irq_desc *desc = irq_desc + irq; | ||
17 | cpumask_t tmp; | 18 | cpumask_t tmp; |
18 | irq_desc_t *desc = irq_descp(irq); | ||
19 | 19 | ||
20 | if (likely(!desc->move_irq)) | 20 | if (likely(!desc->move_irq)) |
21 | return; | 21 | return; |
@@ -30,15 +30,15 @@ void move_native_irq(int irq) | |||
30 | 30 | ||
31 | desc->move_irq = 0; | 31 | desc->move_irq = 0; |
32 | 32 | ||
33 | if (likely(cpus_empty(pending_irq_cpumask[irq]))) | 33 | if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) |
34 | return; | 34 | return; |
35 | 35 | ||
36 | if (!desc->handler->set_affinity) | 36 | if (!desc->chip->set_affinity) |
37 | return; | 37 | return; |
38 | 38 | ||
39 | assert_spin_locked(&desc->lock); | 39 | assert_spin_locked(&desc->lock); |
40 | 40 | ||
41 | cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); | 41 | cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * If there was a valid mask to work with, please | 44 | * If there was a valid mask to work with, please |
@@ -49,14 +49,14 @@ void move_native_irq(int irq) | |||
49 | * cause some ioapics to mal-function. | 49 | * cause some ioapics to mal-function. |
50 | * Being paranoid i guess! | 50 | * Being paranoid i guess! |
51 | */ | 51 | */ |
52 | if (unlikely(!cpus_empty(tmp))) { | 52 | if (likely(!cpus_empty(tmp))) { |
53 | if (likely(!(desc->status & IRQ_DISABLED))) | 53 | if (likely(!(desc->status & IRQ_DISABLED))) |
54 | desc->handler->disable(irq); | 54 | desc->chip->disable(irq); |
55 | 55 | ||
56 | desc->handler->set_affinity(irq,tmp); | 56 | desc->chip->set_affinity(irq,tmp); |
57 | 57 | ||
58 | if (likely(!(desc->status & IRQ_DISABLED))) | 58 | if (likely(!(desc->status & IRQ_DISABLED))) |
59 | desc->handler->enable(irq); | 59 | desc->chip->enable(irq); |
60 | } | 60 | } |
61 | cpus_clear(pending_irq_cpumask[irq]); | 61 | cpus_clear(irq_desc[irq].pending_mask); |
62 | } | 62 | } |
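move_native_irq() defers the expensive retargeting until the interrupt is actually being handled, and only programs a mask that still intersects the online CPUs. The guard is just the cpumask helpers seen above; a minimal sketch, with the controller call left as a comment:

#include <linux/cpumask.h>

static void apply_pending_mask(unsigned int irq, cpumask_t pending)
{
        cpumask_t tmp;

        cpus_and(tmp, pending, cpu_online_map);
        if (cpus_empty(tmp))
                return;         /* nothing sensible left; keep the old target */

        /* a real caller would now hand 'tmp' to desc->chip->set_affinity(irq, tmp) */
}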
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d03b5eef8ce0..607c7809ad01 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -12,18 +12,15 @@ | |||
12 | 12 | ||
13 | #include "internals.h" | 13 | #include "internals.h" |
14 | 14 | ||
15 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | 15 | static struct proc_dir_entry *root_irq_dir; |
16 | 16 | ||
17 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
18 | 18 | ||
19 | /* | ||
20 | * The /proc/irq/<irq>/smp_affinity values: | ||
21 | */ | ||
22 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; | ||
23 | |||
24 | #ifdef CONFIG_GENERIC_PENDING_IRQ | 19 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
25 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 20 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
26 | { | 21 | { |
22 | set_balance_irq_affinity(irq, mask_val); | ||
23 | |||
27 | /* | 24 | /* |
28 | * Save these away for later use. Re-progam when the | 25 | * Save these away for later use. Re-progam when the |
29 | * interrupt is pending | 26 | * interrupt is pending |
@@ -33,15 +30,16 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | |||
33 | #else | 30 | #else |
34 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 31 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
35 | { | 32 | { |
36 | irq_affinity[irq] = mask_val; | 33 | set_balance_irq_affinity(irq, mask_val); |
37 | irq_desc[irq].handler->set_affinity(irq, mask_val); | 34 | irq_desc[irq].affinity = mask_val; |
35 | irq_desc[irq].chip->set_affinity(irq, mask_val); | ||
38 | } | 36 | } |
39 | #endif | 37 | #endif |
40 | 38 | ||
41 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 39 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
42 | int count, int *eof, void *data) | 40 | int count, int *eof, void *data) |
43 | { | 41 | { |
44 | int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); | 42 | int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); |
45 | 43 | ||
46 | if (count - len < 2) | 44 | if (count - len < 2) |
47 | return -EINVAL; | 45 | return -EINVAL; |
@@ -56,7 +54,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
56 | unsigned int irq = (int)(long)data, full_count = count, err; | 54 | unsigned int irq = (int)(long)data, full_count = count, err; |
57 | cpumask_t new_value, tmp; | 55 | cpumask_t new_value, tmp; |
58 | 56 | ||
59 | if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) | 57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) |
60 | return -EIO; | 58 | return -EIO; |
61 | 59 | ||
62 | err = cpumask_parse(buffer, count, new_value); | 60 | err = cpumask_parse(buffer, count, new_value); |
@@ -99,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
99 | { | 97 | { |
100 | char name [MAX_NAMELEN]; | 98 | char name [MAX_NAMELEN]; |
101 | 99 | ||
102 | if (!irq_dir[irq] || action->dir || !action->name || | 100 | if (!irq_desc[irq].dir || action->dir || !action->name || |
103 | !name_unique(irq, action)) | 101 | !name_unique(irq, action)) |
104 | return; | 102 | return; |
105 | 103 | ||
@@ -107,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
107 | snprintf(name, MAX_NAMELEN, "%s", action->name); | 105 | snprintf(name, MAX_NAMELEN, "%s", action->name); |
108 | 106 | ||
109 | /* create /proc/irq/1234/handler/ */ | 107 | /* create /proc/irq/1234/handler/ */ |
110 | action->dir = proc_mkdir(name, irq_dir[irq]); | 108 | action->dir = proc_mkdir(name, irq_desc[irq].dir); |
111 | } | 109 | } |
112 | 110 | ||
113 | #undef MAX_NAMELEN | 111 | #undef MAX_NAMELEN |
@@ -119,22 +117,22 @@ void register_irq_proc(unsigned int irq) | |||
119 | char name [MAX_NAMELEN]; | 117 | char name [MAX_NAMELEN]; |
120 | 118 | ||
121 | if (!root_irq_dir || | 119 | if (!root_irq_dir || |
122 | (irq_desc[irq].handler == &no_irq_type) || | 120 | (irq_desc[irq].chip == &no_irq_chip) || |
123 | irq_dir[irq]) | 121 | irq_desc[irq].dir) |
124 | return; | 122 | return; |
125 | 123 | ||
126 | memset(name, 0, MAX_NAMELEN); | 124 | memset(name, 0, MAX_NAMELEN); |
127 | sprintf(name, "%d", irq); | 125 | sprintf(name, "%d", irq); |
128 | 126 | ||
129 | /* create /proc/irq/1234 */ | 127 | /* create /proc/irq/1234 */ |
130 | irq_dir[irq] = proc_mkdir(name, root_irq_dir); | 128 | irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); |
131 | 129 | ||
132 | #ifdef CONFIG_SMP | 130 | #ifdef CONFIG_SMP |
133 | { | 131 | { |
134 | struct proc_dir_entry *entry; | 132 | struct proc_dir_entry *entry; |
135 | 133 | ||
136 | /* create /proc/irq/<irq>/smp_affinity */ | 134 | /* create /proc/irq/<irq>/smp_affinity */ |
137 | entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); | 135 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); |
138 | 136 | ||
139 | if (entry) { | 137 | if (entry) { |
140 | entry->nlink = 1; | 138 | entry->nlink = 1; |
@@ -142,7 +140,6 @@ void register_irq_proc(unsigned int irq) | |||
142 | entry->read_proc = irq_affinity_read_proc; | 140 | entry->read_proc = irq_affinity_read_proc; |
143 | entry->write_proc = irq_affinity_write_proc; | 141 | entry->write_proc = irq_affinity_write_proc; |
144 | } | 142 | } |
145 | smp_affinity_entry[irq] = entry; | ||
146 | } | 143 | } |
147 | #endif | 144 | #endif |
148 | } | 145 | } |
@@ -152,7 +149,7 @@ void register_irq_proc(unsigned int irq) | |||
152 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) | 149 | void unregister_handler_proc(unsigned int irq, struct irqaction *action) |
153 | { | 150 | { |
154 | if (action->dir) | 151 | if (action->dir) |
155 | remove_proc_entry(action->dir->name, irq_dir[irq]); | 152 | remove_proc_entry(action->dir->name, irq_desc[irq].dir); |
156 | } | 153 | } |
157 | 154 | ||
158 | void init_irq_proc(void) | 155 | void init_irq_proc(void) |
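The proc entries created above live at /proc/irq/<n>/smp_affinity and hold a hexadecimal CPU mask. A small user-space program that prints the current mask and optionally writes a new one (run as root; the IRQ number is whatever the system provides):

#include <stdio.h>

/* usage: ./smp_affinity <irq> [hexmask], e.g. "3" for CPUs 0-1 */
int main(int argc, char **argv)
{
        char path[64], buf[128];
        FILE *f;

        if (argc < 2)
                return 1;
        snprintf(path, sizeof(path), "/proc/irq/%s/smp_affinity", argv[1]);

        f = fopen(path, "r");
        if (!f) { perror(path); return 1; }
        if (fgets(buf, sizeof(buf), f))
                printf("current mask: %s", buf);
        fclose(f);

        if (argc > 2) {
                f = fopen(path, "w");
                if (!f) { perror(path); return 1; }
                fprintf(f, "%s\n", argv[2]);
                fclose(f);
        }
        return 0;
}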
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c new file mode 100644 index 000000000000..872f91ba2ce8 --- /dev/null +++ b/kernel/irq/resend.c | |||
@@ -0,0 +1,78 @@ | |||
1 | /* | ||
2 | * linux/kernel/irq/resend.c | ||
3 | * | ||
4 | * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar | ||
5 | * Copyright (C) 2005-2006, Thomas Gleixner | ||
6 | * | ||
7 | * This file contains the IRQ-resend code | ||
8 | * | ||
9 | * If the interrupt is waiting to be processed, we try to re-run it. | ||
10 | * We can't directly run it from here since the caller might be in an | ||
11 | * interrupt-protected region. Not all irq controller chips can | ||
12 | * retrigger interrupts at the hardware level, so in those cases | ||
13 | * we allow the resending of IRQs via a tasklet. | ||
14 | */ | ||
15 | |||
16 | #include <linux/irq.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/random.h> | ||
19 | #include <linux/interrupt.h> | ||
20 | |||
21 | #include "internals.h" | ||
22 | |||
23 | #ifdef CONFIG_HARDIRQS_SW_RESEND | ||
24 | |||
25 | /* Bitmap to handle software resend of interrupts: */ | ||
26 | static DECLARE_BITMAP(irqs_resend, NR_IRQS); | ||
27 | |||
28 | /* | ||
29 | * Run software resends of IRQ's | ||
30 | */ | ||
31 | static void resend_irqs(unsigned long arg) | ||
32 | { | ||
33 | struct irq_desc *desc; | ||
34 | int irq; | ||
35 | |||
36 | while (!bitmap_empty(irqs_resend, NR_IRQS)) { | ||
37 | irq = find_first_bit(irqs_resend, NR_IRQS); | ||
38 | clear_bit(irq, irqs_resend); | ||
39 | desc = irq_desc + irq; | ||
40 | local_irq_disable(); | ||
41 | desc->handle_irq(irq, desc, NULL); | ||
42 | local_irq_enable(); | ||
43 | } | ||
44 | } | ||
45 | |||
46 | /* Tasklet to handle resend: */ | ||
47 | static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); | ||
48 | |||
49 | #endif | ||
50 | |||
51 | /* | ||
52 | * IRQ resend | ||
53 | * | ||
54 | * Is called with interrupts disabled and desc->lock held. | ||
55 | */ | ||
56 | void check_irq_resend(struct irq_desc *desc, unsigned int irq) | ||
57 | { | ||
58 | unsigned int status = desc->status; | ||
59 | |||
60 | /* | ||
61 | * Make sure the interrupt is enabled, before resending it: | ||
62 | */ | ||
63 | desc->chip->enable(irq); | ||
64 | |||
65 | if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { | ||
66 | desc->status &= ~IRQ_PENDING; | ||
67 | desc->status = status | IRQ_REPLAY; | ||
68 | |||
69 | if (!desc->chip || !desc->chip->retrigger || | ||
70 | !desc->chip->retrigger(irq)) { | ||
71 | #ifdef CONFIG_HARDIRQS_SW_RESEND | ||
72 | /* Set it pending and activate the softirq: */ | ||
73 | set_bit(irq, irqs_resend); | ||
74 | tasklet_schedule(&resend_tasklet); | ||
75 | #endif | ||
76 | } | ||
77 | } | ||
78 | } | ||
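check_irq_resend() only falls back to the tasklet when the chip cannot replay the interrupt itself. A hedged sketch of what a hardware ->retrigger hook might look like for an imaginary controller (the base address and soft-set register are hypothetical):

#include <linux/irq.h>
#include <asm/io.h>

#define FOO_PIC_SOFTSET 0x10            /* hypothetical "set interrupt" register */
static void __iomem *foo_pic_base;      /* mapped elsewhere during probe */

static int foo_pic_retrigger(unsigned int irq)
{
        writel(1 << irq, foo_pic_base + FOO_PIC_SOFTSET);
        return 1;       /* non-zero: hardware replays it, no tasklet fallback */
}

static struct irq_chip foo_pic = {
        .name      = "FOO-PIC",
        .retrigger = foo_pic_retrigger,
        /* the remaining callbacks are omitted from this sketch */
};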
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7df9abd5ec86..417e98092cf2 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -11,44 +11,44 @@ | |||
11 | #include <linux/kallsyms.h> | 11 | #include <linux/kallsyms.h> |
12 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
13 | 13 | ||
14 | static int irqfixup; | 14 | static int irqfixup __read_mostly; |
15 | 15 | ||
16 | /* | 16 | /* |
17 | * Recovery handler for misrouted interrupts. | 17 | * Recovery handler for misrouted interrupts. |
18 | */ | 18 | */ |
19 | |||
20 | static int misrouted_irq(int irq, struct pt_regs *regs) | 19 | static int misrouted_irq(int irq, struct pt_regs *regs) |
21 | { | 20 | { |
22 | int i; | 21 | int i; |
23 | irq_desc_t *desc; | ||
24 | int ok = 0; | 22 | int ok = 0; |
25 | int work = 0; /* Did we do work for a real IRQ */ | 23 | int work = 0; /* Did we do work for a real IRQ */ |
26 | 24 | ||
27 | for(i = 1; i < NR_IRQS; i++) { | 25 | for (i = 1; i < NR_IRQS; i++) { |
26 | struct irq_desc *desc = irq_desc + i; | ||
28 | struct irqaction *action; | 27 | struct irqaction *action; |
29 | 28 | ||
30 | if (i == irq) /* Already tried */ | 29 | if (i == irq) /* Already tried */ |
31 | continue; | 30 | continue; |
32 | desc = &irq_desc[i]; | 31 | |
33 | spin_lock(&desc->lock); | 32 | spin_lock(&desc->lock); |
34 | action = desc->action; | ||
35 | /* Already running on another processor */ | 33 | /* Already running on another processor */ |
36 | if (desc->status & IRQ_INPROGRESS) { | 34 | if (desc->status & IRQ_INPROGRESS) { |
37 | /* | 35 | /* |
38 | * Already running: If it is shared get the other | 36 | * Already running: If it is shared get the other |
39 | * CPU to go looking for our mystery interrupt too | 37 | * CPU to go looking for our mystery interrupt too |
40 | */ | 38 | */ |
41 | if (desc->action && (desc->action->flags & SA_SHIRQ)) | 39 | if (desc->action && (desc->action->flags & IRQF_SHARED)) |
42 | desc->status |= IRQ_PENDING; | 40 | desc->status |= IRQ_PENDING; |
43 | spin_unlock(&desc->lock); | 41 | spin_unlock(&desc->lock); |
44 | continue; | 42 | continue; |
45 | } | 43 | } |
46 | /* Honour the normal IRQ locking */ | 44 | /* Honour the normal IRQ locking */ |
47 | desc->status |= IRQ_INPROGRESS; | 45 | desc->status |= IRQ_INPROGRESS; |
46 | action = desc->action; | ||
48 | spin_unlock(&desc->lock); | 47 | spin_unlock(&desc->lock); |
48 | |||
49 | while (action) { | 49 | while (action) { |
50 | /* Only shared IRQ handlers are safe to call */ | 50 | /* Only shared IRQ handlers are safe to call */ |
51 | if (action->flags & SA_SHIRQ) { | 51 | if (action->flags & IRQF_SHARED) { |
52 | if (action->handler(i, action->dev_id, regs) == | 52 | if (action->handler(i, action->dev_id, regs) == |
53 | IRQ_HANDLED) | 53 | IRQ_HANDLED) |
54 | ok = 1; | 54 | ok = 1; |
@@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) | |||
62 | 62 | ||
63 | /* | 63 | /* |
64 | * While we were looking for a fixup someone queued a real | 64 | * While we were looking for a fixup someone queued a real |
65 | * IRQ clashing with our walk | 65 | * IRQ clashing with our walk: |
66 | */ | 66 | */ |
67 | |||
68 | while ((desc->status & IRQ_PENDING) && action) { | 67 | while ((desc->status & IRQ_PENDING) && action) { |
69 | /* | 68 | /* |
70 | * Perform real IRQ processing for the IRQ we deferred | 69 | * Perform real IRQ processing for the IRQ we deferred |
@@ -80,8 +79,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) | |||
80 | * If we did actual work for the real IRQ line we must let the | 79 | * If we did actual work for the real IRQ line we must let the |
81 | * IRQ controller clean up too | 80 | * IRQ controller clean up too |
82 | */ | 81 | */ |
83 | if(work) | 82 | if (work && desc->chip && desc->chip->end) |
84 | desc->handler->end(i); | 83 | desc->chip->end(i); |
85 | spin_unlock(&desc->lock); | 84 | spin_unlock(&desc->lock); |
86 | } | 85 | } |
87 | /* So the caller can adjust the irq error counts */ | 86 | /* So the caller can adjust the irq error counts */ |
@@ -100,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) | |||
100 | */ | 99 | */ |
101 | 100 | ||
102 | static void | 101 | static void |
103 | __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | 102 | __report_bad_irq(unsigned int irq, struct irq_desc *desc, |
103 | irqreturn_t action_ret) | ||
104 | { | 104 | { |
105 | struct irqaction *action; | 105 | struct irqaction *action; |
106 | 106 | ||
@@ -113,6 +113,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
113 | } | 113 | } |
114 | dump_stack(); | 114 | dump_stack(); |
115 | printk(KERN_ERR "handlers:\n"); | 115 | printk(KERN_ERR "handlers:\n"); |
116 | |||
116 | action = desc->action; | 117 | action = desc->action; |
117 | while (action) { | 118 | while (action) { |
118 | printk(KERN_ERR "[<%p>]", action->handler); | 119 | printk(KERN_ERR "[<%p>]", action->handler); |
@@ -123,7 +124,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
123 | } | 124 | } |
124 | } | 125 | } |
125 | 126 | ||
126 | static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | 127 | static void |
128 | report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) | ||
127 | { | 129 | { |
128 | static int count = 100; | 130 | static int count = 100; |
129 | 131 | ||
@@ -133,12 +135,12 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio | |||
133 | } | 135 | } |
134 | } | 136 | } |
135 | 137 | ||
136 | void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, | 138 | void note_interrupt(unsigned int irq, struct irq_desc *desc, |
137 | struct pt_regs *regs) | 139 | irqreturn_t action_ret, struct pt_regs *regs) |
138 | { | 140 | { |
139 | if (action_ret != IRQ_HANDLED) { | 141 | if (unlikely(action_ret != IRQ_HANDLED)) { |
140 | desc->irqs_unhandled++; | 142 | desc->irqs_unhandled++; |
141 | if (action_ret != IRQ_NONE) | 143 | if (unlikely(action_ret != IRQ_NONE)) |
142 | report_bad_irq(irq, desc, action_ret); | 144 | report_bad_irq(irq, desc, action_ret); |
143 | } | 145 | } |
144 | 146 | ||
@@ -152,11 +154,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, | |||
152 | } | 154 | } |
153 | 155 | ||
154 | desc->irq_count++; | 156 | desc->irq_count++; |
155 | if (desc->irq_count < 100000) | 157 | if (likely(desc->irq_count < 100000)) |
156 | return; | 158 | return; |
157 | 159 | ||
158 | desc->irq_count = 0; | 160 | desc->irq_count = 0; |
159 | if (desc->irqs_unhandled > 99900) { | 161 | if (unlikely(desc->irqs_unhandled > 99900)) { |
160 | /* | 162 | /* |
161 | * The interrupt is stuck | 163 | * The interrupt is stuck |
162 | */ | 164 | */ |
@@ -166,17 +168,19 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, | |||
166 | */ | 168 | */ |
167 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); | 169 | printk(KERN_EMERG "Disabling IRQ #%d\n", irq); |
168 | desc->status |= IRQ_DISABLED; | 170 | desc->status |= IRQ_DISABLED; |
169 | desc->handler->disable(irq); | 171 | desc->depth = 1; |
172 | desc->chip->disable(irq); | ||
170 | } | 173 | } |
171 | desc->irqs_unhandled = 0; | 174 | desc->irqs_unhandled = 0; |
172 | } | 175 | } |
173 | 176 | ||
174 | int noirqdebug; | 177 | int noirqdebug __read_mostly; |
175 | 178 | ||
176 | int __init noirqdebug_setup(char *str) | 179 | int __init noirqdebug_setup(char *str) |
177 | { | 180 | { |
178 | noirqdebug = 1; | 181 | noirqdebug = 1; |
179 | printk(KERN_INFO "IRQ lockup detection disabled\n"); | 182 | printk(KERN_INFO "IRQ lockup detection disabled\n"); |
183 | |||
180 | return 1; | 184 | return 1; |
181 | } | 185 | } |
182 | 186 | ||
@@ -187,6 +191,7 @@ static int __init irqfixup_setup(char *str) | |||
187 | irqfixup = 1; | 191 | irqfixup = 1; |
188 | printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); | 192 | printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); |
189 | printk(KERN_WARNING "This may impact system performance.\n"); | 193 | printk(KERN_WARNING "This may impact system performance.\n"); |
194 | |||
190 | return 1; | 195 | return 1; |
191 | } | 196 | } |
192 | 197 | ||
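note_interrupt() now also resets the disable depth to 1 when it shuts a line down, so a later enable_irq() balances out. The detection itself is a simple ratio: per 100,000 interrupts, more than 99,900 unhandled ones disable the line. A user-space simulation of that bookkeeping:

#include <stdio.h>

int main(void)
{
        unsigned int irq_count = 0, irqs_unhandled = 0;
        unsigned long i;

        for (i = 0; i < 300000; i++) {
                int handled = (i % 2000) == 0;  /* only 0.05% handled: a "stuck" line */

                if (!handled)
                        irqs_unhandled++;
                if (++irq_count < 100000)
                        continue;
                irq_count = 0;
                if (irqs_unhandled > 99900)
                        printf("would disable the IRQ at interrupt %lu\n", i);
                irqs_unhandled = 0;
        }
        return 0;
}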
diff --git a/kernel/kexec.c b/kernel/kexec.c index bf39d28e4c0e..50087ecf337e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image, | |||
902 | * kexec does not sync, or unmount filesystems so if you need | 902 | * kexec does not sync, or unmount filesystems so if you need |
903 | * that to happen you need to do that yourself. | 903 | * that to happen you need to do that yourself. |
904 | */ | 904 | */ |
905 | struct kimage *kexec_image = NULL; | 905 | struct kimage *kexec_image; |
906 | static struct kimage *kexec_crash_image = NULL; | 906 | struct kimage *kexec_crash_image; |
907 | /* | 907 | /* |
908 | * A home grown binary mutex. | 908 | * A home grown binary mutex. |
909 | * Nothing can wait so this mutex is safe to use | 909 | * Nothing can wait so this mutex is safe to use |
910 | * in interrupt context :) | 910 | * in interrupt context :) |
911 | */ | 911 | */ |
912 | static int kexec_lock = 0; | 912 | static int kexec_lock; |
913 | 913 | ||
914 | asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, | 914 | asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, |
915 | struct kexec_segment __user *segments, | 915 | struct kexec_segment __user *segments, |
@@ -1042,7 +1042,6 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, | |||
1042 | 1042 | ||
1043 | void crash_kexec(struct pt_regs *regs) | 1043 | void crash_kexec(struct pt_regs *regs) |
1044 | { | 1044 | { |
1045 | struct kimage *image; | ||
1046 | int locked; | 1045 | int locked; |
1047 | 1046 | ||
1048 | 1047 | ||
@@ -1056,12 +1055,11 @@ void crash_kexec(struct pt_regs *regs) | |||
1056 | */ | 1055 | */ |
1057 | locked = xchg(&kexec_lock, 1); | 1056 | locked = xchg(&kexec_lock, 1); |
1058 | if (!locked) { | 1057 | if (!locked) { |
1059 | image = xchg(&kexec_crash_image, NULL); | 1058 | if (kexec_crash_image) { |
1060 | if (image) { | ||
1061 | struct pt_regs fixed_regs; | 1059 | struct pt_regs fixed_regs; |
1062 | crash_setup_regs(&fixed_regs, regs); | 1060 | crash_setup_regs(&fixed_regs, regs); |
1063 | machine_crash_shutdown(&fixed_regs); | 1061 | machine_crash_shutdown(&fixed_regs); |
1064 | machine_kexec(image); | 1062 | machine_kexec(kexec_crash_image); |
1065 | } | 1063 | } |
1066 | xchg(&kexec_lock, 0); | 1064 | xchg(&kexec_lock, 0); |
1067 | } | 1065 | } |
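crash_kexec() keeps its "home grown binary mutex": xchg() atomically swaps in 1 and only the caller that saw the old value 0 proceeds, which is safe even in interrupt context because nobody ever waits. A user-space analogue of the same try-lock pattern using C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int crash_lock;

static void crash_path(void)
{
        /* analogous to: locked = xchg(&kexec_lock, 1); if (!locked) { ... } */
        if (atomic_exchange(&crash_lock, 1) == 0) {
                puts("performing the one-shot crash handling");
                atomic_exchange(&crash_lock, 0);
        } else {
                puts("lock already taken; giving up instead of waiting");
        }
}

int main(void)
{
        crash_path();
        return 0;
}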
diff --git a/kernel/kmod.c b/kernel/kmod.c index 20a997c73c3d..1b7157af051c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -20,7 +20,6 @@ | |||
20 | */ | 20 | */ |
21 | #define __KERNEL_SYSCALLS__ | 21 | #define __KERNEL_SYSCALLS__ |
22 | 22 | ||
23 | #include <linux/config.h> | ||
24 | #include <linux/module.h> | 23 | #include <linux/module.h> |
25 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
26 | #include <linux/syscalls.h> | 25 | #include <linux/syscalls.h> |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fbf466a29aa..64aab081153b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -47,11 +47,17 @@ | |||
47 | 47 | ||
48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
50 | static atomic_t kprobe_count; | ||
50 | 51 | ||
51 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 52 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 53 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 54 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
54 | 55 | ||
56 | static struct notifier_block kprobe_page_fault_nb = { | ||
57 | .notifier_call = kprobe_exceptions_notify, | ||
58 | .priority = 0x7fffffff /* we need to be notified first */ | ||
59 | }; | ||
60 | |||
55 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT | 61 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT |
56 | /* | 62 | /* |
57 | * kprobe->ainsn.insn points to the copy of the instruction to be | 63 | * kprobe->ainsn.insn points to the copy of the instruction to be |
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
368 | */ | 374 | */ |
369 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | 375 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) |
370 | { | 376 | { |
371 | struct kprobe *kp; | ||
372 | |||
373 | if (p->break_handler) { | 377 | if (p->break_handler) { |
374 | list_for_each_entry_rcu(kp, &old_p->list, list) { | 378 | if (old_p->break_handler) |
375 | if (kp->break_handler) | 379 | return -EEXIST; |
376 | return -EEXIST; | ||
377 | } | ||
378 | list_add_tail_rcu(&p->list, &old_p->list); | 380 | list_add_tail_rcu(&p->list, &old_p->list); |
381 | old_p->break_handler = aggr_break_handler; | ||
379 | } else | 382 | } else |
380 | list_add_rcu(&p->list, &old_p->list); | 383 | list_add_rcu(&p->list, &old_p->list); |
384 | if (p->post_handler && !old_p->post_handler) | ||
385 | old_p->post_handler = aggr_post_handler; | ||
381 | return 0; | 386 | return 0; |
382 | } | 387 | } |
383 | 388 | ||
@@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
390 | copy_kprobe(p, ap); | 395 | copy_kprobe(p, ap); |
391 | ap->addr = p->addr; | 396 | ap->addr = p->addr; |
392 | ap->pre_handler = aggr_pre_handler; | 397 | ap->pre_handler = aggr_pre_handler; |
393 | ap->post_handler = aggr_post_handler; | ||
394 | ap->fault_handler = aggr_fault_handler; | 398 | ap->fault_handler = aggr_fault_handler; |
395 | ap->break_handler = aggr_break_handler; | 399 | if (p->post_handler) |
400 | ap->post_handler = aggr_post_handler; | ||
401 | if (p->break_handler) | ||
402 | ap->break_handler = aggr_break_handler; | ||
396 | 403 | ||
397 | INIT_LIST_HEAD(&ap->list); | 404 | INIT_LIST_HEAD(&ap->list); |
398 | list_add_rcu(&p->list, &ap->list); | 405 | list_add_rcu(&p->list, &ap->list); |
@@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
464 | old_p = get_kprobe(p->addr); | 471 | old_p = get_kprobe(p->addr); |
465 | if (old_p) { | 472 | if (old_p) { |
466 | ret = register_aggr_kprobe(old_p, p); | 473 | ret = register_aggr_kprobe(old_p, p); |
474 | if (!ret) | ||
475 | atomic_inc(&kprobe_count); | ||
467 | goto out; | 476 | goto out; |
468 | } | 477 | } |
469 | 478 | ||
@@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
474 | hlist_add_head_rcu(&p->hlist, | 483 | hlist_add_head_rcu(&p->hlist, |
475 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 484 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
476 | 485 | ||
486 | if (atomic_add_return(1, &kprobe_count) == \ | ||
487 | (ARCH_INACTIVE_KPROBE_COUNT + 1)) | ||
488 | register_page_fault_notifier(&kprobe_page_fault_nb); | ||
489 | |||
477 | arch_arm_kprobe(p); | 490 | arch_arm_kprobe(p); |
478 | 491 | ||
479 | out: | 492 | out: |
@@ -536,14 +549,40 @@ valid_p: | |||
536 | kfree(old_p); | 549 | kfree(old_p); |
537 | } | 550 | } |
538 | arch_remove_kprobe(p); | 551 | arch_remove_kprobe(p); |
552 | } else { | ||
553 | mutex_lock(&kprobe_mutex); | ||
554 | if (p->break_handler) | ||
555 | old_p->break_handler = NULL; | ||
556 | if (p->post_handler){ | ||
557 | list_for_each_entry_rcu(list_p, &old_p->list, list){ | ||
558 | if (list_p->post_handler){ | ||
559 | cleanup_p = 2; | ||
560 | break; | ||
561 | } | ||
562 | } | ||
563 | if (cleanup_p == 0) | ||
564 | old_p->post_handler = NULL; | ||
565 | } | ||
566 | mutex_unlock(&kprobe_mutex); | ||
539 | } | 567 | } |
568 | |||
569 | /* Call unregister_page_fault_notifier() | ||
570 | * if no probes are active | ||
571 | */ | ||
572 | mutex_lock(&kprobe_mutex); | ||
573 | if (atomic_add_return(-1, &kprobe_count) == \ | ||
574 | ARCH_INACTIVE_KPROBE_COUNT) | ||
575 | unregister_page_fault_notifier(&kprobe_page_fault_nb); | ||
576 | mutex_unlock(&kprobe_mutex); | ||
577 | return; | ||
540 | } | 578 | } |
541 | 579 | ||
542 | static struct notifier_block kprobe_exceptions_nb = { | 580 | static struct notifier_block kprobe_exceptions_nb = { |
543 | .notifier_call = kprobe_exceptions_notify, | 581 | .notifier_call = kprobe_exceptions_notify, |
544 | .priority = 0x7fffffff /* we need to notified first */ | 582 | .priority = 0x7fffffff /* we need to be notified first */ |
545 | }; | 583 | }; |
546 | 584 | ||
585 | |||
547 | int __kprobes register_jprobe(struct jprobe *jp) | 586 | int __kprobes register_jprobe(struct jprobe *jp) |
548 | { | 587 | { |
549 | /* Todo: Verify probepoint is a function entry point */ | 588 | /* Todo: Verify probepoint is a function entry point */ |
@@ -652,6 +691,7 @@ static int __init init_kprobes(void) | |||
652 | INIT_HLIST_HEAD(&kprobe_table[i]); | 691 | INIT_HLIST_HEAD(&kprobe_table[i]); |
653 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | 692 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); |
654 | } | 693 | } |
694 | atomic_set(&kprobe_count, 0); | ||
655 | 695 | ||
656 | err = arch_init_kprobes(); | 696 | err = arch_init_kprobes(); |
657 | if (!err) | 697 | if (!err) |
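With the page-fault notifier now registered only while probes are active, a module that installs a probe still looks the same from the API side. A hedged sketch (the probed address is a hypothetical module parameter; handler signatures follow the 2.6.17-era kprobes interface):

#include <linux/module.h>
#include <linux/kprobes.h>

static unsigned long probe_addr;        /* hypothetical: supplied at load time */
module_param(probe_addr, ulong, 0444);

static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
        printk(KERN_INFO "kprobe hit at %p\n", p->addr);
        return 0;
}

static void my_post(struct kprobe *p, struct pt_regs *regs, unsigned long flags)
{
        /* runs after the probed instruction completes */
}

static struct kprobe kp = {
        .pre_handler  = my_pre,
        .post_handler = my_post,
};

static int __init my_init(void)
{
        kp.addr = (kprobe_opcode_t *) probe_addr;
        return register_kprobe(&kp);
}

static void __exit my_exit(void)
{
        unregister_kprobe(&kp);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");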
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index f119e098e67b..e0ffe4ab0917 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -8,12 +8,12 @@ | |||
8 | * | 8 | * |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/config.h> | ||
12 | #include <linux/kobject.h> | 11 | #include <linux/kobject.h> |
13 | #include <linux/string.h> | 12 | #include <linux/string.h> |
14 | #include <linux/sysfs.h> | 13 | #include <linux/sysfs.h> |
15 | #include <linux/module.h> | 14 | #include <linux/module.h> |
16 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/kexec.h> | ||
17 | 17 | ||
18 | #define KERNEL_ATTR_RO(_name) \ | 18 | #define KERNEL_ATTR_RO(_name) \ |
19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) |
@@ -48,6 +48,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s | |||
48 | KERNEL_ATTR_RW(uevent_helper); | 48 | KERNEL_ATTR_RW(uevent_helper); |
49 | #endif | 49 | #endif |
50 | 50 | ||
51 | #ifdef CONFIG_KEXEC | ||
52 | static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) | ||
53 | { | ||
54 | return sprintf(page, "%d\n", !!kexec_image); | ||
55 | } | ||
56 | KERNEL_ATTR_RO(kexec_loaded); | ||
57 | |||
58 | static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) | ||
59 | { | ||
60 | return sprintf(page, "%d\n", !!kexec_crash_image); | ||
61 | } | ||
62 | KERNEL_ATTR_RO(kexec_crash_loaded); | ||
63 | #endif /* CONFIG_KEXEC */ | ||
64 | |||
51 | decl_subsys(kernel, NULL, NULL); | 65 | decl_subsys(kernel, NULL, NULL); |
52 | EXPORT_SYMBOL_GPL(kernel_subsys); | 66 | EXPORT_SYMBOL_GPL(kernel_subsys); |
53 | 67 | ||
@@ -56,6 +70,10 @@ static struct attribute * kernel_attrs[] = { | |||
56 | &uevent_seqnum_attr.attr, | 70 | &uevent_seqnum_attr.attr, |
57 | &uevent_helper_attr.attr, | 71 | &uevent_helper_attr.attr, |
58 | #endif | 72 | #endif |
73 | #ifdef CONFIG_KEXEC | ||
74 | &kexec_loaded_attr.attr, | ||
75 | &kexec_crash_loaded_attr.attr, | ||
76 | #endif | ||
59 | NULL | 77 | NULL |
60 | }; | 78 | }; |
61 | 79 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index c5f3c6613b6d..24be714b04c7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -45,6 +45,13 @@ struct kthread_stop_info | |||
45 | static DEFINE_MUTEX(kthread_stop_lock); | 45 | static DEFINE_MUTEX(kthread_stop_lock); |
46 | static struct kthread_stop_info kthread_stop_info; | 46 | static struct kthread_stop_info kthread_stop_info; |
47 | 47 | ||
48 | /** | ||
49 | * kthread_should_stop - should this kthread return now? | ||
50 | * | ||
51 | * When someone calls kthread_stop on your kthread, it will be woken | ||
52 | * and this will return true. You should then return, and your return | ||
53 | * value will be passed through to kthread_stop(). | ||
54 | */ | ||
48 | int kthread_should_stop(void) | 55 | int kthread_should_stop(void) |
49 | { | 56 | { |
50 | return (kthread_stop_info.k == current); | 57 | return (kthread_stop_info.k == current); |
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create) | |||
122 | complete(&create->done); | 129 | complete(&create->done); |
123 | } | 130 | } |
124 | 131 | ||
132 | /** | ||
133 | * kthread_create - create a kthread. | ||
134 | * @threadfn: the function to run until signal_pending(current). | ||
135 | * @data: data ptr for @threadfn. | ||
136 | * @namefmt: printf-style name for the thread. | ||
137 | * | ||
138 | * Description: This helper function creates and names a kernel | ||
139 | * thread. The thread will be stopped: use wake_up_process() to start | ||
140 | * it. See also kthread_run(), kthread_create_on_cpu(). | ||
141 | * | ||
142 | * When woken, the thread will run @threadfn() with @data as its | ||
143 | * argument. @threadfn can either call do_exit() directly if it is a | ||
144 | * standalone thread for which no one will call kthread_stop(), or | ||
145 | * return when 'kthread_should_stop()' is true (which means | ||
146 | * kthread_stop() has been called). The return value should be zero | ||
147 | * or a negative error number; it will be passed to kthread_stop(). | ||
148 | * | ||
149 | * Returns a task_struct or ERR_PTR(-ENOMEM). | ||
150 | */ | ||
125 | struct task_struct *kthread_create(int (*threadfn)(void *data), | 151 | struct task_struct *kthread_create(int (*threadfn)(void *data), |
126 | void *data, | 152 | void *data, |
127 | const char namefmt[], | 153 | const char namefmt[], |
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
156 | } | 182 | } |
157 | EXPORT_SYMBOL(kthread_create); | 183 | EXPORT_SYMBOL(kthread_create); |
158 | 184 | ||
185 | /** | ||
186 | * kthread_bind - bind a just-created kthread to a cpu. | ||
187 | * @k: thread created by kthread_create(). | ||
188 | * @cpu: cpu (might not be online, must be possible) for @k to run on. | ||
189 | * | ||
190 | * Description: This function is equivalent to set_cpus_allowed(), | ||
191 | * except that @cpu doesn't need to be online, and the thread must be | ||
192 | * stopped (i.e., just returned from kthread_create()). | ||
193 | */ | ||
159 | void kthread_bind(struct task_struct *k, unsigned int cpu) | 194 | void kthread_bind(struct task_struct *k, unsigned int cpu) |
160 | { | 195 | { |
161 | BUG_ON(k->state != TASK_INTERRUPTIBLE); | 196 | BUG_ON(k->state != TASK_INTERRUPTIBLE); |
@@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) | |||
166 | } | 201 | } |
167 | EXPORT_SYMBOL(kthread_bind); | 202 | EXPORT_SYMBOL(kthread_bind); |
168 | 203 | ||
204 | /** | ||
205 | * kthread_stop - stop a thread created by kthread_create(). | ||
206 | * @k: thread created by kthread_create(). | ||
207 | * | ||
208 | * Sets kthread_should_stop() for @k to return true, wakes it, and | ||
209 | * waits for it to exit. Your threadfn() must not call do_exit() | ||
210 | * itself if you use this function! This can also be called after | ||
211 | * kthread_create() instead of calling wake_up_process(): the thread | ||
212 | * will exit without calling threadfn(). | ||
213 | * | ||
214 | * Returns the result of threadfn(), or %-EINTR if wake_up_process() | ||
215 | * was never called. | ||
216 | */ | ||
169 | int kthread_stop(struct task_struct *k) | 217 | int kthread_stop(struct task_struct *k) |
170 | { | 218 | { |
171 | return kthread_stop_sem(k, NULL); | 219 | return kthread_stop_sem(k, NULL); |
172 | } | 220 | } |
173 | EXPORT_SYMBOL(kthread_stop); | 221 | EXPORT_SYMBOL(kthread_stop); |
174 | 222 | ||
223 | /** | ||
224 | * kthread_stop_sem - stop a thread created by kthread_create(). | ||
225 | * @k: thread created by kthread_create(). | ||
226 | * @s: semaphore that @k waits on while idle. | ||
227 | * | ||
228 | * Does essentially the same thing as kthread_stop() above, but wakes | ||
229 | * @k by calling up(@s). | ||
230 | * | ||
231 | * Returns the result of threadfn(), or %-EINTR if wake_up_process() | ||
232 | * was never called. | ||
233 | */ | ||
175 | int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | 234 | int kthread_stop_sem(struct task_struct *k, struct semaphore *s) |
176 | { | 235 | { |
177 | int ret; | 236 | int ret; |
@@ -210,5 +269,5 @@ static __init int helper_init(void) | |||
210 | 269 | ||
211 | return 0; | 270 | return 0; |
212 | } | 271 | } |
213 | core_initcall(helper_init); | ||
214 | 272 | ||
273 | core_initcall(helper_init); | ||
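Following the kerneldoc added above, the usual lifecycle is kthread_create(), an optional kthread_bind(), wake_up_process(), and finally kthread_stop(). A hedged sketch of that pattern (thread name and work loop are placeholders):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
        while (!kthread_should_stop()) {
                /* do one unit of work, then sleep; stop latency is at most
                 * one sleep period with this simple loop */
                msleep(100);
        }
        return 0;       /* handed back to the caller of kthread_stop() */
}

static int start_worker(void)
{
        worker = kthread_create(worker_fn, NULL, "foo-worker/%d", 0);
        if (IS_ERR(worker))
                return PTR_ERR(worker);
        kthread_bind(worker, 0);        /* optional: pin to CPU 0 before first wakeup */
        wake_up_process(worker);        /* kthread_create() leaves it stopped */
        return 0;
}

static void stop_worker(void)
{
        kthread_stop(worker);
}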
diff --git a/kernel/module.c b/kernel/module.c index bbe04862e1b0..281172f01e9a 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* Rewritten by Rusty Russell, on the backs of many others... | 1 | /* |
2 | Copyright (C) 2002 Richard Henderson | 2 | Copyright (C) 2002 Richard Henderson |
3 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. | 3 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. |
4 | 4 | ||
@@ -16,7 +16,6 @@ | |||
16 | along with this program; if not, write to the Free Software | 16 | along with this program; if not, write to the Free Software |
17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
18 | */ | 18 | */ |
19 | #include <linux/config.h> | ||
20 | #include <linux/module.h> | 19 | #include <linux/module.h> |
21 | #include <linux/moduleloader.h> | 20 | #include <linux/moduleloader.h> |
22 | #include <linux/init.h> | 21 | #include <linux/init.h> |
@@ -40,9 +39,11 @@ | |||
40 | #include <linux/string.h> | 39 | #include <linux/string.h> |
41 | #include <linux/sched.h> | 40 | #include <linux/sched.h> |
42 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/unwind.h> | ||
43 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
44 | #include <asm/semaphore.h> | 44 | #include <asm/semaphore.h> |
45 | #include <asm/cacheflush.h> | 45 | #include <asm/cacheflush.h> |
46 | #include <linux/license.h> | ||
46 | 47 | ||
47 | #if 0 | 48 | #if 0 |
48 | #define DEBUGP printk | 49 | #define DEBUGP printk |
@@ -120,9 +121,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; | |||
120 | extern const struct kernel_symbol __stop___ksymtab_gpl[]; | 121 | extern const struct kernel_symbol __stop___ksymtab_gpl[]; |
121 | extern const struct kernel_symbol __start___ksymtab_gpl_future[]; | 122 | extern const struct kernel_symbol __start___ksymtab_gpl_future[]; |
122 | extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; | 123 | extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; |
124 | extern const struct kernel_symbol __start___ksymtab_unused[]; | ||
125 | extern const struct kernel_symbol __stop___ksymtab_unused[]; | ||
126 | extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; | ||
127 | extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; | ||
128 | extern const struct kernel_symbol __start___ksymtab_gpl_future[]; | ||
129 | extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; | ||
123 | extern const unsigned long __start___kcrctab[]; | 130 | extern const unsigned long __start___kcrctab[]; |
124 | extern const unsigned long __start___kcrctab_gpl[]; | 131 | extern const unsigned long __start___kcrctab_gpl[]; |
125 | extern const unsigned long __start___kcrctab_gpl_future[]; | 132 | extern const unsigned long __start___kcrctab_gpl_future[]; |
133 | extern const unsigned long __start___kcrctab_unused[]; | ||
134 | extern const unsigned long __start___kcrctab_unused_gpl[]; | ||
126 | 135 | ||
127 | #ifndef CONFIG_MODVERSIONS | 136 | #ifndef CONFIG_MODVERSIONS |
128 | #define symversion(base, idx) NULL | 137 | #define symversion(base, idx) NULL |
@@ -142,6 +151,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name, | |||
142 | return NULL; | 151 | return NULL; |
143 | } | 152 | } |
144 | 153 | ||
154 | static void printk_unused_warning(const char *name) | ||
155 | { | ||
156 | printk(KERN_WARNING "Symbol %s is marked as UNUSED, " | ||
157 | "however this module is using it.\n", name); | ||
158 | printk(KERN_WARNING "This symbol will go away in the future.\n"); | ||
159 | printk(KERN_WARNING "Please evaluate if this is the right api to use, " | ||
160 | "and if it really is, submit a report to the linux kernel " | ||
161 | "mailing list together with submitting your code for " | ||
162 | "inclusion.\n"); | ||
163 | } | ||
164 | |||
145 | /* Find a symbol, return value, crc and module which owns it */ | 165 | /* Find a symbol, return value, crc and module which owns it */ |
146 | static unsigned long __find_symbol(const char *name, | 166 | static unsigned long __find_symbol(const char *name, |
147 | struct module **owner, | 167 | struct module **owner, |
@@ -184,6 +204,25 @@ static unsigned long __find_symbol(const char *name, | |||
184 | return ks->value; | 204 | return ks->value; |
185 | } | 205 | } |
186 | 206 | ||
207 | ks = lookup_symbol(name, __start___ksymtab_unused, | ||
208 | __stop___ksymtab_unused); | ||
209 | if (ks) { | ||
210 | printk_unused_warning(name); | ||
211 | *crc = symversion(__start___kcrctab_unused, | ||
212 | (ks - __start___ksymtab_unused)); | ||
213 | return ks->value; | ||
214 | } | ||
215 | |||
216 | if (gplok) | ||
217 | ks = lookup_symbol(name, __start___ksymtab_unused_gpl, | ||
218 | __stop___ksymtab_unused_gpl); | ||
219 | if (ks) { | ||
220 | printk_unused_warning(name); | ||
221 | *crc = symversion(__start___kcrctab_unused_gpl, | ||
222 | (ks - __start___ksymtab_unused_gpl)); | ||
223 | return ks->value; | ||
224 | } | ||
225 | |||
187 | /* Now try modules. */ | 226 | /* Now try modules. */ |
188 | list_for_each_entry(mod, &modules, list) { | 227 | list_for_each_entry(mod, &modules, list) { |
189 | *owner = mod; | 228 | *owner = mod; |
@@ -202,6 +241,23 @@ static unsigned long __find_symbol(const char *name, | |||
202 | return ks->value; | 241 | return ks->value; |
203 | } | 242 | } |
204 | } | 243 | } |
244 | ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms); | ||
245 | if (ks) { | ||
246 | printk_unused_warning(name); | ||
247 | *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms)); | ||
248 | return ks->value; | ||
249 | } | ||
250 | |||
251 | if (gplok) { | ||
252 | ks = lookup_symbol(name, mod->unused_gpl_syms, | ||
253 | mod->unused_gpl_syms + mod->num_unused_gpl_syms); | ||
254 | if (ks) { | ||
255 | printk_unused_warning(name); | ||
256 | *crc = symversion(mod->unused_gpl_crcs, | ||
257 | (ks - mod->unused_gpl_syms)); | ||
258 | return ks->value; | ||
259 | } | ||
260 | } | ||
205 | ks = lookup_symbol(name, mod->gpl_future_syms, | 261 | ks = lookup_symbol(name, mod->gpl_future_syms, |
206 | (mod->gpl_future_syms + | 262 | (mod->gpl_future_syms + |
207 | mod->num_gpl_future_syms)); | 263 | mod->num_gpl_future_syms)); |
@@ -1051,6 +1107,8 @@ static void free_module(struct module *mod) | |||
1051 | remove_sect_attrs(mod); | 1107 | remove_sect_attrs(mod); |
1052 | mod_kobject_remove(mod); | 1108 | mod_kobject_remove(mod); |
1053 | 1109 | ||
1110 | unwind_remove_table(mod->unwind_info, 0); | ||
1111 | |||
1054 | /* Arch-specific cleanup. */ | 1112 | /* Arch-specific cleanup. */ |
1055 | module_arch_cleanup(mod); | 1113 | module_arch_cleanup(mod); |
1056 | 1114 | ||
@@ -1248,16 +1306,6 @@ static void layout_sections(struct module *mod, | |||
1248 | } | 1306 | } |
1249 | } | 1307 | } |
1250 | 1308 | ||
1251 | static inline int license_is_gpl_compatible(const char *license) | ||
1252 | { | ||
1253 | return (strcmp(license, "GPL") == 0 | ||
1254 | || strcmp(license, "GPL v2") == 0 | ||
1255 | || strcmp(license, "GPL and additional rights") == 0 | ||
1256 | || strcmp(license, "Dual BSD/GPL") == 0 | ||
1257 | || strcmp(license, "Dual MIT/GPL") == 0 | ||
1258 | || strcmp(license, "Dual MPL/GPL") == 0); | ||
1259 | } | ||
1260 | |||
1261 | static void set_license(struct module *mod, const char *license) | 1309 | static void set_license(struct module *mod, const char *license) |
1262 | { | 1310 | { |
1263 | if (!license) | 1311 | if (!license) |
@@ -1326,7 +1374,7 @@ int is_exported(const char *name, const struct module *mod) | |||
1326 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) | 1374 | if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) |
1327 | return 1; | 1375 | return 1; |
1328 | else | 1376 | else |
1329 | if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) | 1377 | if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) |
1330 | return 1; | 1378 | return 1; |
1331 | else | 1379 | else |
1332 | return 0; | 1380 | return 0; |
@@ -1409,10 +1457,27 @@ static struct module *load_module(void __user *umod, | |||
1409 | Elf_Ehdr *hdr; | 1457 | Elf_Ehdr *hdr; |
1410 | Elf_Shdr *sechdrs; | 1458 | Elf_Shdr *sechdrs; |
1411 | char *secstrings, *args, *modmagic, *strtab = NULL; | 1459 | char *secstrings, *args, *modmagic, *strtab = NULL; |
1412 | unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, | 1460 | unsigned int i; |
1413 | exportindex, modindex, obsparmindex, infoindex, gplindex, | 1461 | unsigned int symindex = 0; |
1414 | crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, | 1462 | unsigned int strindex = 0; |
1415 | gplfuturecrcindex; | 1463 | unsigned int setupindex; |
1464 | unsigned int exindex; | ||
1465 | unsigned int exportindex; | ||
1466 | unsigned int modindex; | ||
1467 | unsigned int obsparmindex; | ||
1468 | unsigned int infoindex; | ||
1469 | unsigned int gplindex; | ||
1470 | unsigned int crcindex; | ||
1471 | unsigned int gplcrcindex; | ||
1472 | unsigned int versindex; | ||
1473 | unsigned int pcpuindex; | ||
1474 | unsigned int gplfutureindex; | ||
1475 | unsigned int gplfuturecrcindex; | ||
1476 | unsigned int unwindex = 0; | ||
1477 | unsigned int unusedindex; | ||
1478 | unsigned int unusedcrcindex; | ||
1479 | unsigned int unusedgplindex; | ||
1480 | unsigned int unusedgplcrcindex; | ||
1416 | struct module *mod; | 1481 | struct module *mod; |
1417 | long err = 0; | 1482 | long err = 0; |
1418 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ | 1483 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ |
@@ -1493,15 +1558,22 @@ static struct module *load_module(void __user *umod, | |||
1493 | exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); | 1558 | exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); |
1494 | gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); | 1559 | gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); |
1495 | gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); | 1560 | gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); |
1561 | unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); | ||
1562 | unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); | ||
1496 | crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); | 1563 | crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); |
1497 | gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); | 1564 | gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); |
1498 | gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); | 1565 | gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); |
1566 | unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); | ||
1567 | unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); | ||
1499 | setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); | 1568 | setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); |
1500 | exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); | 1569 | exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); |
1501 | obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); | 1570 | obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); |
1502 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); | 1571 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); |
1503 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); | 1572 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); |
1504 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); | 1573 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); |
1574 | #ifdef ARCH_UNWIND_SECTION_NAME | ||
1575 | unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); | ||
1576 | #endif | ||
1505 | 1577 | ||
1506 | /* Don't keep modinfo section */ | 1578 | /* Don't keep modinfo section */ |
1507 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | 1579 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; |
@@ -1510,6 +1582,8 @@ static struct module *load_module(void __user *umod, | |||
1510 | sechdrs[symindex].sh_flags |= SHF_ALLOC; | 1582 | sechdrs[symindex].sh_flags |= SHF_ALLOC; |
1511 | sechdrs[strindex].sh_flags |= SHF_ALLOC; | 1583 | sechdrs[strindex].sh_flags |= SHF_ALLOC; |
1512 | #endif | 1584 | #endif |
1585 | if (unwindex) | ||
1586 | sechdrs[unwindex].sh_flags |= SHF_ALLOC; | ||
1513 | 1587 | ||
1514 | /* Check module struct version now, before we try to use module. */ | 1588 | /* Check module struct version now, before we try to use module. */ |
1515 | if (!check_modstruct_version(sechdrs, versindex, mod)) { | 1589 | if (!check_modstruct_version(sechdrs, versindex, mod)) { |
@@ -1639,14 +1713,27 @@ static struct module *load_module(void __user *umod, | |||
1639 | mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; | 1713 | mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; |
1640 | mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / | 1714 | mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / |
1641 | sizeof(*mod->gpl_future_syms); | 1715 | sizeof(*mod->gpl_future_syms); |
1716 | mod->num_unused_syms = sechdrs[unusedindex].sh_size / | ||
1717 | sizeof(*mod->unused_syms); | ||
1718 | mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / | ||
1719 | sizeof(*mod->unused_gpl_syms); | ||
1642 | mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; | 1720 | mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; |
1643 | if (gplfuturecrcindex) | 1721 | if (gplfuturecrcindex) |
1644 | mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; | 1722 | mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; |
1645 | 1723 | ||
1724 | mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; | ||
1725 | if (unusedcrcindex) | ||
1726 | mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; | ||
1727 | mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; | ||
1728 | if (unusedgplcrcindex) | ||
1729 | mod->unused_gpl_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; | ||
1730 | |||
1646 | #ifdef CONFIG_MODVERSIONS | 1731 | #ifdef CONFIG_MODVERSIONS |
1647 | if ((mod->num_syms && !crcindex) || | 1732 | if ((mod->num_syms && !crcindex) || |
1648 | (mod->num_gpl_syms && !gplcrcindex) || | 1733 | (mod->num_gpl_syms && !gplcrcindex) || |
1649 | (mod->num_gpl_future_syms && !gplfuturecrcindex)) { | 1734 | (mod->num_gpl_future_syms && !gplfuturecrcindex) || |
1735 | (mod->num_unused_syms && !unusedcrcindex) || | ||
1736 | (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { | ||
1650 | printk(KERN_WARNING "%s: No versions for exported symbols." | 1737 | printk(KERN_WARNING "%s: No versions for exported symbols." |
1651 | " Tainting kernel.\n", mod->name); | 1738 | " Tainting kernel.\n", mod->name); |
1652 | add_taint(TAINT_FORCED_MODULE); | 1739 | add_taint(TAINT_FORCED_MODULE); |
@@ -1738,6 +1825,11 @@ static struct module *load_module(void __user *umod, | |||
1738 | goto arch_cleanup; | 1825 | goto arch_cleanup; |
1739 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 1826 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
1740 | 1827 | ||
1828 | /* Size of section 0 is 0, so this works well if no unwind info. */ | ||
1829 | mod->unwind_info = unwind_add_table(mod, | ||
1830 | (void *)sechdrs[unwindex].sh_addr, | ||
1831 | sechdrs[unwindex].sh_size); | ||
1832 | |||
1741 | /* Get rid of temporary copy */ | 1833 | /* Get rid of temporary copy */ |
1742 | vfree(hdr); | 1834 | vfree(hdr); |
1743 | 1835 | ||
@@ -1836,6 +1928,7 @@ sys_init_module(void __user *umod, | |||
1836 | mod->state = MODULE_STATE_LIVE; | 1928 | mod->state = MODULE_STATE_LIVE; |
1837 | /* Drop initial reference. */ | 1929 | /* Drop initial reference. */ |
1838 | module_put(mod); | 1930 | module_put(mod); |
1931 | unwind_remove_table(mod->unwind_info, 1); | ||
1839 | module_free(mod, mod->module_init); | 1932 | module_free(mod, mod->module_init); |
1840 | mod->module_init = NULL; | 1933 | mod->module_init = NULL; |
1841 | mod->init_size = 0; | 1934 | mod->init_size = 0; |
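The __ksymtab_unused and __ksymtab_unused_gpl tables consumed above are filled by the EXPORT_UNUSED_SYMBOL()/EXPORT_UNUSED_SYMBOL_GPL() markers introduced in <linux/module.h> alongside this change (the exact Kconfig guard, CONFIG_UNUSED_SYMBOLS in later trees, is not shown here). A minimal, purely hypothetical exporter would look like the sketch below; any module that links against the symbol then triggers printk_unused_warning() at load time:

    #include <linux/module.h>

    /* Hypothetical legacy helper kept only for out-of-tree users. */
    int foo_legacy_op(int arg)
    {
            return arg * 2;
    }
    /* Lands in __ksymtab_unused instead of __ksymtab. */
    EXPORT_UNUSED_SYMBOL(foo_legacy_op);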
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index f4913c376950..e38e4bac97ca 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/poison.h> | ||
19 | #include <linux/spinlock.h> | 20 | #include <linux/spinlock.h> |
20 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
21 | #include <linux/interrupt.h> | 22 | #include <linux/interrupt.h> |
@@ -153,13 +154,13 @@ next: | |||
153 | continue; | 154 | continue; |
154 | count++; | 155 | count++; |
155 | cursor = curr->next; | 156 | cursor = curr->next; |
156 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 157 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
157 | 158 | ||
158 | printk("\n#%03d: ", count); | 159 | printk("\n#%03d: ", count); |
159 | printk_lock(lock, filter ? 0 : 1); | 160 | printk_lock(lock, filter ? 0 : 1); |
160 | goto next; | 161 | goto next; |
161 | } | 162 | } |
162 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 163 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
163 | printk("\n"); | 164 | printk("\n"); |
164 | } | 165 | } |
165 | 166 | ||
@@ -316,7 +317,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) | |||
316 | continue; | 317 | continue; |
317 | list_del_init(curr); | 318 | list_del_init(curr); |
318 | DEBUG_OFF(); | 319 | DEBUG_OFF(); |
319 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 320 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
320 | 321 | ||
321 | printk("BUG: %s/%d, lock held at task exit time!\n", | 322 | printk("BUG: %s/%d, lock held at task exit time!\n", |
322 | task->comm, task->pid); | 323 | task->comm, task->pid); |
@@ -325,7 +326,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) | |||
325 | printk("exiting task is not even the owner??\n"); | 326 | printk("exiting task is not even the owner??\n"); |
326 | return; | 327 | return; |
327 | } | 328 | } |
328 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 329 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
329 | } | 330 | } |
330 | 331 | ||
331 | /* | 332 | /* |
@@ -352,7 +353,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | |||
352 | continue; | 353 | continue; |
353 | list_del_init(curr); | 354 | list_del_init(curr); |
354 | DEBUG_OFF(); | 355 | DEBUG_OFF(); |
355 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 356 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
356 | 357 | ||
357 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", | 358 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", |
358 | current->comm, current->pid, lock, from, to); | 359 | current->comm, current->pid, lock, from, to); |
@@ -362,7 +363,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | |||
362 | printk("freeing task is not even the owner??\n"); | 363 | printk("freeing task is not even the owner??\n"); |
363 | return; | 364 | return; |
364 | } | 365 | } |
365 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 366 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
366 | } | 367 | } |
367 | 368 | ||
368 | /* | 369 | /* |
@@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock, | |||
381 | 382 | ||
382 | void debug_mutex_init_waiter(struct mutex_waiter *waiter) | 383 | void debug_mutex_init_waiter(struct mutex_waiter *waiter) |
383 | { | 384 | { |
384 | memset(waiter, 0x11, sizeof(*waiter)); | 385 | memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); |
385 | waiter->magic = waiter; | 386 | waiter->magic = waiter; |
386 | INIT_LIST_HEAD(&waiter->list); | 387 | INIT_LIST_HEAD(&waiter->list); |
387 | } | 388 | } |
@@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) | |||
397 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) | 398 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) |
398 | { | 399 | { |
399 | DEBUG_WARN_ON(!list_empty(&waiter->list)); | 400 | DEBUG_WARN_ON(!list_empty(&waiter->list)); |
400 | memset(waiter, 0x22, sizeof(*waiter)); | 401 | memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); |
401 | } | 402 | } |
402 | 403 | ||
403 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 404 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index fd384050acb1..a5196c36a5fd 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h | |||
@@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | |||
46 | extern void debug_mutex_unlock(struct mutex *lock); | 46 | extern void debug_mutex_unlock(struct mutex *lock); |
47 | extern void debug_mutex_init(struct mutex *lock, const char *name); | 47 | extern void debug_mutex_init(struct mutex *lock, const char *name); |
48 | 48 | ||
49 | #define debug_spin_lock(lock) \ | ||
50 | do { \ | ||
51 | local_irq_disable(); \ | ||
52 | if (debug_mutex_on) \ | ||
53 | spin_lock(lock); \ | ||
54 | } while (0) | ||
55 | |||
56 | #define debug_spin_unlock(lock) \ | ||
57 | do { \ | ||
58 | if (debug_mutex_on) \ | ||
59 | spin_unlock(lock); \ | ||
60 | local_irq_enable(); \ | ||
61 | preempt_check_resched(); \ | ||
62 | } while (0) | ||
63 | |||
64 | #define debug_spin_lock_save(lock, flags) \ | 49 | #define debug_spin_lock_save(lock, flags) \ |
65 | do { \ | 50 | do { \ |
66 | local_irq_save(flags); \ | 51 | local_irq_save(flags); \ |
@@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); | |||
68 | spin_lock(lock); \ | 53 | spin_lock(lock); \ |
69 | } while (0) | 54 | } while (0) |
70 | 55 | ||
71 | #define debug_spin_lock_restore(lock, flags) \ | 56 | #define debug_spin_unlock_restore(lock, flags) \ |
72 | do { \ | 57 | do { \ |
73 | if (debug_mutex_on) \ | 58 | if (debug_mutex_on) \ |
74 | spin_unlock(lock); \ | 59 | spin_unlock(lock); \ |
@@ -76,20 +61,20 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); | |||
76 | preempt_check_resched(); \ | 61 | preempt_check_resched(); \ |
77 | } while (0) | 62 | } while (0) |
78 | 63 | ||
79 | #define spin_lock_mutex(lock) \ | 64 | #define spin_lock_mutex(lock, flags) \ |
80 | do { \ | 65 | do { \ |
81 | struct mutex *l = container_of(lock, struct mutex, wait_lock); \ | 66 | struct mutex *l = container_of(lock, struct mutex, wait_lock); \ |
82 | \ | 67 | \ |
83 | DEBUG_WARN_ON(in_interrupt()); \ | 68 | DEBUG_WARN_ON(in_interrupt()); \ |
84 | debug_spin_lock(&debug_mutex_lock); \ | 69 | debug_spin_lock_save(&debug_mutex_lock, flags); \ |
85 | spin_lock(lock); \ | 70 | spin_lock(lock); \ |
86 | DEBUG_WARN_ON(l->magic != l); \ | 71 | DEBUG_WARN_ON(l->magic != l); \ |
87 | } while (0) | 72 | } while (0) |
88 | 73 | ||
89 | #define spin_unlock_mutex(lock) \ | 74 | #define spin_unlock_mutex(lock, flags) \ |
90 | do { \ | 75 | do { \ |
91 | spin_unlock(lock); \ | 76 | spin_unlock(lock); \ |
92 | debug_spin_unlock(&debug_mutex_lock); \ | 77 | debug_spin_unlock_restore(&debug_mutex_lock, flags); \ |
93 | } while (0) | 78 | } while (0) |
94 | 79 | ||
95 | #define DEBUG_OFF() \ | 80 | #define DEBUG_OFF() \ |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 5449b210d9ed..7043db21bbce 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
125 | struct task_struct *task = current; | 125 | struct task_struct *task = current; |
126 | struct mutex_waiter waiter; | 126 | struct mutex_waiter waiter; |
127 | unsigned int old_val; | 127 | unsigned int old_val; |
128 | unsigned long flags; | ||
128 | 129 | ||
129 | debug_mutex_init_waiter(&waiter); | 130 | debug_mutex_init_waiter(&waiter); |
130 | 131 | ||
131 | spin_lock_mutex(&lock->wait_lock); | 132 | spin_lock_mutex(&lock->wait_lock, flags); |
132 | 133 | ||
133 | debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); | 134 | debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); |
134 | 135 | ||
@@ -157,7 +158,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
157 | if (unlikely(state == TASK_INTERRUPTIBLE && | 158 | if (unlikely(state == TASK_INTERRUPTIBLE && |
158 | signal_pending(task))) { | 159 | signal_pending(task))) { |
159 | mutex_remove_waiter(lock, &waiter, task->thread_info); | 160 | mutex_remove_waiter(lock, &waiter, task->thread_info); |
160 | spin_unlock_mutex(&lock->wait_lock); | 161 | spin_unlock_mutex(&lock->wait_lock, flags); |
161 | 162 | ||
162 | debug_mutex_free_waiter(&waiter); | 163 | debug_mutex_free_waiter(&waiter); |
163 | return -EINTR; | 164 | return -EINTR; |
@@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
165 | __set_task_state(task, state); | 166 | __set_task_state(task, state); |
166 | 167 | ||
167 | /* didn't get the lock, go to sleep: */ | 168 |
168 | spin_unlock_mutex(&lock->wait_lock); | 169 | spin_unlock_mutex(&lock->wait_lock, flags); |
169 | schedule(); | 170 | schedule(); |
170 | spin_lock_mutex(&lock->wait_lock); | 171 | spin_lock_mutex(&lock->wait_lock, flags); |
171 | } | 172 | } |
172 | 173 | ||
173 | /* got the lock - rejoice! */ | 174 | /* got the lock - rejoice! */ |
@@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
178 | if (likely(list_empty(&lock->wait_list))) | 179 | if (likely(list_empty(&lock->wait_list))) |
179 | atomic_set(&lock->count, 0); | 180 | atomic_set(&lock->count, 0); |
180 | 181 | ||
181 | spin_unlock_mutex(&lock->wait_lock); | 182 | spin_unlock_mutex(&lock->wait_lock, flags); |
182 | 183 | ||
183 | debug_mutex_free_waiter(&waiter); | 184 | debug_mutex_free_waiter(&waiter); |
184 | 185 | ||
@@ -203,10 +204,11 @@ static fastcall noinline void | |||
203 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | 204 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) |
204 | { | 205 | { |
205 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 206 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
207 | unsigned long flags; | ||
206 | 208 | ||
207 | DEBUG_WARN_ON(lock->owner != current_thread_info()); | 209 | DEBUG_WARN_ON(lock->owner != current_thread_info()); |
208 | 210 | ||
209 | spin_lock_mutex(&lock->wait_lock); | 211 | spin_lock_mutex(&lock->wait_lock, flags); |
210 | 212 | ||
211 | /* | 213 | /* |
212 | * some architectures leave the lock unlocked in the fastpath failure | 214 | * some architectures leave the lock unlocked in the fastpath failure |
@@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | |||
231 | 233 | ||
232 | debug_mutex_clear_owner(lock); | 234 | debug_mutex_clear_owner(lock); |
233 | 235 | ||
234 | spin_unlock_mutex(&lock->wait_lock); | 236 | spin_unlock_mutex(&lock->wait_lock, flags); |
235 | } | 237 | } |
236 | 238 | ||
237 | /* | 239 | /* |
@@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) | |||
276 | static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | 278 | static inline int __mutex_trylock_slowpath(atomic_t *lock_count) |
277 | { | 279 | { |
278 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 280 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
281 | unsigned long flags; | ||
279 | int prev; | 282 | int prev; |
280 | 283 | ||
281 | spin_lock_mutex(&lock->wait_lock); | 284 | spin_lock_mutex(&lock->wait_lock, flags); |
282 | 285 | ||
283 | prev = atomic_xchg(&lock->count, -1); | 286 | prev = atomic_xchg(&lock->count, -1); |
284 | if (likely(prev == 1)) | 287 | if (likely(prev == 1)) |
@@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | |||
287 | if (likely(list_empty(&lock->wait_list))) | 290 | if (likely(list_empty(&lock->wait_list))) |
288 | atomic_set(&lock->count, 0); | 291 | atomic_set(&lock->count, 0); |
289 | 292 | ||
290 | spin_unlock_mutex(&lock->wait_lock); | 293 | spin_unlock_mutex(&lock->wait_lock, flags); |
291 | 294 | ||
292 | return prev == 1; | 295 | return prev == 1; |
293 | } | 296 | } |
diff --git a/kernel/mutex.h b/kernel/mutex.h index 00fe84e7b672..069189947257 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h | |||
@@ -9,8 +9,10 @@ | |||
9 | * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: | 9 | * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #define spin_lock_mutex(lock) spin_lock(lock) | 12 | #define spin_lock_mutex(lock, flags) \ |
13 | #define spin_unlock_mutex(lock) spin_unlock(lock) | 13 | do { spin_lock(lock); (void)(flags); } while (0) |
14 | #define spin_unlock_mutex(lock, flags) \ | ||
15 | do { spin_unlock(lock); (void)(flags); } while (0) | ||
14 | #define mutex_remove_waiter(lock, waiter, ti) \ | 16 | #define mutex_remove_waiter(lock, waiter, ti) \ |
15 | __list_del((waiter)->list.prev, (waiter)->list.next) | 17 | __list_del((waiter)->list.prev, (waiter)->list.next) |
16 | 18 | ||
diff --git a/kernel/panic.c b/kernel/panic.c index cc2a4c9c36ac..ab13f0f668b5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -8,7 +8,6 @@ | |||
8 | * This function is used through-out the kernel (including mm and fs) | 8 | * This function is used through-out the kernel (including mm and fs) |
9 | * to indicate a major problem. | 9 | * to indicate a major problem. |
10 | */ | 10 | */ |
11 | #include <linux/config.h> | ||
12 | #include <linux/module.h> | 11 | #include <linux/module.h> |
13 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
14 | #include <linux/delay.h> | 13 | #include <linux/delay.h> |
diff --git a/kernel/params.c b/kernel/params.c index af43ecdc8d9b..91aea7aa532e 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -15,7 +15,6 @@ | |||
15 | along with this program; if not, write to the Free Software | 15 | along with this program; if not, write to the Free Software |
16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
17 | */ | 17 | */ |
18 | #include <linux/config.h> | ||
19 | #include <linux/moduleparam.h> | 18 | #include <linux/moduleparam.h> |
20 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
21 | #include <linux/string.h> | 20 | #include <linux/string.h> |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ce0dfb8f4a4e..ae44a70aae8a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -36,6 +36,24 @@ config PM_DEBUG | |||
36 | code. This is helpful when debugging and reporting various PM bugs, | 36 | code. This is helpful when debugging and reporting various PM bugs, |
37 | like suspend support. | 37 | like suspend support. |
38 | 38 | ||
39 | config PM_TRACE | ||
40 | bool "Suspend/resume event tracing" | ||
41 | depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL | ||
42 | default n | ||
43 | ---help--- | ||
44 | This enables some cheesy code to save the last PM event point in the | ||
45 | RTC across reboots, so that you can debug a machine that just hangs | ||
46 | during suspend (or more commonly, during resume). | ||
47 | |||
48 | To use this debugging feature you should attempt to suspend the machine, | ||
49 | then reboot it, then run | ||
50 | |||
51 | dmesg -s 1000000 | grep 'hash matches' | ||
52 | |||
53 | CAUTION: this option will cause your machine's real-time clock to be | ||
54 | set to an invalid time after a resume. | ||
55 | |||
56 | |||
39 | config SOFTWARE_SUSPEND | 57 | config SOFTWARE_SUSPEND |
40 | bool "Software Suspend" | 58 | bool "Software Suspend" |
41 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) | 59 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) |
@@ -82,18 +100,6 @@ config PM_STD_PARTITION | |||
82 | suspended image to. It will simply pick the first available swap | 100 | suspended image to. It will simply pick the first available swap |
83 | device. | 101 | device. |
84 | 102 | ||
85 | config SWSUSP_ENCRYPT | ||
86 | bool "Encrypt suspend image" | ||
87 | depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y) | ||
88 | default "" | ||
89 | ---help--- | ||
90 | To prevent data gathering from swap after resume you can encrypt | ||
91 | the suspend image with a temporary key that is deleted on | ||
92 | resume. | ||
93 | |||
94 | Note that the temporary key is stored unencrypted on disk while the | ||
95 | system is suspended. | ||
96 | |||
97 | config SUSPEND_SMP | 103 | config SUSPEND_SMP |
98 | bool | 104 | bool |
99 | depends on HOTPLUG_CPU && X86 && PM | 105 | depends on HOTPLUG_CPU && X86 && PM |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 81d4d982f3f0..e13e74067845 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -231,7 +231,7 @@ static int software_resume(void) | |||
231 | late_initcall(software_resume); | 231 | late_initcall(software_resume); |
232 | 232 | ||
233 | 233 | ||
234 | static char * pm_disk_modes[] = { | 234 | static const char * const pm_disk_modes[] = { |
235 | [PM_DISK_FIRMWARE] = "firmware", | 235 | [PM_DISK_FIRMWARE] = "firmware", |
236 | [PM_DISK_PLATFORM] = "platform", | 236 | [PM_DISK_PLATFORM] = "platform", |
237 | [PM_DISK_SHUTDOWN] = "shutdown", | 237 | [PM_DISK_SHUTDOWN] = "shutdown", |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 0a907f0dc56b..6d295c776794 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -15,7 +15,7 @@ | |||
15 | #include <linux/errno.h> | 15 | #include <linux/errno.h> |
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
17 | #include <linux/pm.h> | 17 | #include <linux/pm.h> |
18 | 18 | #include <linux/console.h> | |
19 | 19 | ||
20 | #include "power.h" | 20 | #include "power.h" |
21 | 21 | ||
@@ -145,7 +145,7 @@ static void suspend_finish(suspend_state_t state) | |||
145 | 145 | ||
146 | 146 | ||
147 | 147 | ||
148 | static char *pm_states[PM_SUSPEND_MAX] = { | 148 | static const char * const pm_states[PM_SUSPEND_MAX] = { |
149 | [PM_SUSPEND_STANDBY] = "standby", | 149 | [PM_SUSPEND_STANDBY] = "standby", |
150 | [PM_SUSPEND_MEM] = "mem", | 150 | [PM_SUSPEND_MEM] = "mem", |
151 | #ifdef CONFIG_SOFTWARE_SUSPEND | 151 | #ifdef CONFIG_SOFTWARE_SUSPEND |
@@ -262,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) | |||
262 | static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) | 262 | static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) |
263 | { | 263 | { |
264 | suspend_state_t state = PM_SUSPEND_STANDBY; | 264 | suspend_state_t state = PM_SUSPEND_STANDBY; |
265 | char ** s; | 265 | const char * const *s; |
266 | char *p; | 266 | char *p; |
267 | int error; | 267 | int error; |
268 | int len; | 268 | int len; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index f06f12f21767..57a792982fb9 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -55,7 +55,7 @@ struct snapshot_handle { | |||
55 | unsigned int page; | 55 | unsigned int page; |
56 | unsigned int page_offset; | 56 | unsigned int page_offset; |
57 | unsigned int prev; | 57 | unsigned int prev; |
58 | struct pbe *pbe; | 58 | struct pbe *pbe, *last_pbe; |
59 | void *buffer; | 59 | void *buffer; |
60 | unsigned int buf_offset; | 60 | unsigned int buf_offset; |
61 | }; | 61 | }; |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3eeedbb13b78..24c96f354231 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -150,6 +150,10 @@ int restore_highmem(void) | |||
150 | } | 150 | } |
151 | return 0; | 151 | return 0; |
152 | } | 152 | } |
153 | #else | ||
154 | static inline unsigned int count_highmem_pages(void) {return 0;} | ||
155 | static inline int save_highmem(void) {return 0;} | ||
156 | static inline int restore_highmem(void) {return 0;} | ||
153 | #endif | 157 | #endif |
154 | 158 | ||
155 | static int pfn_is_nosave(unsigned long pfn) | 159 | static int pfn_is_nosave(unsigned long pfn) |
@@ -293,62 +297,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) | |||
293 | } | 297 | } |
294 | } | 298 | } |
295 | 299 | ||
296 | /** | 300 | static unsigned int unsafe_pages; |
297 | * On resume it is necessary to trace and eventually free the unsafe | ||
298 | * pages that have been allocated, because they are needed for I/O | ||
299 | * (on x86-64 we likely will "eat" these pages once again while | ||
300 | * creating the temporary page translation tables) | ||
301 | */ | ||
302 | |||
303 | struct eaten_page { | ||
304 | struct eaten_page *next; | ||
305 | char padding[PAGE_SIZE - sizeof(void *)]; | ||
306 | }; | ||
307 | |||
308 | static struct eaten_page *eaten_pages = NULL; | ||
309 | |||
310 | static void release_eaten_pages(void) | ||
311 | { | ||
312 | struct eaten_page *p, *q; | ||
313 | |||
314 | p = eaten_pages; | ||
315 | while (p) { | ||
316 | q = p->next; | ||
317 | /* We don't want swsusp_free() to free this page again */ | ||
318 | ClearPageNosave(virt_to_page(p)); | ||
319 | free_page((unsigned long)p); | ||
320 | p = q; | ||
321 | } | ||
322 | eaten_pages = NULL; | ||
323 | } | ||
324 | 301 | ||
325 | /** | 302 | /** |
326 | * @safe_needed - on resume, for storing the PBE list and the image, | 303 | * @safe_needed - on resume, for storing the PBE list and the image, |
327 | * we can only use memory pages that do not conflict with the pages | 304 | * we can only use memory pages that do not conflict with the pages |
328 | * which had been used before suspend. | 305 | * used before suspend. |
329 | * | 306 | * |
330 | * The unsafe pages are marked with the PG_nosave_free flag | 307 | * The unsafe pages are marked with the PG_nosave_free flag |
331 | * | 308 | * and we count them using unsafe_pages |
332 | * Allocated but unusable (ie eaten) memory pages should be marked | ||
333 | * so that swsusp_free() can release them | ||
334 | */ | 309 | */ |
335 | 310 | ||
336 | static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | 311 | static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) |
337 | { | 312 | { |
338 | void *res; | 313 | void *res; |
339 | 314 | ||
315 | res = (void *)get_zeroed_page(gfp_mask); | ||
340 | if (safe_needed) | 316 | if (safe_needed) |
341 | do { | 317 | while (res && PageNosaveFree(virt_to_page(res))) { |
318 | /* The page is unsafe, mark it for swsusp_free() */ | ||
319 | SetPageNosave(virt_to_page(res)); | ||
320 | unsafe_pages++; | ||
342 | res = (void *)get_zeroed_page(gfp_mask); | 321 | res = (void *)get_zeroed_page(gfp_mask); |
343 | if (res && PageNosaveFree(virt_to_page(res))) { | 322 | } |
344 | /* This is for swsusp_free() */ | ||
345 | SetPageNosave(virt_to_page(res)); | ||
346 | ((struct eaten_page *)res)->next = eaten_pages; | ||
347 | eaten_pages = res; | ||
348 | } | ||
349 | } while (res && PageNosaveFree(virt_to_page(res))); | ||
350 | else | ||
351 | res = (void *)get_zeroed_page(gfp_mask); | ||
352 | if (res) { | 323 | if (res) { |
353 | SetPageNosave(virt_to_page(res)); | 324 | SetPageNosave(virt_to_page(res)); |
354 | SetPageNosaveFree(virt_to_page(res)); | 325 | SetPageNosaveFree(virt_to_page(res)); |
@@ -374,7 +345,8 @@ unsigned long get_safe_page(gfp_t gfp_mask) | |||
374 | * On each page we set up a list of struct_pbe elements. | 345 | * On each page we set up a list of struct_pbe elements. |
375 | */ | 346 | */ |
376 | 347 | ||
377 | struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) | 348 | static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, |
349 | int safe_needed) | ||
378 | { | 350 | { |
379 | unsigned int num; | 351 | unsigned int num; |
380 | struct pbe *pblist, *pbe; | 352 | struct pbe *pblist, *pbe; |
@@ -642,6 +614,8 @@ static int mark_unsafe_pages(struct pbe *pblist) | |||
642 | return -EFAULT; | 614 | return -EFAULT; |
643 | } | 615 | } |
644 | 616 | ||
617 | unsafe_pages = 0; | ||
618 | |||
645 | return 0; | 619 | return 0; |
646 | } | 620 | } |
647 | 621 | ||
@@ -719,42 +693,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf, | |||
719 | } | 693 | } |
720 | 694 | ||
721 | /** | 695 | /** |
722 | * create_image - use metadata contained in the PBE list | 696 | * prepare_image - use metadata contained in the PBE list |
723 | * pointed to by pagedir_nosave to mark the pages that will | 697 | * pointed to by pagedir_nosave to mark the pages that will |
724 | * be overwritten in the process of restoring the system | 698 | * be overwritten in the process of restoring the system |
725 | * memory state from the image and allocate memory for | 699 | * memory state from the image ("unsafe" pages) and allocate |
726 | * the image avoiding these pages | 700 | * memory for the image |
701 | * | ||
702 | * The idea is to allocate the PBE list first and then | ||
703 | * allocate as many pages as are needed for the image data, | ||
704 | * but not to assign these pages to the PBEs initially. | ||
705 | * Instead, we just mark them as allocated and create a list | ||
706 | * of "safe" pages which will be used later | ||
727 | */ | 707 | */ |
728 | 708 | ||
729 | static int create_image(struct snapshot_handle *handle) | 709 | struct safe_page { |
710 | struct safe_page *next; | ||
711 | char padding[PAGE_SIZE - sizeof(void *)]; | ||
712 | }; | ||
713 | |||
714 | static struct safe_page *safe_pages; | ||
715 | |||
716 | static int prepare_image(struct snapshot_handle *handle) | ||
730 | { | 717 | { |
731 | int error = 0; | 718 | int error = 0; |
732 | struct pbe *p, *pblist; | 719 | unsigned int nr_pages = nr_copy_pages; |
720 | struct pbe *p, *pblist = NULL; | ||
733 | 721 | ||
734 | p = pagedir_nosave; | 722 | p = pagedir_nosave; |
735 | error = mark_unsafe_pages(p); | 723 | error = mark_unsafe_pages(p); |
736 | if (!error) { | 724 | if (!error) { |
737 | pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); | 725 | pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); |
738 | if (pblist) | 726 | if (pblist) |
739 | copy_page_backup_list(pblist, p); | 727 | copy_page_backup_list(pblist, p); |
740 | free_pagedir(p, 0); | 728 | free_pagedir(p, 0); |
741 | if (!pblist) | 729 | if (!pblist) |
742 | error = -ENOMEM; | 730 | error = -ENOMEM; |
743 | } | 731 | } |
744 | if (!error) | 732 | safe_pages = NULL; |
745 | error = alloc_data_pages(pblist, GFP_ATOMIC, 1); | 733 | if (!error && nr_pages > unsafe_pages) { |
734 | nr_pages -= unsafe_pages; | ||
735 | while (nr_pages--) { | ||
736 | struct safe_page *ptr; | ||
737 | |||
738 | ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); | ||
739 | if (!ptr) { | ||
740 | error = -ENOMEM; | ||
741 | break; | ||
742 | } | ||
743 | if (!PageNosaveFree(virt_to_page(ptr))) { | ||
744 | /* The page is "safe", add it to the list */ | ||
745 | ptr->next = safe_pages; | ||
746 | safe_pages = ptr; | ||
747 | } | ||
748 | /* Mark the page as allocated */ | ||
749 | SetPageNosave(virt_to_page(ptr)); | ||
750 | SetPageNosaveFree(virt_to_page(ptr)); | ||
751 | } | ||
752 | } | ||
746 | if (!error) { | 753 | if (!error) { |
747 | release_eaten_pages(); | ||
748 | pagedir_nosave = pblist; | 754 | pagedir_nosave = pblist; |
749 | } else { | 755 | } else { |
750 | pagedir_nosave = NULL; | ||
751 | handle->pbe = NULL; | 756 | handle->pbe = NULL; |
752 | nr_copy_pages = 0; | 757 | swsusp_free(); |
753 | nr_meta_pages = 0; | ||
754 | } | 758 | } |
755 | return error; | 759 | return error; |
756 | } | 760 | } |
757 | 761 | ||
762 | static void *get_buffer(struct snapshot_handle *handle) | ||
763 | { | ||
764 | struct pbe *pbe = handle->pbe, *last = handle->last_pbe; | ||
765 | struct page *page = virt_to_page(pbe->orig_address); | ||
766 | |||
767 | if (PageNosave(page) && PageNosaveFree(page)) { | ||
768 | /* | ||
769 | * We have allocated the "original" page frame and we can | ||
770 | * use it directly to store the read page | ||
771 | */ | ||
772 | pbe->address = 0; | ||
773 | if (last && last->next) | ||
774 | last->next = NULL; | ||
775 | return (void *)pbe->orig_address; | ||
776 | } | ||
777 | /* | ||
778 | * The "original" page frame has not been allocated and we have to | ||
779 | * use a "safe" page frame to store the read page | ||
780 | */ | ||
781 | pbe->address = (unsigned long)safe_pages; | ||
782 | safe_pages = safe_pages->next; | ||
783 | if (last) | ||
784 | last->next = pbe; | ||
785 | handle->last_pbe = pbe; | ||
786 | return (void *)pbe->address; | ||
787 | } | ||
788 | |||
758 | /** | 789 | /** |
759 | * snapshot_write_next - used for writing the system memory snapshot. | 790 | * snapshot_write_next - used for writing the system memory snapshot. |
760 | * | 791 | * |
@@ -799,15 +830,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) | |||
799 | } else if (handle->prev <= nr_meta_pages) { | 830 | } else if (handle->prev <= nr_meta_pages) { |
800 | handle->pbe = unpack_orig_addresses(buffer, handle->pbe); | 831 | handle->pbe = unpack_orig_addresses(buffer, handle->pbe); |
801 | if (!handle->pbe) { | 832 | if (!handle->pbe) { |
802 | error = create_image(handle); | 833 | error = prepare_image(handle); |
803 | if (error) | 834 | if (error) |
804 | return error; | 835 | return error; |
805 | handle->pbe = pagedir_nosave; | 836 | handle->pbe = pagedir_nosave; |
806 | handle->buffer = (void *)handle->pbe->address; | 837 | handle->last_pbe = NULL; |
838 | handle->buffer = get_buffer(handle); | ||
807 | } | 839 | } |
808 | } else { | 840 | } else { |
809 | handle->pbe = handle->pbe->next; | 841 | handle->pbe = handle->pbe->next; |
810 | handle->buffer = (void *)handle->pbe->address; | 842 | handle->buffer = get_buffer(handle); |
811 | } | 843 | } |
812 | handle->prev = handle->page; | 844 | handle->prev = handle->page; |
813 | } | 845 | } |
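The last_pbe chaining in get_buffer() above keeps only the PBEs whose data had to be read into a spare "safe" page; pages whose original frame was free are restored in place and dropped from the chain (pbe->address = 0). Conceptually, the final relocation pass, which happens later in arch-specific resume code, walks that chain and copies each page back to its original frame. The sketch below is illustrative only and the function name is hypothetical:

    #include <linux/suspend.h>
    #include <asm/page.h>

    /* Move each page from its temporary "safe" frame back to the frame it
     * occupied before suspend. */
    static void relocate_image(struct pbe *list)
    {
            struct pbe *p;

            for (p = list; p; p = p->next)
                    copy_page((void *)p->orig_address, (void *)p->address);
    }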
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c4016cbbd3e0..17f669c83012 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void); | |||
67 | int save_highmem(void); | 67 | int save_highmem(void); |
68 | int restore_highmem(void); | 68 | int restore_highmem(void); |
69 | #else | 69 | #else |
70 | static int save_highmem(void) { return 0; } | 70 | static inline int save_highmem(void) { return 0; } |
71 | static int restore_highmem(void) { return 0; } | 71 | static inline int restore_highmem(void) { return 0; } |
72 | static unsigned int count_highmem_pages(void) { return 0; } | 72 | static inline unsigned int count_highmem_pages(void) { return 0; } |
73 | #endif | 73 | #endif |
74 | 74 | ||
75 | /** | 75 | /** |
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) | |||
175 | */ | 175 | */ |
176 | 176 | ||
177 | #define SHRINK_BITE 10000 | 177 | #define SHRINK_BITE 10000 |
178 | static inline unsigned long __shrink_memory(long tmp) | ||
179 | { | ||
180 | if (tmp > SHRINK_BITE) | ||
181 | tmp = SHRINK_BITE; | ||
182 | return shrink_all_memory(tmp); | ||
183 | } | ||
178 | 184 | ||
179 | int swsusp_shrink_memory(void) | 185 | int swsusp_shrink_memory(void) |
180 | { | 186 | { |
@@ -192,15 +198,17 @@ int swsusp_shrink_memory(void) | |||
192 | PAGES_FOR_IO; | 198 | PAGES_FOR_IO; |
193 | tmp = size; | 199 | tmp = size; |
194 | for_each_zone (zone) | 200 | for_each_zone (zone) |
195 | if (!is_highmem(zone)) | 201 | if (!is_highmem(zone) && populated_zone(zone)) { |
196 | tmp -= zone->free_pages; | 202 | tmp -= zone->free_pages; |
203 | tmp += zone->lowmem_reserve[ZONE_NORMAL]; | ||
204 | } | ||
197 | if (tmp > 0) { | 205 | if (tmp > 0) { |
198 | tmp = shrink_all_memory(SHRINK_BITE); | 206 | tmp = __shrink_memory(tmp); |
199 | if (!tmp) | 207 | if (!tmp) |
200 | return -ENOMEM; | 208 | return -ENOMEM; |
201 | pages += tmp; | 209 | pages += tmp; |
202 | } else if (size > image_size / PAGE_SIZE) { | 210 | } else if (size > image_size / PAGE_SIZE) { |
203 | tmp = shrink_all_memory(SHRINK_BITE); | 211 | tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); |
204 | pages += tmp; | 212 | pages += tmp; |
205 | } | 213 | } |
206 | printk("\b%c", p[i++%4]); | 214 | printk("\b%c", p[i++%4]); |
diff --git a/kernel/printk.c b/kernel/printk.c index 416b8f3fb265..9772b9e8feee 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -24,8 +24,8 @@ | |||
24 | #include <linux/console.h> | 24 | #include <linux/console.h> |
25 | #include <linux/init.h> | 25 | #include <linux/init.h> |
26 | #include <linux/module.h> | 26 | #include <linux/module.h> |
27 | #include <linux/moduleparam.h> | ||
27 | #include <linux/interrupt.h> /* For in_interrupt() */ | 28 | #include <linux/interrupt.h> /* For in_interrupt() */ |
28 | #include <linux/config.h> | ||
29 | #include <linux/delay.h> | 29 | #include <linux/delay.h> |
30 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
@@ -327,7 +327,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end) | |||
327 | struct console *con; | 327 | struct console *con; |
328 | 328 | ||
329 | for (con = console_drivers; con; con = con->next) { | 329 | for (con = console_drivers; con; con = con->next) { |
330 | if ((con->flags & CON_ENABLED) && con->write) | 330 | if ((con->flags & CON_ENABLED) && con->write && |
331 | (cpu_online(smp_processor_id()) || | ||
332 | (con->flags & CON_ANYTIME))) | ||
331 | con->write(con, &LOG_BUF(start), end - start); | 333 | con->write(con, &LOG_BUF(start), end - start); |
332 | } | 334 | } |
333 | } | 335 | } |
@@ -437,6 +439,7 @@ static int printk_time = 1; | |||
437 | #else | 439 | #else |
438 | static int printk_time = 0; | 440 | static int printk_time = 0; |
439 | #endif | 441 | #endif |
442 | module_param(printk_time, int, S_IRUGO | S_IWUSR); | ||
440 | 443 | ||
441 | static int __init printk_time_setup(char *str) | 444 | static int __init printk_time_setup(char *str) |
442 | { | 445 | { |
@@ -453,6 +456,18 @@ __attribute__((weak)) unsigned long long printk_clock(void) | |||
453 | return sched_clock(); | 456 | return sched_clock(); |
454 | } | 457 | } |
455 | 458 | ||
459 | /* Check if we have any console registered that can be called early in boot. */ | ||
460 | static int have_callable_console(void) | ||
461 | { | ||
462 | struct console *con; | ||
463 | |||
464 | for (con = console_drivers; con; con = con->next) | ||
465 | if (con->flags & CON_ANYTIME) | ||
466 | return 1; | ||
467 | |||
468 | return 0; | ||
469 | } | ||
470 | |||
456 | /** | 471 | /** |
457 | * printk - print a kernel message | 472 | * printk - print a kernel message |
458 | * @fmt: format string | 473 | * @fmt: format string |
@@ -566,27 +581,29 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
566 | log_level_unknown = 1; | 581 | log_level_unknown = 1; |
567 | } | 582 | } |
568 | 583 | ||
569 | if (!cpu_online(smp_processor_id())) { | 584 | if (!down_trylock(&console_sem)) { |
570 | /* | 585 | /* |
571 | * Some console drivers may assume that per-cpu resources have | 586 | * We own the drivers. We can drop the spinlock and |
572 | * been allocated. So don't allow them to be called by this | 587 | * let release_console_sem() print the text, maybe ... |
573 | * CPU until it is officially up. We shouldn't be calling into | ||
574 | * random console drivers on a CPU which doesn't exist yet.. | ||
575 | */ | 588 | */ |
589 | console_locked = 1; | ||
576 | printk_cpu = UINT_MAX; | 590 | printk_cpu = UINT_MAX; |
577 | spin_unlock_irqrestore(&logbuf_lock, flags); | 591 | spin_unlock_irqrestore(&logbuf_lock, flags); |
578 | goto out; | 592 | |
579 | } | ||
580 | if (!down_trylock(&console_sem)) { | ||
581 | console_locked = 1; | ||
582 | /* | 593 | /* |
583 | * We own the drivers. We can drop the spinlock and let | 594 | * Console drivers may assume that per-cpu resources have |
584 | * release_console_sem() print the text | 595 | * been allocated. So unless they're explicitly marked as |
596 | * being able to cope (CON_ANYTIME) don't call them until | ||
597 | * this CPU is officially up. | ||
585 | */ | 598 | */ |
586 | printk_cpu = UINT_MAX; | 599 | if (cpu_online(smp_processor_id()) || have_callable_console()) { |
587 | spin_unlock_irqrestore(&logbuf_lock, flags); | 600 | console_may_schedule = 0; |
588 | console_may_schedule = 0; | 601 | release_console_sem(); |
589 | release_console_sem(); | 602 | } else { |
603 | /* Release by hand to avoid flushing the buffer. */ | ||
604 | console_locked = 0; | ||
605 | up(&console_sem); | ||
606 | } | ||
590 | } else { | 607 | } else { |
591 | /* | 608 | /* |
592 | * Someone else owns the drivers. We drop the spinlock, which | 609 | * Someone else owns the drivers. We drop the spinlock, which |
@@ -596,7 +613,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
596 | printk_cpu = UINT_MAX; | 613 | printk_cpu = UINT_MAX; |
597 | spin_unlock_irqrestore(&logbuf_lock, flags); | 614 | spin_unlock_irqrestore(&logbuf_lock, flags); |
598 | } | 615 | } |
599 | out: | 616 | |
600 | preempt_enable(); | 617 | preempt_enable(); |
601 | return printed_len; | 618 | return printed_len; |
602 | } | 619 | } |
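The have_callable_console()/CON_ANYTIME logic above only helps drivers that opt in: a console whose write routine is safe to call before its CPU is marked online sets the flag when it registers. The driver below is hypothetical (name and write routine are placeholders), but the struct console fields and the flag are the interface vprintk() now checks:

    #include <linux/console.h>

    /* Hypothetical polled console; the write routine must not depend on
     * per-cpu state, which is what CON_ANYTIME promises. */
    static void demo_early_write(struct console *con, const char *s, unsigned n)
    {
            /* ... spin on a UART status register and emit n bytes of s ... */
    }

    static struct console demo_early_console = {
            .name   = "demoearly",
            .write  = demo_early_write,
            .flags  = CON_PRINTBUFFER | CON_ANYTIME,
            .index  = -1,
    };

Registering it with register_console(&demo_early_console) makes have_callable_console() return 1, so messages printed while a CPU is still coming up are flushed instead of being left in the log buffer.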
diff --git a/kernel/profile.c b/kernel/profile.c index 68afe121e507..d5bd75e7501c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -13,7 +13,6 @@ | |||
13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 | 13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/config.h> | ||
17 | #include <linux/module.h> | 16 | #include <linux/module.h> |
18 | #include <linux/profile.h> | 17 | #include <linux/profile.h> |
19 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
@@ -299,7 +298,7 @@ out: | |||
299 | } | 298 | } |
300 | 299 | ||
301 | #ifdef CONFIG_HOTPLUG_CPU | 300 | #ifdef CONFIG_HOTPLUG_CPU |
302 | static int profile_cpu_callback(struct notifier_block *info, | 301 | static int __devinit profile_cpu_callback(struct notifier_block *info, |
303 | unsigned long action, void *__cpu) | 302 | unsigned long action, void *__cpu) |
304 | { | 303 | { |
305 | int node, cpu = (unsigned long)__cpu; | 304 | int node, cpu = (unsigned long)__cpu; |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 921c22ad16e4..335c5b932e14 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
120 | 120 | ||
121 | static int may_attach(struct task_struct *task) | 121 | static int may_attach(struct task_struct *task) |
122 | { | 122 | { |
123 | if (!task->mm) | 123 | /* May we inspect the given task? |
124 | return -EPERM; | 124 | * This check is used both for attaching with ptrace |
125 | * and for allowing access to sensitive information in /proc. | ||
126 | * | ||
127 | * ptrace_attach denies several cases that /proc allows | ||
128 | * because setting up the necessary parent/child relationship | ||
129 | * or halting the specified task is impossible. | ||
130 | */ | ||
131 | int dumpable = 0; | ||
132 | /* Don't let security modules deny introspection */ | ||
133 | if (task == current) | ||
134 | return 0; | ||
125 | if (((current->uid != task->euid) || | 135 | if (((current->uid != task->euid) || |
126 | (current->uid != task->suid) || | 136 | (current->uid != task->suid) || |
127 | (current->uid != task->uid) || | 137 | (current->uid != task->uid) || |
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task) | |||
130 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) | 140 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) |
131 | return -EPERM; | 141 | return -EPERM; |
132 | smp_rmb(); | 142 | smp_rmb(); |
133 | if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) | 143 | if (task->mm) |
144 | dumpable = task->mm->dumpable; | ||
145 | if (!dumpable && !capable(CAP_SYS_PTRACE)) | ||
134 | return -EPERM; | 146 | return -EPERM; |
135 | 147 | ||
136 | return security_ptrace(current, task); | 148 | return security_ptrace(current, task); |
@@ -176,6 +188,8 @@ repeat: | |||
176 | goto repeat; | 188 | goto repeat; |
177 | } | 189 | } |
178 | 190 | ||
191 | if (!task->mm) | ||
192 | goto bad; | ||
179 | /* the same process cannot be attached many times */ | 193 | /* the same process cannot be attached many times */ |
180 | if (task->ptrace & PT_PTRACED) | 194 | if (task->ptrace & PT_PTRACED) |
181 | goto bad; | 195 | goto bad; |
@@ -200,7 +214,7 @@ out: | |||
200 | return retval; | 214 | return retval; |
201 | } | 215 | } |
202 | 216 | ||
203 | void __ptrace_detach(struct task_struct *child, unsigned int data) | 217 | static inline void __ptrace_detach(struct task_struct *child, unsigned int data) |
204 | { | 218 | { |
205 | child->exit_code = data; | 219 | child->exit_code = data; |
206 | /* .. re-parent .. */ | 220 | /* .. re-parent .. */ |
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
219 | ptrace_disable(child); | 233 | ptrace_disable(child); |
220 | 234 | ||
221 | write_lock_irq(&tasklist_lock); | 235 | write_lock_irq(&tasklist_lock); |
236 | /* protect against de_thread()->release_task() */ | ||
222 | if (child->ptrace) | 237 | if (child->ptrace) |
223 | __ptrace_detach(child, data); | 238 | __ptrace_detach(child, data); |
224 | write_unlock_irq(&tasklist_lock); | 239 | write_unlock_irq(&tasklist_lock); |
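For reference, the relaxed dumpable check above is what an unprivileged tracer ultimately runs into: PTRACE_ATTACH on a task whose mm is marked non-dumpable (for example after a setuid exec) still fails with EPERM unless the tracer has CAP_SYS_PTRACE. A minimal userspace probe, purely illustrative:

    #include <stdio.h>
    #include <errno.h>
    #include <string.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    /* Attach, wait for the stop, detach again. */
    static int try_attach(pid_t pid)
    {
            if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
                    /* An EPERM here is the may_attach() check firing. */
                    fprintf(stderr, "attach to %d failed: %s\n",
                            (int)pid, strerror(errno));
                    return -1;
            }
            waitpid(pid, NULL, 0);
            ptrace(PTRACE_DETACH, pid, NULL, NULL);
            return 0;
    }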
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2058f88c7bbb..f464f5ae3f11 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -182,6 +182,15 @@ long rcu_batches_completed(void) | |||
182 | return rcu_ctrlblk.completed; | 182 | return rcu_ctrlblk.completed; |
183 | } | 183 | } |
184 | 184 | ||
185 | /* | ||
186 | * Return the number of RCU batches processed thus far. Useful | ||
187 | * for debug and statistics. | ||
188 | */ | ||
189 | long rcu_batches_completed_bh(void) | ||
190 | { | ||
191 | return rcu_bh_ctrlblk.completed; | ||
192 | } | ||
193 | |||
185 | static void rcu_barrier_callback(struct rcu_head *notused) | 194 | static void rcu_barrier_callback(struct rcu_head *notused) |
186 | { | 195 | { |
187 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 196 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) | |||
539 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | 548 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); |
540 | } | 549 | } |
541 | 550 | ||
542 | static int rcu_cpu_notify(struct notifier_block *self, | 551 | static int __devinit rcu_cpu_notify(struct notifier_block *self, |
543 | unsigned long action, void *hcpu) | 552 | unsigned long action, void *hcpu) |
544 | { | 553 | { |
545 | long cpu = (long)hcpu; | 554 | long cpu = (long)hcpu; |
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
556 | return NOTIFY_OK; | 565 | return NOTIFY_OK; |
557 | } | 566 | } |
558 | 567 | ||
559 | static struct notifier_block rcu_nb = { | 568 | static struct notifier_block __devinitdata rcu_nb = { |
560 | .notifier_call = rcu_cpu_notify, | 569 | .notifier_call = rcu_cpu_notify, |
561 | }; | 570 | }; |
562 | 571 | ||
@@ -612,14 +621,6 @@ void synchronize_rcu(void) | |||
612 | wait_for_completion(&rcu.completion); | 621 | wait_for_completion(&rcu.completion); |
613 | } | 622 | } |
614 | 623 | ||
615 | /* | ||
616 | * Deprecated, use synchronize_rcu() or synchronize_sched() instead. | ||
617 | */ | ||
618 | void synchronize_kernel(void) | ||
619 | { | ||
620 | synchronize_rcu(); | ||
621 | } | ||
622 | |||
623 | module_param(blimit, int, 0); | 624 | module_param(blimit, int, 0); |
624 | module_param(qhimark, int, 0); | 625 | module_param(qhimark, int, 0); |
625 | module_param(qlowmark, int, 0); | 626 | module_param(qlowmark, int, 0); |
@@ -627,7 +628,7 @@ module_param(qlowmark, int, 0); | |||
627 | module_param(rsinterval, int, 0); | 628 | module_param(rsinterval, int, 0); |
628 | #endif | 629 | #endif |
629 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 630 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
630 | EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ | 631 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); |
631 | EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ | 632 | EXPORT_SYMBOL_GPL(call_rcu); |
633 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
632 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 634 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
633 | EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ | ||
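The newly exported rcu_batches_completed_bh() mirrors rcu_batches_completed() for the bottom-half flavour; the rcutorture changes below use it as the "completed" hook of the rcu_bh type. A minimal sketch of another consumer, assuming the matching declaration is visible via <linux/rcupdate.h>; the module and message text are invented for the example:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rcupdate.h>

/* Print both grace-period counters once at load time; handy when
 * comparing how quickly rcu vs. rcu_bh batches retire under load. */
static int __init rcu_counters_demo_init(void)
{
	printk(KERN_INFO "rcu: %ld batches, rcu_bh: %ld batches completed\n",
	       rcu_batches_completed(), rcu_batches_completed_bh());
	return 0;
}

static void __exit rcu_counters_demo_exit(void)
{
}

module_init(rcu_counters_demo_init);
module_exit(rcu_counters_demo_exit);
MODULE_LICENSE("GPL");	/* both counters are EXPORT_SYMBOL_GPL */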
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 8154e7589d12..4d1c3d247127 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Read-Copy Update /proc-based torture test facility | 2 | * Read-Copy Update module-based torture test facility |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ | |||
53 | static int verbose; /* Print more debug info. */ | 53 | static int verbose; /* Print more debug info. */ |
54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | 54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ |
55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ | 55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ |
56 | static char *torture_type = "rcu"; /* What to torture. */ | ||
56 | 57 | ||
57 | module_param(nreaders, int, 0); | 58 | module_param(nreaders, int, 0); |
58 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 59 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0); | |||
64 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | 65 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); |
65 | module_param(shuffle_interval, int, 0); | 66 | module_param(shuffle_interval, int, 0); |
66 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | 67 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); |
67 | #define TORTURE_FLAG "rcutorture: " | 68 | module_param(torture_type, charp, 0); |
69 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); | ||
70 | |||
71 | #define TORTURE_FLAG "-torture:" | ||
68 | #define PRINTK_STRING(s) \ | 72 | #define PRINTK_STRING(s) \ |
69 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 73 | do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
70 | #define VERBOSE_PRINTK_STRING(s) \ | 74 | #define VERBOSE_PRINTK_STRING(s) \ |
71 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 75 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
72 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 76 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
73 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | 77 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) |
74 | 78 | ||
75 | static char printk_buf[4096]; | 79 | static char printk_buf[4096]; |
76 | 80 | ||
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p) | |||
139 | spin_unlock_bh(&rcu_torture_lock); | 143 | spin_unlock_bh(&rcu_torture_lock); |
140 | } | 144 | } |
141 | 145 | ||
142 | static void | ||
143 | rcu_torture_cb(struct rcu_head *p) | ||
144 | { | ||
145 | int i; | ||
146 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
147 | |||
148 | if (fullstop) { | ||
149 | /* Test is ending, just drop callbacks on the floor. */ | ||
150 | /* The next initialization will pick up the pieces. */ | ||
151 | return; | ||
152 | } | ||
153 | i = rp->rtort_pipe_count; | ||
154 | if (i > RCU_TORTURE_PIPE_LEN) | ||
155 | i = RCU_TORTURE_PIPE_LEN; | ||
156 | atomic_inc(&rcu_torture_wcount[i]); | ||
157 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
158 | rp->rtort_mbtest = 0; | ||
159 | rcu_torture_free(rp); | ||
160 | } else | ||
161 | call_rcu(p, rcu_torture_cb); | ||
162 | } | ||
163 | |||
164 | struct rcu_random_state { | 146 | struct rcu_random_state { |
165 | unsigned long rrs_state; | 147 | unsigned long rrs_state; |
166 | unsigned long rrs_count; | 148 | unsigned long rrs_count; |
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp) | |||
191 | } | 173 | } |
192 | 174 | ||
193 | /* | 175 | /* |
176 | * Operations vector for selecting different types of tests. | ||
177 | */ | ||
178 | |||
179 | struct rcu_torture_ops { | ||
180 | void (*init)(void); | ||
181 | void (*cleanup)(void); | ||
182 | int (*readlock)(void); | ||
183 | void (*readunlock)(int idx); | ||
184 | int (*completed)(void); | ||
185 | void (*deferredfree)(struct rcu_torture *p); | ||
186 | int (*stats)(char *page); | ||
187 | char *name; | ||
188 | }; | ||
189 | static struct rcu_torture_ops *cur_ops = NULL; | ||
190 | |||
191 | /* | ||
192 | * Definitions for rcu torture testing. | ||
193 | */ | ||
194 | |||
195 | static int rcu_torture_read_lock(void) | ||
196 | { | ||
197 | rcu_read_lock(); | ||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void rcu_torture_read_unlock(int idx) | ||
202 | { | ||
203 | rcu_read_unlock(); | ||
204 | } | ||
205 | |||
206 | static int rcu_torture_completed(void) | ||
207 | { | ||
208 | return rcu_batches_completed(); | ||
209 | } | ||
210 | |||
211 | static void | ||
212 | rcu_torture_cb(struct rcu_head *p) | ||
213 | { | ||
214 | int i; | ||
215 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
216 | |||
217 | if (fullstop) { | ||
218 | /* Test is ending, just drop callbacks on the floor. */ | ||
219 | /* The next initialization will pick up the pieces. */ | ||
220 | return; | ||
221 | } | ||
222 | i = rp->rtort_pipe_count; | ||
223 | if (i > RCU_TORTURE_PIPE_LEN) | ||
224 | i = RCU_TORTURE_PIPE_LEN; | ||
225 | atomic_inc(&rcu_torture_wcount[i]); | ||
226 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
227 | rp->rtort_mbtest = 0; | ||
228 | rcu_torture_free(rp); | ||
229 | } else | ||
230 | cur_ops->deferredfree(rp); | ||
231 | } | ||
232 | |||
233 | static void rcu_torture_deferred_free(struct rcu_torture *p) | ||
234 | { | ||
235 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | ||
236 | } | ||
237 | |||
238 | static struct rcu_torture_ops rcu_ops = { | ||
239 | .init = NULL, | ||
240 | .cleanup = NULL, | ||
241 | .readlock = rcu_torture_read_lock, | ||
242 | .readunlock = rcu_torture_read_unlock, | ||
243 | .completed = rcu_torture_completed, | ||
244 | .deferredfree = rcu_torture_deferred_free, | ||
245 | .stats = NULL, | ||
246 | .name = "rcu" | ||
247 | }; | ||
248 | |||
249 | /* | ||
250 | * Definitions for rcu_bh torture testing. | ||
251 | */ | ||
252 | |||
253 | static int rcu_bh_torture_read_lock(void) | ||
254 | { | ||
255 | rcu_read_lock_bh(); | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static void rcu_bh_torture_read_unlock(int idx) | ||
260 | { | ||
261 | rcu_read_unlock_bh(); | ||
262 | } | ||
263 | |||
264 | static int rcu_bh_torture_completed(void) | ||
265 | { | ||
266 | return rcu_batches_completed_bh(); | ||
267 | } | ||
268 | |||
269 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | ||
270 | { | ||
271 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | ||
272 | } | ||
273 | |||
274 | static struct rcu_torture_ops rcu_bh_ops = { | ||
275 | .init = NULL, | ||
276 | .cleanup = NULL, | ||
277 | .readlock = rcu_bh_torture_read_lock, | ||
278 | .readunlock = rcu_bh_torture_read_unlock, | ||
279 | .completed = rcu_bh_torture_completed, | ||
280 | .deferredfree = rcu_bh_torture_deferred_free, | ||
281 | .stats = NULL, | ||
282 | .name = "rcu_bh" | ||
283 | }; | ||
284 | |||
285 | static struct rcu_torture_ops *torture_ops[] = | ||
286 | { &rcu_ops, &rcu_bh_ops, NULL }; | ||
287 | |||
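With the operations vector above, adding another torture flavour only means filling in one more rcu_torture_ops and listing it in torture_ops[]. A hedged sketch, not part of this patch: the "rcu_sync" name and the idea of freeing synchronously via synchronize_rcu() are invented for illustration, reusing the read-side hooks defined above:

static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
{
	/* Only the writer kthread (process context) reaches this for a
	 * synchronous flavour, so blocking here is fine. */
	synchronize_rcu();
	rcu_torture_free(p);
}

static struct rcu_torture_ops rcu_sync_ops = {
	.init = NULL,
	.cleanup = NULL,
	.readlock = rcu_torture_read_lock,
	.readunlock = rcu_torture_read_unlock,
	.completed = rcu_torture_completed,
	.deferredfree = rcu_sync_torture_deferred_free,
	.stats = NULL,
	.name = "rcu_sync"
};

/* ...and the selection table would then read:
 *   { &rcu_ops, &rcu_bh_ops, &rcu_sync_ops, NULL };
 */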
288 | /* | ||
194 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 289 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
195 | * for that pointed to by rcu_torture_current, freeing the old structure | 290 | * for that pointed to by rcu_torture_current, freeing the old structure |
196 | * after a series of grace periods (the "pipeline"). | 291 | * after a series of grace periods (the "pipeline"). |
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg) | |||
209 | 304 | ||
210 | do { | 305 | do { |
211 | schedule_timeout_uninterruptible(1); | 306 | schedule_timeout_uninterruptible(1); |
212 | if (rcu_batches_completed() == oldbatch) | ||
213 | continue; | ||
214 | if ((rp = rcu_torture_alloc()) == NULL) | 307 | if ((rp = rcu_torture_alloc()) == NULL) |
215 | continue; | 308 | continue; |
216 | rp->rtort_pipe_count = 0; | 309 | rp->rtort_pipe_count = 0; |
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg) | |||
225 | i = RCU_TORTURE_PIPE_LEN; | 318 | i = RCU_TORTURE_PIPE_LEN; |
226 | atomic_inc(&rcu_torture_wcount[i]); | 319 | atomic_inc(&rcu_torture_wcount[i]); |
227 | old_rp->rtort_pipe_count++; | 320 | old_rp->rtort_pipe_count++; |
228 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | 321 | cur_ops->deferredfree(old_rp); |
229 | } | 322 | } |
230 | rcu_torture_current_version++; | 323 | rcu_torture_current_version++; |
231 | oldbatch = rcu_batches_completed(); | 324 | oldbatch = cur_ops->completed(); |
232 | } while (!kthread_should_stop() && !fullstop); | 325 | } while (!kthread_should_stop() && !fullstop); |
233 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 326 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
234 | while (!kthread_should_stop()) | 327 | while (!kthread_should_stop()) |
@@ -246,6 +339,7 @@ static int | |||
246 | rcu_torture_reader(void *arg) | 339 | rcu_torture_reader(void *arg) |
247 | { | 340 | { |
248 | int completed; | 341 | int completed; |
342 | int idx; | ||
249 | DEFINE_RCU_RANDOM(rand); | 343 | DEFINE_RCU_RANDOM(rand); |
250 | struct rcu_torture *p; | 344 | struct rcu_torture *p; |
251 | int pipe_count; | 345 | int pipe_count; |
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg) | |||
254 | set_user_nice(current, 19); | 348 | set_user_nice(current, 19); |
255 | 349 | ||
256 | do { | 350 | do { |
257 | rcu_read_lock(); | 351 | idx = cur_ops->readlock(); |
258 | completed = rcu_batches_completed(); | 352 | completed = cur_ops->completed(); |
259 | p = rcu_dereference(rcu_torture_current); | 353 | p = rcu_dereference(rcu_torture_current); |
260 | if (p == NULL) { | 354 | if (p == NULL) { |
261 | /* Wait for rcu_torture_writer to get underway */ | 355 | /* Wait for rcu_torture_writer to get underway */ |
262 | rcu_read_unlock(); | 356 | cur_ops->readunlock(idx); |
263 | schedule_timeout_interruptible(HZ); | 357 | schedule_timeout_interruptible(HZ); |
264 | continue; | 358 | continue; |
265 | } | 359 | } |
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg) | |||
273 | pipe_count = RCU_TORTURE_PIPE_LEN; | 367 | pipe_count = RCU_TORTURE_PIPE_LEN; |
274 | } | 368 | } |
275 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | 369 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; |
276 | completed = rcu_batches_completed() - completed; | 370 | completed = cur_ops->completed() - completed; |
277 | if (completed > RCU_TORTURE_PIPE_LEN) { | 371 | if (completed > RCU_TORTURE_PIPE_LEN) { |
278 | /* Should not happen, but... */ | 372 | /* Should not happen, but... */ |
279 | completed = RCU_TORTURE_PIPE_LEN; | 373 | completed = RCU_TORTURE_PIPE_LEN; |
280 | } | 374 | } |
281 | ++__get_cpu_var(rcu_torture_batch)[completed]; | 375 | ++__get_cpu_var(rcu_torture_batch)[completed]; |
282 | preempt_enable(); | 376 | preempt_enable(); |
283 | rcu_read_unlock(); | 377 | cur_ops->readunlock(idx); |
284 | schedule(); | 378 | schedule(); |
285 | } while (!kthread_should_stop() && !fullstop); | 379 | } while (!kthread_should_stop() && !fullstop); |
286 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | 380 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); |
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page) | |||
311 | if (pipesummary[i] != 0) | 405 | if (pipesummary[i] != 0) |
312 | break; | 406 | break; |
313 | } | 407 | } |
314 | cnt += sprintf(&page[cnt], "rcutorture: "); | 408 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
315 | cnt += sprintf(&page[cnt], | 409 | cnt += sprintf(&page[cnt], |
316 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 410 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
317 | "rtmbe: %d", | 411 | "rtmbe: %d", |
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page) | |||
324 | atomic_read(&n_rcu_torture_mberror)); | 418 | atomic_read(&n_rcu_torture_mberror)); |
325 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 419 | if (atomic_read(&n_rcu_torture_mberror) != 0) |
326 | cnt += sprintf(&page[cnt], " !!!"); | 420 | cnt += sprintf(&page[cnt], " !!!"); |
327 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 421 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
328 | if (i > 1) { | 422 | if (i > 1) { |
329 | cnt += sprintf(&page[cnt], "!!! "); | 423 | cnt += sprintf(&page[cnt], "!!! "); |
330 | atomic_inc(&n_rcu_torture_error); | 424 | atomic_inc(&n_rcu_torture_error); |
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page) | |||
332 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | 426 | cnt += sprintf(&page[cnt], "Reader Pipe: "); |
333 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 427 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
334 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | 428 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); |
335 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 429 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
336 | cnt += sprintf(&page[cnt], "Reader Batch: "); | 430 | cnt += sprintf(&page[cnt], "Reader Batch: "); |
337 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | 431 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
338 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | 432 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); |
339 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 433 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
340 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | 434 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); |
341 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 435 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
342 | cnt += sprintf(&page[cnt], " %d", | 436 | cnt += sprintf(&page[cnt], " %d", |
343 | atomic_read(&rcu_torture_wcount[i])); | 437 | atomic_read(&rcu_torture_wcount[i])); |
344 | } | 438 | } |
345 | cnt += sprintf(&page[cnt], "\n"); | 439 | cnt += sprintf(&page[cnt], "\n"); |
440 | if (cur_ops->stats != NULL) | ||
441 | cnt += cur_ops->stats(&page[cnt]); | ||
346 | return cnt; | 442 | return cnt; |
347 | } | 443 | } |
348 | 444 | ||
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg) | |||
444 | static inline void | 540 | static inline void |
445 | rcu_torture_print_module_parms(char *tag) | 541 | rcu_torture_print_module_parms(char *tag) |
446 | { | 542 | { |
447 | printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " | 543 | printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " |
448 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 544 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
449 | "shuffle_interval = %d\n", | 545 | "shuffle_interval = %d\n", |
450 | tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, | 546 | torture_type, tag, nrealreaders, stat_interval, verbose, |
451 | shuffle_interval); | 547 | test_no_idle_hz, shuffle_interval); |
452 | } | 548 | } |
453 | 549 | ||
454 | static void | 550 | static void |
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void) | |||
493 | rcu_barrier(); | 589 | rcu_barrier(); |
494 | 590 | ||
495 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 591 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
592 | |||
593 | if (cur_ops->cleanup != NULL) | ||
594 | cur_ops->cleanup(); | ||
496 | if (atomic_read(&n_rcu_torture_error)) | 595 | if (atomic_read(&n_rcu_torture_error)) |
497 | rcu_torture_print_module_parms("End of test: FAILURE"); | 596 | rcu_torture_print_module_parms("End of test: FAILURE"); |
498 | else | 597 | else |
@@ -508,6 +607,20 @@ rcu_torture_init(void) | |||
508 | 607 | ||
509 | /* Process args and tell the world that the torturer is on the job. */ | 608 | /* Process args and tell the world that the torturer is on the job. */ |
510 | 609 | ||
610 | for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { | ||
611 | cur_ops = torture_ops[i]; | ||
612 | if (strcmp(torture_type, cur_ops->name) == 0) { | ||
613 | break; | ||
614 | } | ||
615 | } | ||
616 | if (cur_ops == NULL) { | ||
617 | printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", | ||
618 | torture_type); | ||
619 | return (-EINVAL); | ||
620 | } | ||
621 | if (cur_ops->init != NULL) | ||
622 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | ||
623 | |||
511 | if (nreaders >= 0) | 624 | if (nreaders >= 0) |
512 | nrealreaders = nreaders; | 625 | nrealreaders = nreaders; |
513 | else | 626 | else |
diff --git a/kernel/resource.c b/kernel/resource.c index e3080fcc66a3..129cf046e561 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -7,7 +7,6 @@ | |||
7 | * Arbitrary resource management. | 7 | * Arbitrary resource management. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/config.h> | ||
11 | #include <linux/module.h> | 10 | #include <linux/module.h> |
12 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
13 | #include <linux/errno.h> | 12 | #include <linux/errno.h> |
@@ -23,20 +22,18 @@ | |||
23 | 22 | ||
24 | struct resource ioport_resource = { | 23 | struct resource ioport_resource = { |
25 | .name = "PCI IO", | 24 | .name = "PCI IO", |
26 | .start = 0x0000, | 25 | .start = 0, |
27 | .end = IO_SPACE_LIMIT, | 26 | .end = IO_SPACE_LIMIT, |
28 | .flags = IORESOURCE_IO, | 27 | .flags = IORESOURCE_IO, |
29 | }; | 28 | }; |
30 | |||
31 | EXPORT_SYMBOL(ioport_resource); | 29 | EXPORT_SYMBOL(ioport_resource); |
32 | 30 | ||
33 | struct resource iomem_resource = { | 31 | struct resource iomem_resource = { |
34 | .name = "PCI mem", | 32 | .name = "PCI mem", |
35 | .start = 0UL, | 33 | .start = 0, |
36 | .end = ~0UL, | 34 | .end = -1, |
37 | .flags = IORESOURCE_MEM, | 35 | .flags = IORESOURCE_MEM, |
38 | }; | 36 | }; |
39 | |||
40 | EXPORT_SYMBOL(iomem_resource); | 37 | EXPORT_SYMBOL(iomem_resource); |
41 | 38 | ||
42 | static DEFINE_RWLOCK(resource_lock); | 39 | static DEFINE_RWLOCK(resource_lock); |
@@ -83,10 +80,10 @@ static int r_show(struct seq_file *m, void *v) | |||
83 | for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) | 80 | for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) |
84 | if (p->parent == root) | 81 | if (p->parent == root) |
85 | break; | 82 | break; |
86 | seq_printf(m, "%*s%0*lx-%0*lx : %s\n", | 83 | seq_printf(m, "%*s%0*llx-%0*llx : %s\n", |
87 | depth * 2, "", | 84 | depth * 2, "", |
88 | width, r->start, | 85 | width, (unsigned long long) r->start, |
89 | width, r->end, | 86 | width, (unsigned long long) r->end, |
90 | r->name ? r->name : "<BAD>"); | 87 | r->name ? r->name : "<BAD>"); |
91 | return 0; | 88 | return 0; |
92 | } | 89 | } |
@@ -151,8 +148,8 @@ __initcall(ioresources_init); | |||
151 | /* Return the conflict entry if you can't request it */ | 148 | /* Return the conflict entry if you can't request it */ |
152 | static struct resource * __request_resource(struct resource *root, struct resource *new) | 149 | static struct resource * __request_resource(struct resource *root, struct resource *new) |
153 | { | 150 | { |
154 | unsigned long start = new->start; | 151 | resource_size_t start = new->start; |
155 | unsigned long end = new->end; | 152 | resource_size_t end = new->end; |
156 | struct resource *tmp, **p; | 153 | struct resource *tmp, **p; |
157 | 154 | ||
158 | if (end < start) | 155 | if (end < start) |
@@ -232,15 +229,52 @@ int release_resource(struct resource *old) | |||
232 | 229 | ||
233 | EXPORT_SYMBOL(release_resource); | 230 | EXPORT_SYMBOL(release_resource); |
234 | 231 | ||
232 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
233 | /* | ||
234 | * Finds the lowest memory resource that exists within [res->start, res->end). | ||
235 | * The caller must specify res->start, res->end and res->flags. | ||
236 | * If found, returns 0 and overwrites *res; if not found, returns -1. | ||
237 | */ | ||
238 | int find_next_system_ram(struct resource *res) | ||
239 | { | ||
240 | resource_size_t start, end; | ||
241 | struct resource *p; | ||
242 | |||
243 | BUG_ON(!res); | ||
244 | |||
245 | start = res->start; | ||
246 | end = res->end; | ||
247 | |||
248 | read_lock(&resource_lock); | ||
249 | for (p = iomem_resource.child; p ; p = p->sibling) { | ||
250 | /* system ram is just marked as IORESOURCE_MEM */ | ||
251 | if (p->flags != res->flags) | ||
252 | continue; | ||
253 | if (p->start > end) { | ||
254 | p = NULL; | ||
255 | break; | ||
256 | } | ||
257 | if (p->start >= start) | ||
258 | break; | ||
259 | } | ||
260 | read_unlock(&resource_lock); | ||
261 | if (!p) | ||
262 | return -1; | ||
263 | /* copy data */ | ||
264 | res->start = p->start; | ||
265 | res->end = p->end; | ||
266 | return 0; | ||
267 | } | ||
268 | #endif | ||
269 | |||
235 | /* | 270 | /* |
236 | * Find empty slot in the resource tree given range and alignment. | 271 | * Find empty slot in the resource tree given range and alignment. |
237 | */ | 272 | */ |
238 | static int find_resource(struct resource *root, struct resource *new, | 273 | static int find_resource(struct resource *root, struct resource *new, |
239 | unsigned long size, | 274 | resource_size_t size, resource_size_t min, |
240 | unsigned long min, unsigned long max, | 275 | resource_size_t max, resource_size_t align, |
241 | unsigned long align, | ||
242 | void (*alignf)(void *, struct resource *, | 276 | void (*alignf)(void *, struct resource *, |
243 | unsigned long, unsigned long), | 277 | resource_size_t, resource_size_t), |
244 | void *alignf_data) | 278 | void *alignf_data) |
245 | { | 279 | { |
246 | struct resource *this = root->child; | 280 | struct resource *this = root->child; |
@@ -282,11 +316,10 @@ static int find_resource(struct resource *root, struct resource *new, | |||
282 | * Allocate empty slot in the resource tree given range and alignment. | 316 | * Allocate empty slot in the resource tree given range and alignment. |
283 | */ | 317 | */ |
284 | int allocate_resource(struct resource *root, struct resource *new, | 318 | int allocate_resource(struct resource *root, struct resource *new, |
285 | unsigned long size, | 319 | resource_size_t size, resource_size_t min, |
286 | unsigned long min, unsigned long max, | 320 | resource_size_t max, resource_size_t align, |
287 | unsigned long align, | ||
288 | void (*alignf)(void *, struct resource *, | 321 | void (*alignf)(void *, struct resource *, |
289 | unsigned long, unsigned long), | 322 | resource_size_t, resource_size_t), |
290 | void *alignf_data) | 323 | void *alignf_data) |
291 | { | 324 | { |
292 | int err; | 325 | int err; |
@@ -378,10 +411,10 @@ EXPORT_SYMBOL(insert_resource); | |||
378 | * arguments. Returns -EBUSY if it can't fit. Existing children of | 411 | * arguments. Returns -EBUSY if it can't fit. Existing children of |
379 | * the resource are assumed to be immutable. | 412 | * the resource are assumed to be immutable. |
380 | */ | 413 | */ |
381 | int adjust_resource(struct resource *res, unsigned long start, unsigned long size) | 414 | int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) |
382 | { | 415 | { |
383 | struct resource *tmp, *parent = res->parent; | 416 | struct resource *tmp, *parent = res->parent; |
384 | unsigned long end = start + size - 1; | 417 | resource_size_t end = start + size - 1; |
385 | int result = -EBUSY; | 418 | int result = -EBUSY; |
386 | 419 | ||
387 | write_lock(&resource_lock); | 420 | write_lock(&resource_lock); |
@@ -428,7 +461,9 @@ EXPORT_SYMBOL(adjust_resource); | |||
428 | * | 461 | * |
429 | * Release-region releases a matching busy region. | 462 | * Release-region releases a matching busy region. |
430 | */ | 463 | */ |
431 | struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) | 464 | struct resource * __request_region(struct resource *parent, |
465 | resource_size_t start, resource_size_t n, | ||
466 | const char *name) | ||
432 | { | 467 | { |
433 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); | 468 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); |
434 | 469 | ||
@@ -464,7 +499,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start, | |||
464 | 499 | ||
465 | EXPORT_SYMBOL(__request_region); | 500 | EXPORT_SYMBOL(__request_region); |
466 | 501 | ||
467 | int __check_region(struct resource *parent, unsigned long start, unsigned long n) | 502 | int __check_region(struct resource *parent, resource_size_t start, |
503 | resource_size_t n) | ||
468 | { | 504 | { |
469 | struct resource * res; | 505 | struct resource * res; |
470 | 506 | ||
@@ -479,10 +515,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n | |||
479 | 515 | ||
480 | EXPORT_SYMBOL(__check_region); | 516 | EXPORT_SYMBOL(__check_region); |
481 | 517 | ||
482 | void __release_region(struct resource *parent, unsigned long start, unsigned long n) | 518 | void __release_region(struct resource *parent, resource_size_t start, |
519 | resource_size_t n) | ||
483 | { | 520 | { |
484 | struct resource **p; | 521 | struct resource **p; |
485 | unsigned long end; | 522 | resource_size_t end; |
486 | 523 | ||
487 | p = &parent->child; | 524 | p = &parent->child; |
488 | end = start + n - 1; | 525 | end = start + n - 1; |
@@ -511,7 +548,9 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon | |||
511 | 548 | ||
512 | write_unlock(&resource_lock); | 549 | write_unlock(&resource_lock); |
513 | 550 | ||
514 | printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); | 551 | printk(KERN_WARNING "Trying to free nonexistent resource " |
552 | "<%016llx-%016llx>\n", (unsigned long long)start, | ||
553 | (unsigned long long)end); | ||
515 | } | 554 | } |
516 | 555 | ||
517 | EXPORT_SYMBOL(__release_region); | 556 | EXPORT_SYMBOL(__release_region); |
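The find_next_system_ram() helper added above gives memory hotplug a way to walk plain IORESOURCE_MEM ("System RAM") entries under iomem_resource. A hedged caller sketch, assuming the helper's prototype is visible via <linux/ioport.h>; walk_system_ram() and its callback are invented for the example:

#include <linux/kernel.h>
#include <linux/ioport.h>

/* Invoke func() once per chunk of system RAM overlapping [start, end]. */
static int walk_system_ram(resource_size_t start, resource_size_t end,
			   int (*func)(resource_size_t start, resource_size_t end))
{
	struct resource res;
	int ret;

	res.start = start;
	res.end = end;
	res.flags = IORESOURCE_MEM;	/* "system ram is just marked as IORESOURCE_MEM" */

	while (res.start <= end && find_next_system_ram(&res) == 0) {
		/* res now holds the next matching entry at or after res.start */
		ret = func(res.start, min(res.end, end));
		if (ret)
			return ret;
		res.start = res.end + 1;	/* continue past this chunk */
		res.end = end;
	}
	return 0;
}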
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c new file mode 100644 index 000000000000..4aa8a2c9f453 --- /dev/null +++ b/kernel/rtmutex-debug.c | |||
@@ -0,0 +1,513 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This code is based on the rt.c implementation in the preempt-rt tree. | ||
10 | * Portions of said code are | ||
11 | * | ||
12 | * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey | ||
13 | * Copyright (C) 2006 Esben Nielsen | ||
14 | * Copyright (C) 2006 Kihon Technologies Inc., | ||
15 | * Steven Rostedt <rostedt@goodmis.org> | ||
16 | * | ||
17 | * See rt.c in preempt-rt for proper credits and further information | ||
18 | */ | ||
19 | #include <linux/config.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/delay.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/spinlock.h> | ||
24 | #include <linux/kallsyms.h> | ||
25 | #include <linux/syscalls.h> | ||
26 | #include <linux/interrupt.h> | ||
27 | #include <linux/plist.h> | ||
28 | #include <linux/fs.h> | ||
29 | |||
30 | #include "rtmutex_common.h" | ||
31 | |||
32 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
33 | # include "rtmutex-debug.h" | ||
34 | #else | ||
35 | # include "rtmutex.h" | ||
36 | #endif | ||
37 | |||
38 | # define TRACE_WARN_ON(x) WARN_ON(x) | ||
39 | # define TRACE_BUG_ON(x) BUG_ON(x) | ||
40 | |||
41 | # define TRACE_OFF() \ | ||
42 | do { \ | ||
43 | if (rt_trace_on) { \ | ||
44 | rt_trace_on = 0; \ | ||
45 | console_verbose(); \ | ||
46 | if (spin_is_locked(¤t->pi_lock)) \ | ||
47 | spin_unlock(¤t->pi_lock); \ | ||
48 | if (spin_is_locked(¤t->held_list_lock)) \ | ||
49 | spin_unlock(¤t->held_list_lock); \ | ||
50 | } \ | ||
51 | } while (0) | ||
52 | |||
53 | # define TRACE_OFF_NOLOCK() \ | ||
54 | do { \ | ||
55 | if (rt_trace_on) { \ | ||
56 | rt_trace_on = 0; \ | ||
57 | console_verbose(); \ | ||
58 | } \ | ||
59 | } while (0) | ||
60 | |||
61 | # define TRACE_BUG_LOCKED() \ | ||
62 | do { \ | ||
63 | TRACE_OFF(); \ | ||
64 | BUG(); \ | ||
65 | } while (0) | ||
66 | |||
67 | # define TRACE_WARN_ON_LOCKED(c) \ | ||
68 | do { \ | ||
69 | if (unlikely(c)) { \ | ||
70 | TRACE_OFF(); \ | ||
71 | WARN_ON(1); \ | ||
72 | } \ | ||
73 | } while (0) | ||
74 | |||
75 | # define TRACE_BUG_ON_LOCKED(c) \ | ||
76 | do { \ | ||
77 | if (unlikely(c)) \ | ||
78 | TRACE_BUG_LOCKED(); \ | ||
79 | } while (0) | ||
80 | |||
81 | #ifdef CONFIG_SMP | ||
82 | # define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) | ||
83 | #else | ||
84 | # define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) | ||
85 | #endif | ||
86 | |||
87 | /* | ||
88 | * deadlock detection flag. We turn it off when we detect | ||
89 | * the first problem because we don't want to recurse back | ||
90 | * into the tracing code when doing error printk or | ||
91 | * executing a BUG(): | ||
92 | */ | ||
93 | int rt_trace_on = 1; | ||
94 | |||
95 | void deadlock_trace_off(void) | ||
96 | { | ||
97 | rt_trace_on = 0; | ||
98 | } | ||
99 | |||
100 | static void printk_task(task_t *p) | ||
101 | { | ||
102 | if (p) | ||
103 | printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
104 | else | ||
105 | printk("<none>"); | ||
106 | } | ||
107 | |||
108 | static void printk_task_short(task_t *p) | ||
109 | { | ||
110 | if (p) | ||
111 | printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
112 | else | ||
113 | printk("<none>"); | ||
114 | } | ||
115 | |||
116 | static void printk_lock(struct rt_mutex *lock, int print_owner) | ||
117 | { | ||
118 | if (lock->name) | ||
119 | printk(" [%p] {%s}\n", | ||
120 | lock, lock->name); | ||
121 | else | ||
122 | printk(" [%p] {%s:%d}\n", | ||
123 | lock, lock->file, lock->line); | ||
124 | |||
125 | if (print_owner && rt_mutex_owner(lock)) { | ||
126 | printk(".. ->owner: %p\n", lock->owner); | ||
127 | printk(".. held by: "); | ||
128 | printk_task(rt_mutex_owner(lock)); | ||
129 | printk("\n"); | ||
130 | } | ||
131 | if (rt_mutex_owner(lock)) { | ||
132 | printk("... acquired at: "); | ||
133 | print_symbol("%s\n", lock->acquire_ip); | ||
134 | } | ||
135 | } | ||
136 | |||
137 | static void printk_waiter(struct rt_mutex_waiter *w) | ||
138 | { | ||
139 | printk("-------------------------\n"); | ||
140 | printk("| waiter struct %p:\n", w); | ||
141 | printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", | ||
142 | w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next, | ||
143 | w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next, | ||
144 | w->list_entry.prio); | ||
145 | printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", | ||
146 | w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next, | ||
147 | w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next, | ||
148 | w->pi_list_entry.prio); | ||
149 | printk("\n| lock:\n"); | ||
150 | printk_lock(w->lock, 1); | ||
151 | printk("| w->ti->task:\n"); | ||
152 | printk_task(w->task); | ||
153 | printk("| blocked at: "); | ||
154 | print_symbol("%s\n", w->ip); | ||
155 | printk("-------------------------\n"); | ||
156 | } | ||
157 | |||
158 | static void show_task_locks(task_t *p) | ||
159 | { | ||
160 | switch (p->state) { | ||
161 | case TASK_RUNNING: printk("R"); break; | ||
162 | case TASK_INTERRUPTIBLE: printk("S"); break; | ||
163 | case TASK_UNINTERRUPTIBLE: printk("D"); break; | ||
164 | case TASK_STOPPED: printk("T"); break; | ||
165 | case EXIT_ZOMBIE: printk("Z"); break; | ||
166 | case EXIT_DEAD: printk("X"); break; | ||
167 | default: printk("?"); break; | ||
168 | } | ||
169 | printk_task(p); | ||
170 | if (p->pi_blocked_on) { | ||
171 | struct rt_mutex *lock = p->pi_blocked_on->lock; | ||
172 | |||
173 | printk(" blocked on:"); | ||
174 | printk_lock(lock, 1); | ||
175 | } else | ||
176 | printk(" (not blocked)\n"); | ||
177 | } | ||
178 | |||
179 | void rt_mutex_show_held_locks(task_t *task, int verbose) | ||
180 | { | ||
181 | struct list_head *curr, *cursor = NULL; | ||
182 | struct rt_mutex *lock; | ||
183 | task_t *t; | ||
184 | unsigned long flags; | ||
185 | int count = 0; | ||
186 | |||
187 | if (!rt_trace_on) | ||
188 | return; | ||
189 | |||
190 | if (verbose) { | ||
191 | printk("------------------------------\n"); | ||
192 | printk("| showing all locks held by: | ("); | ||
193 | printk_task_short(task); | ||
194 | printk("):\n"); | ||
195 | printk("------------------------------\n"); | ||
196 | } | ||
197 | |||
198 | next: | ||
199 | spin_lock_irqsave(&task->held_list_lock, flags); | ||
200 | list_for_each(curr, &task->held_list_head) { | ||
201 | if (cursor && curr != cursor) | ||
202 | continue; | ||
203 | lock = list_entry(curr, struct rt_mutex, held_list_entry); | ||
204 | t = rt_mutex_owner(lock); | ||
205 | WARN_ON(t != task); | ||
206 | count++; | ||
207 | cursor = curr->next; | ||
208 | spin_unlock_irqrestore(&task->held_list_lock, flags); | ||
209 | |||
210 | printk("\n#%03d: ", count); | ||
211 | printk_lock(lock, 0); | ||
212 | goto next; | ||
213 | } | ||
214 | spin_unlock_irqrestore(&task->held_list_lock, flags); | ||
215 | |||
216 | printk("\n"); | ||
217 | } | ||
218 | |||
219 | void rt_mutex_show_all_locks(void) | ||
220 | { | ||
221 | task_t *g, *p; | ||
222 | int count = 10; | ||
223 | int unlock = 1; | ||
224 | |||
225 | printk("\n"); | ||
226 | printk("----------------------\n"); | ||
227 | printk("| showing all tasks: |\n"); | ||
228 | printk("----------------------\n"); | ||
229 | |||
230 | /* | ||
231 | * Here we try to get the tasklist_lock as hard as possible, | ||
232 | * if not successful after 2 seconds we ignore it (but keep | ||
233 | * trying). This is to enable a debug printout even if a | ||
234 | * tasklist_lock-holding task deadlocks or crashes. | ||
235 | */ | ||
236 | retry: | ||
237 | if (!read_trylock(&tasklist_lock)) { | ||
238 | if (count == 10) | ||
239 | printk("hm, tasklist_lock locked, retrying... "); | ||
240 | if (count) { | ||
241 | count--; | ||
242 | printk(" #%d", 10-count); | ||
243 | mdelay(200); | ||
244 | goto retry; | ||
245 | } | ||
246 | printk(" ignoring it.\n"); | ||
247 | unlock = 0; | ||
248 | } | ||
249 | if (count != 10) | ||
250 | printk(" locked it.\n"); | ||
251 | |||
252 | do_each_thread(g, p) { | ||
253 | show_task_locks(p); | ||
254 | if (!unlock) | ||
255 | if (read_trylock(&tasklist_lock)) | ||
256 | unlock = 1; | ||
257 | } while_each_thread(g, p); | ||
258 | |||
259 | printk("\n"); | ||
260 | |||
261 | printk("-----------------------------------------\n"); | ||
262 | printk("| showing all locks held in the system: |\n"); | ||
263 | printk("-----------------------------------------\n"); | ||
264 | |||
265 | do_each_thread(g, p) { | ||
266 | rt_mutex_show_held_locks(p, 0); | ||
267 | if (!unlock) | ||
268 | if (read_trylock(&tasklist_lock)) | ||
269 | unlock = 1; | ||
270 | } while_each_thread(g, p); | ||
271 | |||
272 | |||
273 | printk("=============================================\n\n"); | ||
274 | |||
275 | if (unlock) | ||
276 | read_unlock(&tasklist_lock); | ||
277 | } | ||
278 | |||
279 | void rt_mutex_debug_check_no_locks_held(task_t *task) | ||
280 | { | ||
281 | struct rt_mutex_waiter *w; | ||
282 | struct list_head *curr; | ||
283 | struct rt_mutex *lock; | ||
284 | |||
285 | if (!rt_trace_on) | ||
286 | return; | ||
287 | if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) { | ||
288 | printk("BUG: PI priority boost leaked!\n"); | ||
289 | printk_task(task); | ||
290 | printk("\n"); | ||
291 | } | ||
292 | if (list_empty(&task->held_list_head)) | ||
293 | return; | ||
294 | |||
295 | spin_lock(&task->pi_lock); | ||
296 | plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) { | ||
297 | TRACE_OFF(); | ||
298 | |||
299 | printk("hm, PI interest held at exit time? Task:\n"); | ||
300 | printk_task(task); | ||
301 | printk_waiter(w); | ||
302 | return; | ||
303 | } | ||
304 | spin_unlock(&task->pi_lock); | ||
305 | |||
306 | list_for_each(curr, &task->held_list_head) { | ||
307 | lock = list_entry(curr, struct rt_mutex, held_list_entry); | ||
308 | |||
309 | printk("BUG: %s/%d, lock held at task exit time!\n", | ||
310 | task->comm, task->pid); | ||
311 | printk_lock(lock, 1); | ||
312 | if (rt_mutex_owner(lock) != task) | ||
313 | printk("exiting task is not even the owner??\n"); | ||
314 | } | ||
315 | } | ||
316 | |||
317 | int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | ||
318 | { | ||
319 | const void *to = from + len; | ||
320 | struct list_head *curr; | ||
321 | struct rt_mutex *lock; | ||
322 | unsigned long flags; | ||
323 | void *lock_addr; | ||
324 | |||
325 | if (!rt_trace_on) | ||
326 | return 0; | ||
327 | |||
328 | spin_lock_irqsave(¤t->held_list_lock, flags); | ||
329 | list_for_each(curr, ¤t->held_list_head) { | ||
330 | lock = list_entry(curr, struct rt_mutex, held_list_entry); | ||
331 | lock_addr = lock; | ||
332 | if (lock_addr < from || lock_addr >= to) | ||
333 | continue; | ||
334 | TRACE_OFF(); | ||
335 | |||
336 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", | ||
337 | current->comm, current->pid, lock, from, to); | ||
338 | dump_stack(); | ||
339 | printk_lock(lock, 1); | ||
340 | if (rt_mutex_owner(lock) != current) | ||
341 | printk("freeing task is not even the owner??\n"); | ||
342 | return 1; | ||
343 | } | ||
344 | spin_unlock_irqrestore(¤t->held_list_lock, flags); | ||
345 | |||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | void rt_mutex_debug_task_free(struct task_struct *task) | ||
350 | { | ||
351 | WARN_ON(!plist_head_empty(&task->pi_waiters)); | ||
352 | WARN_ON(task->pi_blocked_on); | ||
353 | } | ||
354 | |||
355 | /* | ||
356 | * We fill out the fields in the waiter to store the information about | ||
357 | * the deadlock. We print when we return. act_waiter can be NULL in | ||
358 | * case of a remove waiter operation. | ||
359 | */ | ||
360 | void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, | ||
361 | struct rt_mutex *lock) | ||
362 | { | ||
363 | struct task_struct *task; | ||
364 | |||
365 | if (!rt_trace_on || detect || !act_waiter) | ||
366 | return; | ||
367 | |||
368 | task = rt_mutex_owner(act_waiter->lock); | ||
369 | if (task && task != current) { | ||
370 | act_waiter->deadlock_task_pid = task->pid; | ||
371 | act_waiter->deadlock_lock = lock; | ||
372 | } | ||
373 | } | ||
374 | |||
375 | void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | ||
376 | { | ||
377 | struct task_struct *task; | ||
378 | |||
379 | if (!waiter->deadlock_lock || !rt_trace_on) | ||
380 | return; | ||
381 | |||
382 | task = find_task_by_pid(waiter->deadlock_task_pid); | ||
383 | if (!task) | ||
384 | return; | ||
385 | |||
386 | TRACE_OFF_NOLOCK(); | ||
387 | |||
388 | printk("\n============================================\n"); | ||
389 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | ||
390 | printk( "--------------------------------------------\n"); | ||
391 | printk("%s/%d is deadlocking current task %s/%d\n\n", | ||
392 | task->comm, task->pid, current->comm, current->pid); | ||
393 | |||
394 | printk("\n1) %s/%d is trying to acquire this lock:\n", | ||
395 | current->comm, current->pid); | ||
396 | printk_lock(waiter->lock, 1); | ||
397 | |||
398 | printk("... trying at: "); | ||
399 | print_symbol("%s\n", waiter->ip); | ||
400 | |||
401 | printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); | ||
402 | printk_lock(waiter->deadlock_lock, 1); | ||
403 | |||
404 | rt_mutex_show_held_locks(current, 1); | ||
405 | rt_mutex_show_held_locks(task, 1); | ||
406 | |||
407 | printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); | ||
408 | show_stack(task, NULL); | ||
409 | printk("\n%s/%d's [current] stackdump:\n\n", | ||
410 | current->comm, current->pid); | ||
411 | dump_stack(); | ||
412 | rt_mutex_show_all_locks(); | ||
413 | printk("[ turning off deadlock detection." | ||
414 | "Please report this trace. ]\n\n"); | ||
415 | local_irq_disable(); | ||
416 | } | ||
417 | |||
418 | void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__) | ||
419 | { | ||
420 | unsigned long flags; | ||
421 | |||
422 | if (rt_trace_on) { | ||
423 | TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); | ||
424 | |||
425 | spin_lock_irqsave(¤t->held_list_lock, flags); | ||
426 | list_add_tail(&lock->held_list_entry, ¤t->held_list_head); | ||
427 | spin_unlock_irqrestore(¤t->held_list_lock, flags); | ||
428 | |||
429 | lock->acquire_ip = ip; | ||
430 | } | ||
431 | } | ||
432 | |||
433 | void debug_rt_mutex_unlock(struct rt_mutex *lock) | ||
434 | { | ||
435 | unsigned long flags; | ||
436 | |||
437 | if (rt_trace_on) { | ||
438 | TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); | ||
439 | TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); | ||
440 | |||
441 | spin_lock_irqsave(¤t->held_list_lock, flags); | ||
442 | list_del_init(&lock->held_list_entry); | ||
443 | spin_unlock_irqrestore(¤t->held_list_lock, flags); | ||
444 | } | ||
445 | } | ||
446 | |||
447 | void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, | ||
448 | struct task_struct *powner __IP_DECL__) | ||
449 | { | ||
450 | unsigned long flags; | ||
451 | |||
452 | if (rt_trace_on) { | ||
453 | TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); | ||
454 | |||
455 | spin_lock_irqsave(&powner->held_list_lock, flags); | ||
456 | list_add_tail(&lock->held_list_entry, &powner->held_list_head); | ||
457 | spin_unlock_irqrestore(&powner->held_list_lock, flags); | ||
458 | |||
459 | lock->acquire_ip = ip; | ||
460 | } | ||
461 | } | ||
462 | |||
463 | void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) | ||
464 | { | ||
465 | unsigned long flags; | ||
466 | |||
467 | if (rt_trace_on) { | ||
468 | struct task_struct *owner = rt_mutex_owner(lock); | ||
469 | |||
470 | TRACE_WARN_ON_LOCKED(!owner); | ||
471 | TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); | ||
472 | |||
473 | spin_lock_irqsave(&owner->held_list_lock, flags); | ||
474 | list_del_init(&lock->held_list_entry); | ||
475 | spin_unlock_irqrestore(&owner->held_list_lock, flags); | ||
476 | } | ||
477 | } | ||
478 | |||
479 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | ||
480 | { | ||
481 | memset(waiter, 0x11, sizeof(*waiter)); | ||
482 | plist_node_init(&waiter->list_entry, MAX_PRIO); | ||
483 | plist_node_init(&waiter->pi_list_entry, MAX_PRIO); | ||
484 | } | ||
485 | |||
486 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | ||
487 | { | ||
488 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); | ||
489 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
490 | TRACE_WARN_ON(waiter->task); | ||
491 | memset(waiter, 0x22, sizeof(*waiter)); | ||
492 | } | ||
493 | |||
494 | void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) | ||
495 | { | ||
496 | void *addr = lock; | ||
497 | |||
498 | if (rt_trace_on) { | ||
499 | rt_mutex_debug_check_no_locks_freed(addr, | ||
500 | sizeof(struct rt_mutex)); | ||
501 | INIT_LIST_HEAD(&lock->held_list_entry); | ||
502 | lock->name = name; | ||
503 | } | ||
504 | } | ||
505 | |||
506 | void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task) | ||
507 | { | ||
508 | } | ||
509 | |||
510 | void rt_mutex_deadlock_account_unlock(struct task_struct *task) | ||
511 | { | ||
512 | } | ||
513 | |||
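rt_mutex_debug_check_no_locks_freed() above guards against releasing memory that still contains a held rt_mutex. A hedged sketch of the bug pattern it reports, invoking the check directly before the free; the structure and function names are invented and the check's prototype is assumed to be visible to the caller:

#include <linux/rtmutex.h>
#include <linux/slab.h>

struct demo_obj {
	struct rt_mutex lock;	/* assumed initialised elsewhere */
	int payload;
};

static void demo_buggy_teardown(struct demo_obj *obj)
{
	rt_mutex_lock(&obj->lock);
	/* ...missing rt_mutex_unlock(&obj->lock)... */

	/* In a debug build this prints
	 *   "BUG: <comm>/<pid>, active lock [...] freed!"
	 * because obj->lock is still on current->held_list_head and lies
	 * inside the range being released. */
	rt_mutex_debug_check_no_locks_freed(obj, sizeof(*obj));
	kfree(obj);
}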
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h new file mode 100644 index 000000000000..7612fbc62d70 --- /dev/null +++ b/kernel/rtmutex-debug.h | |||
@@ -0,0 +1,37 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains macros used solely by rtmutex.c. Debug version. | ||
10 | */ | ||
11 | |||
12 | #define __IP_DECL__ , unsigned long ip | ||
13 | #define __IP__ , ip | ||
14 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
15 | |||
16 | extern void | ||
17 | rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); | ||
18 | extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); | ||
19 | extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); | ||
20 | extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); | ||
21 | extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); | ||
22 | extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__); | ||
23 | extern void debug_rt_mutex_unlock(struct rt_mutex *lock); | ||
24 | extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, | ||
25 | struct task_struct *powner __IP_DECL__); | ||
26 | extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); | ||
27 | extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, | ||
28 | struct rt_mutex *lock); | ||
29 | extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); | ||
30 | # define debug_rt_mutex_reset_waiter(w) \ | ||
31 | do { (w)->deadlock_lock = NULL; } while (0) | ||
32 | |||
33 | static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, | ||
34 | int detect) | ||
35 | { | ||
36 | return (waiter != NULL); | ||
37 | } | ||
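The three macros at the top of this header thread the caller's instruction pointer through the debug hooks so debug_rt_mutex_lock() can record lock->acquire_ip. A short sketch of how they compose in the debug build (the non-debug rtmutex.h presumably defines them empty so the same call sites compile either way; example_caller() is invented):

/* The header's prototype
 *   void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
 * expands here to
 *   void debug_rt_mutex_lock(struct rt_mutex *lock, unsigned long ip);
 */
static void example_caller(struct rt_mutex *lock)
{
	/* __RET_IP__ appends ", (unsigned long)__builtin_return_address(0)",
	 * i.e. the address example_caller() was itself called from. */
	debug_rt_mutex_lock(lock __RET_IP__);
}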
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c new file mode 100644 index 000000000000..e82c2f848249 --- /dev/null +++ b/kernel/rtmutex-tester.c | |||
@@ -0,0 +1,440 @@ | |||
1 | /* | ||
2 | * RT-Mutex-tester: scriptable tester for rt mutexes | ||
3 | * | ||
4 | * started by Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
7 | * | ||
8 | */ | ||
9 | #include <linux/config.h> | ||
10 | #include <linux/kthread.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/smp_lock.h> | ||
14 | #include <linux/spinlock.h> | ||
15 | #include <linux/sysdev.h> | ||
16 | #include <linux/timer.h> | ||
17 | |||
18 | #include "rtmutex.h" | ||
19 | |||
20 | #define MAX_RT_TEST_THREADS 8 | ||
21 | #define MAX_RT_TEST_MUTEXES 8 | ||
22 | |||
23 | static spinlock_t rttest_lock; | ||
24 | static atomic_t rttest_event; | ||
25 | |||
26 | struct test_thread_data { | ||
27 | int opcode; | ||
28 | int opdata; | ||
29 | int mutexes[MAX_RT_TEST_MUTEXES]; | ||
30 | int bkl; | ||
31 | int event; | ||
32 | struct sys_device sysdev; | ||
33 | }; | ||
34 | |||
35 | static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; | ||
36 | static task_t *threads[MAX_RT_TEST_THREADS]; | ||
37 | static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; | ||
38 | |||
39 | enum test_opcodes { | ||
40 | RTTEST_NOP = 0, | ||
41 | RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ | ||
42 | RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ | ||
43 | RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ | ||
44 | RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ | ||
45 | RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ | ||
46 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ | ||
47 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ | ||
48 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ | ||
49 | RTTEST_LOCKBKL, /* 9 Lock BKL */ | ||
50 | RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ | ||
51 | RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ | ||
52 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ | ||
53 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ | ||
54 | }; | ||
55 | |||
56 | static int handle_op(struct test_thread_data *td, int lockwakeup) | ||
57 | { | ||
58 | int i, id, ret = -EINVAL; | ||
59 | |||
60 | switch(td->opcode) { | ||
61 | |||
62 | case RTTEST_NOP: | ||
63 | return 0; | ||
64 | |||
65 | case RTTEST_LOCKCONT: | ||
66 | td->mutexes[td->opdata] = 1; | ||
67 | td->event = atomic_add_return(1, &rttest_event); | ||
68 | return 0; | ||
69 | |||
70 | case RTTEST_RESET: | ||
71 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { | ||
72 | if (td->mutexes[i] == 4) { | ||
73 | rt_mutex_unlock(&mutexes[i]); | ||
74 | td->mutexes[i] = 0; | ||
75 | } | ||
76 | } | ||
77 | |||
78 | if (!lockwakeup && td->bkl == 4) { | ||
79 | unlock_kernel(); | ||
80 | td->bkl = 0; | ||
81 | } | ||
82 | return 0; | ||
83 | |||
84 | case RTTEST_RESETEVENT: | ||
85 | atomic_set(&rttest_event, 0); | ||
86 | return 0; | ||
87 | |||
88 | default: | ||
89 | if (lockwakeup) | ||
90 | return ret; | ||
91 | } | ||
92 | |||
93 | switch(td->opcode) { | ||
94 | |||
95 | case RTTEST_LOCK: | ||
96 | case RTTEST_LOCKNOWAIT: | ||
97 | id = td->opdata; | ||
98 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
99 | return ret; | ||
100 | |||
101 | td->mutexes[id] = 1; | ||
102 | td->event = atomic_add_return(1, &rttest_event); | ||
103 | rt_mutex_lock(&mutexes[id]); | ||
104 | td->event = atomic_add_return(1, &rttest_event); | ||
105 | td->mutexes[id] = 4; | ||
106 | return 0; | ||
107 | |||
108 | case RTTEST_LOCKINT: | ||
109 | case RTTEST_LOCKINTNOWAIT: | ||
110 | id = td->opdata; | ||
111 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
112 | return ret; | ||
113 | |||
114 | td->mutexes[id] = 1; | ||
115 | td->event = atomic_add_return(1, &rttest_event); | ||
116 | ret = rt_mutex_lock_interruptible(&mutexes[id], 0); | ||
117 | td->event = atomic_add_return(1, &rttest_event); | ||
118 | td->mutexes[id] = ret ? 0 : 4; | ||
119 | return ret ? -EINTR : 0; | ||
120 | |||
121 | case RTTEST_UNLOCK: | ||
122 | id = td->opdata; | ||
123 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) | ||
124 | return ret; | ||
125 | |||
126 | td->event = atomic_add_return(1, &rttest_event); | ||
127 | rt_mutex_unlock(&mutexes[id]); | ||
128 | td->event = atomic_add_return(1, &rttest_event); | ||
129 | td->mutexes[id] = 0; | ||
130 | return 0; | ||
131 | |||
132 | case RTTEST_LOCKBKL: | ||
133 | if (td->bkl) | ||
134 | return 0; | ||
135 | td->bkl = 1; | ||
136 | lock_kernel(); | ||
137 | td->bkl = 4; | ||
138 | return 0; | ||
139 | |||
140 | case RTTEST_UNLOCKBKL: | ||
141 | if (td->bkl != 4) | ||
142 | break; | ||
143 | unlock_kernel(); | ||
144 | td->bkl = 0; | ||
145 | return 0; | ||
146 | |||
147 | default: | ||
148 | break; | ||
149 | } | ||
150 | return ret; | ||
151 | } | ||
152 | |||
153 | /* | ||
154 | * Schedule replacement for rtsem_down(). Only called for threads with | ||
155 | * PF_MUTEX_TESTER set. | ||
156 | * | ||
157 | * This allows us to have fine-grained control over the event flow. | ||
158 | * | ||
159 | */ | ||
160 | void schedule_rt_mutex_test(struct rt_mutex *mutex) | ||
161 | { | ||
162 | int tid, op, dat; | ||
163 | struct test_thread_data *td; | ||
164 | |||
165 | /* We have to lookup the task */ | ||
166 | for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { | ||
167 | if (threads[tid] == current) | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | BUG_ON(tid == MAX_RT_TEST_THREADS); | ||
172 | |||
173 | td = &thread_data[tid]; | ||
174 | |||
175 | op = td->opcode; | ||
176 | dat = td->opdata; | ||
177 | |||
178 | switch (op) { | ||
179 | case RTTEST_LOCK: | ||
180 | case RTTEST_LOCKINT: | ||
181 | case RTTEST_LOCKNOWAIT: | ||
182 | case RTTEST_LOCKINTNOWAIT: | ||
183 | if (mutex != &mutexes[dat]) | ||
184 | break; | ||
185 | |||
186 | if (td->mutexes[dat] != 1) | ||
187 | break; | ||
188 | |||
189 | td->mutexes[dat] = 2; | ||
190 | td->event = atomic_add_return(1, &rttest_event); | ||
191 | break; | ||
192 | |||
193 | case RTTEST_LOCKBKL: | ||
194 | default: | ||
195 | break; | ||
196 | } | ||
197 | |||
198 | schedule(); | ||
199 | |||
200 | |||
201 | switch (op) { | ||
202 | case RTTEST_LOCK: | ||
203 | case RTTEST_LOCKINT: | ||
204 | if (mutex != &mutexes[dat]) | ||
205 | return; | ||
206 | |||
207 | if (td->mutexes[dat] != 2) | ||
208 | return; | ||
209 | |||
210 | td->mutexes[dat] = 3; | ||
211 | td->event = atomic_add_return(1, &rttest_event); | ||
212 | break; | ||
213 | |||
214 | case RTTEST_LOCKNOWAIT: | ||
215 | case RTTEST_LOCKINTNOWAIT: | ||
216 | if (mutex != &mutexes[dat]) | ||
217 | return; | ||
218 | |||
219 | if (td->mutexes[dat] != 2) | ||
220 | return; | ||
221 | |||
222 | td->mutexes[dat] = 1; | ||
223 | td->event = atomic_add_return(1, &rttest_event); | ||
224 | return; | ||
225 | |||
226 | case RTTEST_LOCKBKL: | ||
227 | return; | ||
228 | default: | ||
229 | return; | ||
230 | } | ||
231 | |||
232 | td->opcode = 0; | ||
233 | |||
234 | for (;;) { | ||
235 | set_current_state(TASK_INTERRUPTIBLE); | ||
236 | |||
237 | if (td->opcode > 0) { | ||
238 | int ret; | ||
239 | |||
240 | set_current_state(TASK_RUNNING); | ||
241 | ret = handle_op(td, 1); | ||
242 | set_current_state(TASK_INTERRUPTIBLE); | ||
243 | if (td->opcode == RTTEST_LOCKCONT) | ||
244 | break; | ||
245 | td->opcode = ret; | ||
246 | } | ||
247 | |||
248 | /* Wait for the next command to be executed */ | ||
249 | schedule(); | ||
250 | } | ||
251 | |||
252 | /* Restore previous command and data */ | ||
253 | td->opcode = op; | ||
254 | td->opdata = dat; | ||
255 | } | ||
256 | |||
257 | static int test_func(void *data) | ||
258 | { | ||
259 | struct test_thread_data *td = data; | ||
260 | int ret; | ||
261 | |||
262 | current->flags |= PF_MUTEX_TESTER; | ||
263 | allow_signal(SIGHUP); | ||
264 | |||
265 | for(;;) { | ||
266 | |||
267 | set_current_state(TASK_INTERRUPTIBLE); | ||
268 | |||
269 | if (td->opcode > 0) { | ||
270 | set_current_state(TASK_RUNNING); | ||
271 | ret = handle_op(td, 0); | ||
272 | set_current_state(TASK_INTERRUPTIBLE); | ||
273 | td->opcode = ret; | ||
274 | } | ||
275 | |||
276 | /* Wait for the next command to be executed */ | ||
277 | schedule(); | ||
278 | |||
279 | if (signal_pending(current)) | ||
280 | flush_signals(current); | ||
281 | |||
282 | if(kthread_should_stop()) | ||
283 | break; | ||
284 | } | ||
285 | return 0; | ||
286 | } | ||
287 | |||
288 | /** | ||
289 | * sysfs_test_command - interface for test commands | ||
290 | * @dev: thread reference | ||
291 | * @buf: command for actual step | ||
292 | * @count: length of buffer | ||
293 | * | ||
294 | * command syntax: | ||
295 | * | ||
296 | * opcode:data | ||
297 | */ | ||
298 | static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, | ||
299 | size_t count) | ||
300 | { | ||
301 | struct sched_param schedpar; | ||
302 | struct test_thread_data *td; | ||
303 | char cmdbuf[32]; | ||
304 | int op, dat, tid, ret; | ||
305 | |||
306 | td = container_of(dev, struct test_thread_data, sysdev); | ||
307 | tid = td->sysdev.id; | ||
308 | |||
309 | /* strings from a sysfs write are not NUL-terminated! */ | ||
310 | if (count >= sizeof(cmdbuf)) | ||
311 | return -EINVAL; | ||
312 | |||
313 | /* strip off the trailing \n: */ | ||
314 | if (buf[count-1] == '\n') | ||
315 | count--; | ||
316 | if (count < 1) | ||
317 | return -EINVAL; | ||
318 | |||
319 | memcpy(cmdbuf, buf, count); | ||
320 | cmdbuf[count] = 0; | ||
321 | |||
322 | if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) | ||
323 | return -EINVAL; | ||
324 | |||
325 | switch (op) { | ||
326 | case RTTEST_SCHEDOT: | ||
327 | schedpar.sched_priority = 0; | ||
328 | ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); | ||
329 | if (ret) | ||
330 | return ret; | ||
331 | set_user_nice(current, 0); | ||
332 | break; | ||
333 | |||
334 | case RTTEST_SCHEDRT: | ||
335 | schedpar.sched_priority = dat; | ||
336 | ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); | ||
337 | if (ret) | ||
338 | return ret; | ||
339 | break; | ||
340 | |||
341 | case RTTEST_SIGNAL: | ||
342 | send_sig(SIGHUP, threads[tid], 0); | ||
343 | break; | ||
344 | |||
345 | default: | ||
346 | if (td->opcode > 0) | ||
347 | return -EBUSY; | ||
348 | td->opdata = dat; | ||
349 | td->opcode = op; | ||
350 | wake_up_process(threads[tid]); | ||
351 | } | ||
352 | |||
353 | return count; | ||
354 | } | ||
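The command channel above accepts one "opcode:data" pair per write. As a minimal userspace sketch of driving it (the sysfs path is an assumption derived from the "rttest" sysdev class and the per-thread id, and the opcode/data values are purely illustrative):

#include <stdio.h>

int main(void)
{
	/* assumed path: sysdev class "rttest", thread id 0 */
	FILE *f = fopen("/sys/devices/system/rttest/rttest0/command", "w");

	if (!f) {
		perror("rttest command");
		return 1;
	}
	/* one "opcode:data" pair per write; the values here are illustrative */
	fprintf(f, "1:0\n");
	fclose(f);
	return 0;
}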
355 | |||
356 | /** | ||
357 | * sysfs_test_status - sysfs interface for rt tester | ||
358 | * @dev: thread to query | ||
359 | * @buf: char buffer to be filled with thread status info | ||
360 | */ | ||
361 | static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) | ||
362 | { | ||
363 | struct test_thread_data *td; | ||
364 | char *curr = buf; | ||
365 | task_t *tsk; | ||
366 | int i; | ||
367 | |||
368 | td = container_of(dev, struct test_thread_data, sysdev); | ||
369 | tsk = threads[td->sysdev.id]; | ||
370 | |||
371 | spin_lock(&rttest_lock); | ||
372 | |||
373 | curr += sprintf(curr, | ||
374 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", | ||
375 | td->opcode, td->event, tsk->state, | ||
376 | (MAX_RT_PRIO - 1) - tsk->prio, | ||
377 | (MAX_RT_PRIO - 1) - tsk->normal_prio, | ||
378 | tsk->pi_blocked_on, td->bkl); | ||
379 | |||
380 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) | ||
381 | curr += sprintf(curr, "%d", td->mutexes[i]); | ||
382 | |||
383 | spin_unlock(&rttest_lock); | ||
384 | |||
385 | curr += sprintf(curr, ", T: %p, R: %p\n", tsk, | ||
386 | mutexes[td->sysdev.id].owner); | ||
387 | |||
388 | return curr - buf; | ||
389 | } | ||
390 | |||
391 | static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); | ||
392 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); | ||
393 | |||
394 | static struct sysdev_class rttest_sysclass = { | ||
395 | set_kset_name("rttest"), | ||
396 | }; | ||
397 | |||
398 | static int init_test_thread(int id) | ||
399 | { | ||
400 | thread_data[id].sysdev.cls = &rttest_sysclass; | ||
401 | thread_data[id].sysdev.id = id; | ||
402 | |||
403 | threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); | ||
404 | if (IS_ERR(threads[id])) | ||
405 | return PTR_ERR(threads[id]); | ||
406 | |||
407 | return sysdev_register(&thread_data[id].sysdev); | ||
408 | } | ||
409 | |||
410 | static int init_rttest(void) | ||
411 | { | ||
412 | int ret, i; | ||
413 | |||
414 | spin_lock_init(&rttest_lock); | ||
415 | |||
416 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) | ||
417 | rt_mutex_init(&mutexes[i]); | ||
418 | |||
419 | ret = sysdev_class_register(&rttest_sysclass); | ||
420 | if (ret) | ||
421 | return ret; | ||
422 | |||
423 | for (i = 0; i < MAX_RT_TEST_THREADS; i++) { | ||
424 | ret = init_test_thread(i); | ||
425 | if (ret) | ||
426 | break; | ||
427 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); | ||
428 | if (ret) | ||
429 | break; | ||
430 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); | ||
431 | if (ret) | ||
432 | break; | ||
433 | } | ||
434 | |||
435 | printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK"); | ||
436 | |||
437 | return ret; | ||
438 | } | ||
439 | |||
440 | device_initcall(init_rttest); | ||
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 000000000000..45d61016da57 --- /dev/null +++ b/kernel/rtmutex.c | |||
@@ -0,0 +1,990 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: simple blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner. | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt | ||
9 | * Copyright (C) 2006 Esben Nielsen | ||
10 | */ | ||
11 | #include <linux/spinlock.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/timer.h> | ||
15 | |||
16 | #include "rtmutex_common.h" | ||
17 | |||
18 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
19 | # include "rtmutex-debug.h" | ||
20 | #else | ||
21 | # include "rtmutex.h" | ||
22 | #endif | ||
23 | |||
24 | /* | ||
25 | * lock->owner state tracking: | ||
26 | * | ||
27 | * lock->owner holds the task_struct pointer of the owner. Bits 0 and 1 | ||
28 | * are used to keep track of the "owner is pending" and "lock has | ||
29 | * waiters" state. | ||
30 | * | ||
31 | * owner bit1 bit0 | ||
32 | * NULL 0 0 lock is free (fast acquire possible) | ||
33 | * NULL 0 1 invalid state | ||
34 | * NULL 1 0 Transitional State* | ||
35 | * NULL 1 1 invalid state | ||
36 | * taskpointer 0 0 lock is held (fast release possible) | ||
37 | * taskpointer 0 1 task is pending owner | ||
38 | * taskpointer 1 0 lock is held and has waiters | ||
39 | * taskpointer 1 1 task is pending owner and lock has more waiters | ||
40 | * | ||
41 | * Pending ownership is assigned to the top (highest priority) | ||
42 | * waiter of the lock, when the lock is released. The thread is woken | ||
43 | * up and can now take the lock. Until the lock is taken (bit 0 | ||
44 | * cleared) a competing higher priority thread can steal the lock | ||
45 | * which puts the woken up thread back on the waiters list. | ||
46 | * | ||
47 | * The fast atomic compare exchange based acquire and release is only | ||
48 | * possible when bit 0 and 1 of lock->owner are 0. | ||
49 | * | ||
50 | * (*) There's a brief window where the owner can be NULL and the | ||
51 | * "lock has waiters" bit is set. This can happen when grabbing the lock. | ||
52 | * To prevent a cmpxchg of the owner releasing the lock, we need to set this | ||
53 | * bit before looking at the lock, hence the reason this is a transitional | ||
54 | * state. | ||
55 | */ | ||
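To make the encoding concrete, here is a minimal userspace sketch of packing and unpacking the owner word; the flag values mirror RT_MUTEX_OWNER_PENDING (bit 0) and RT_MUTEX_HAS_WAITERS (bit 1), while the types and names are illustrative rather than the kernel's:

#include <stdio.h>
#include <stdint.h>

#define OWNER_PENDING	1UL	/* bit 0: task is pending owner */
#define HAS_WAITERS	2UL	/* bit 1: lock has waiters */
#define FLAG_MASK	3UL

struct task { const char *name; };	/* stands in for task_struct */

static void *pack_owner(struct task *t, unsigned long flags)
{
	return (void *)((uintptr_t)t | flags);
}

static struct task *unpack_owner(void *owner)
{
	return (struct task *)((uintptr_t)owner & ~FLAG_MASK);
}

int main(void)
{
	struct task t = { "pending-owner" };
	void *owner = pack_owner(&t, OWNER_PENDING | HAS_WAITERS);

	printf("owner=%s pending=%lu waiters=%lu\n",
	       unpack_owner(owner)->name,
	       (unsigned long)((uintptr_t)owner & OWNER_PENDING),
	       (unsigned long)(((uintptr_t)owner & HAS_WAITERS) >> 1));
	return 0;
}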
56 | |||
57 | static void | ||
58 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, | ||
59 | unsigned long mask) | ||
60 | { | ||
61 | unsigned long val = (unsigned long)owner | mask; | ||
62 | |||
63 | if (rt_mutex_has_waiters(lock)) | ||
64 | val |= RT_MUTEX_HAS_WAITERS; | ||
65 | |||
66 | lock->owner = (struct task_struct *)val; | ||
67 | } | ||
68 | |||
69 | static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) | ||
70 | { | ||
71 | lock->owner = (struct task_struct *) | ||
72 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
73 | } | ||
74 | |||
75 | static void fixup_rt_mutex_waiters(struct rt_mutex *lock) | ||
76 | { | ||
77 | if (!rt_mutex_has_waiters(lock)) | ||
78 | clear_rt_mutex_waiters(lock); | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * We can speed up the acquire/release, if the architecture | ||
83 | * supports cmpxchg and if there's no debugging state to be set up | ||
84 | */ | ||
85 | #if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) | ||
86 | # define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) | ||
87 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | ||
88 | { | ||
89 | unsigned long owner, *p = (unsigned long *) &lock->owner; | ||
90 | |||
91 | do { | ||
92 | owner = *p; | ||
93 | } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); | ||
94 | } | ||
95 | #else | ||
96 | # define rt_mutex_cmpxchg(l,c,n) (0) | ||
97 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | ||
98 | { | ||
99 | lock->owner = (struct task_struct *) | ||
100 | ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); | ||
101 | } | ||
102 | #endif | ||
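The cmpxchg loop in mark_rt_mutex_waiters() above ORs the waiters bit into a word that other CPUs may concurrently compare-and-exchange, without losing either update. A standalone C11 sketch of the same pattern (the names and the fake owner value are illustrative):

#include <stdatomic.h>
#include <stdio.h>

#define HAS_WAITERS 2UL

static _Atomic unsigned long owner_word;

static void mark_waiters(void)
{
	unsigned long old = atomic_load(&owner_word);

	/* retry until old | HAS_WAITERS is installed over an unchanged value;
	 * a failed CAS reloads 'old' with the current contents */
	while (!atomic_compare_exchange_weak(&owner_word, &old,
					     old | HAS_WAITERS))
		;
}

int main(void)
{
	atomic_store(&owner_word, 0x1000);	/* pretend owner pointer */
	mark_waiters();
	printf("0x%lx\n", (unsigned long)atomic_load(&owner_word)); /* 0x1002 */
	return 0;
}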
103 | |||
104 | /* | ||
105 | * Calculate task priority from the waiter list priority | ||
106 | * | ||
107 | * Return task->normal_prio when the waiter list is empty or when | ||
108 | * the waiter is not allowed to do priority boosting | ||
109 | */ | ||
110 | int rt_mutex_getprio(struct task_struct *task) | ||
111 | { | ||
112 | if (likely(!task_has_pi_waiters(task))) | ||
113 | return task->normal_prio; | ||
114 | |||
115 | return min(task_top_pi_waiter(task)->pi_list_entry.prio, | ||
116 | task->normal_prio); | ||
117 | } | ||
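In these priority values lower numbers mean higher priority: RT priorities occupy 0-99 and a nice-0 task sits at 120. A tiny standalone sketch of the calculation above, with illustrative numbers:

#include <stdio.h>

static int getprio(int normal_prio, int top_pi_waiter_prio, int has_pi_waiters)
{
	if (!has_pi_waiters)
		return normal_prio;
	/* the boosted priority is the higher (numerically smaller) of the two */
	return top_pi_waiter_prio < normal_prio ? top_pi_waiter_prio
						: normal_prio;
}

int main(void)
{
	printf("%d\n", getprio(120, 90, 1));	/* boosted to 90 */
	printf("%d\n", getprio(120, 130, 1));	/* stays at 120 */
	printf("%d\n", getprio(120, 90, 0));	/* no waiters: 120 */
	return 0;
}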
118 | |||
119 | /* | ||
120 | * Adjust the priority of a task, after its pi_waiters got modified. | ||
121 | * | ||
122 | * This can be both boosting and unboosting. task->pi_lock must be held. | ||
123 | */ | ||
124 | static void __rt_mutex_adjust_prio(struct task_struct *task) | ||
125 | { | ||
126 | int prio = rt_mutex_getprio(task); | ||
127 | |||
128 | if (task->prio != prio) | ||
129 | rt_mutex_setprio(task, prio); | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Adjust task priority (undo boosting). Called from the exit path of | ||
134 | * rt_mutex_slowunlock() and rt_mutex_slowlock(). | ||
135 | * | ||
136 | * (Note: We do this outside of the protection of lock->wait_lock to | ||
137 | * allow the lock to be taken while or before we readjust the priority | ||
138 | * of task. We do not use the spin_xx_mutex() variants here as we are | ||
139 | * outside of the debug path.) | ||
140 | */ | ||
141 | static void rt_mutex_adjust_prio(struct task_struct *task) | ||
142 | { | ||
143 | unsigned long flags; | ||
144 | |||
145 | spin_lock_irqsave(&task->pi_lock, flags); | ||
146 | __rt_mutex_adjust_prio(task); | ||
147 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
148 | } | ||
149 | |||
150 | /* | ||
151 | * Max number of times we'll walk the boosting chain: | ||
152 | */ | ||
153 | int max_lock_depth = 1024; | ||
154 | |||
155 | /* | ||
156 | * Adjust the priority chain. Also used for deadlock detection. | ||
157 | * Decreases task's usage by one - may thus free the task. | ||
158 | * Returns 0 or -EDEADLK. | ||
159 | */ | ||
160 | static int rt_mutex_adjust_prio_chain(task_t *task, | ||
161 | int deadlock_detect, | ||
162 | struct rt_mutex *orig_lock, | ||
163 | struct rt_mutex_waiter *orig_waiter, | ||
164 | struct task_struct *top_task | ||
165 | __IP_DECL__) | ||
166 | { | ||
167 | struct rt_mutex *lock; | ||
168 | struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; | ||
169 | int detect_deadlock, ret = 0, depth = 0; | ||
170 | unsigned long flags; | ||
171 | |||
172 | detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, | ||
173 | deadlock_detect); | ||
174 | |||
175 | /* | ||
176 | * The (de)boosting is a step by step approach with a lot of | ||
177 | * pitfalls. We want this to be preemptible and we want to hold a | ||
178 | * maximum of two locks per step. So we have to check | ||
179 | * carefully whether things change under us. | ||
180 | */ | ||
181 | again: | ||
182 | if (++depth > max_lock_depth) { | ||
183 | static int prev_max; | ||
184 | |||
185 | /* | ||
186 | * Print this only once. If the admin changes the limit, | ||
187 | * print a new message when reaching the limit again. | ||
188 | */ | ||
189 | if (prev_max != max_lock_depth) { | ||
190 | prev_max = max_lock_depth; | ||
191 | printk(KERN_WARNING "Maximum lock depth %d reached " | ||
192 | "task: %s (%d)\n", max_lock_depth, | ||
193 | top_task->comm, top_task->pid); | ||
194 | } | ||
195 | put_task_struct(task); | ||
196 | |||
197 | return deadlock_detect ? -EDEADLK : 0; | ||
198 | } | ||
199 | retry: | ||
200 | /* | ||
201 | * The task cannot go away, as we did a get_task_struct() before! | ||
202 | */ | ||
203 | spin_lock_irqsave(&task->pi_lock, flags); | ||
204 | |||
205 | waiter = task->pi_blocked_on; | ||
206 | /* | ||
207 | * Check whether the end of the boosting chain has been | ||
208 | * reached or the state of the chain has changed while we | ||
209 | * dropped the locks. | ||
210 | */ | ||
211 | if (!waiter || !waiter->task) | ||
212 | goto out_unlock_pi; | ||
213 | |||
214 | if (top_waiter && (!task_has_pi_waiters(task) || | ||
215 | top_waiter != task_top_pi_waiter(task))) | ||
216 | goto out_unlock_pi; | ||
217 | |||
218 | /* | ||
219 | * When deadlock detection is off, we check whether further | ||
220 | * priority adjustment is necessary. | ||
221 | */ | ||
222 | if (!detect_deadlock && waiter->list_entry.prio == task->prio) | ||
223 | goto out_unlock_pi; | ||
224 | |||
225 | lock = waiter->lock; | ||
226 | if (!spin_trylock(&lock->wait_lock)) { | ||
227 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
228 | cpu_relax(); | ||
229 | goto retry; | ||
230 | } | ||
231 | |||
232 | /* Deadlock detection */ | ||
233 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { | ||
234 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); | ||
235 | spin_unlock(&lock->wait_lock); | ||
236 | ret = deadlock_detect ? -EDEADLK : 0; | ||
237 | goto out_unlock_pi; | ||
238 | } | ||
239 | |||
240 | top_waiter = rt_mutex_top_waiter(lock); | ||
241 | |||
242 | /* Requeue the waiter */ | ||
243 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
244 | waiter->list_entry.prio = task->prio; | ||
245 | plist_add(&waiter->list_entry, &lock->wait_list); | ||
246 | |||
247 | /* Release the task */ | ||
248 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
249 | put_task_struct(task); | ||
250 | |||
251 | /* Grab the next task */ | ||
252 | task = rt_mutex_owner(lock); | ||
253 | spin_lock_irqsave(&task->pi_lock, flags); | ||
254 | |||
255 | if (waiter == rt_mutex_top_waiter(lock)) { | ||
256 | /* Boost the owner */ | ||
257 | plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); | ||
258 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | ||
259 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
260 | __rt_mutex_adjust_prio(task); | ||
261 | |||
262 | } else if (top_waiter == waiter) { | ||
263 | /* Deboost the owner */ | ||
264 | plist_del(&waiter->pi_list_entry, &task->pi_waiters); | ||
265 | waiter = rt_mutex_top_waiter(lock); | ||
266 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | ||
267 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
268 | __rt_mutex_adjust_prio(task); | ||
269 | } | ||
270 | |||
271 | get_task_struct(task); | ||
272 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
273 | |||
274 | top_waiter = rt_mutex_top_waiter(lock); | ||
275 | spin_unlock(&lock->wait_lock); | ||
276 | |||
277 | if (!detect_deadlock && waiter != top_waiter) | ||
278 | goto out_put_task; | ||
279 | |||
280 | goto again; | ||
281 | |||
282 | out_unlock_pi: | ||
283 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
284 | out_put_task: | ||
285 | put_task_struct(task); | ||
286 | return ret; | ||
287 | } | ||
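As a greatly simplified model of the chain walk above: starting from a blocked task, propagate its priority along the owner links until no further boosting is needed or the chain ends. All locking, waiter requeueing, reference counting and deadlock detection of the real code is omitted; the structures and numbers are illustrative.

#include <stdio.h>
#include <stddef.h>

struct lock;

struct task {
	const char *name;
	int prio;			/* lower value = higher priority */
	struct lock *blocked_on;
};

struct lock {
	struct task *owner;
	int top_waiter_prio;
};

static void adjust_prio_chain(struct task *t)
{
	while (t->blocked_on) {
		struct lock *l = t->blocked_on;

		if (t->prio < l->top_waiter_prio)
			l->top_waiter_prio = t->prio;
		if (l->top_waiter_prio >= l->owner->prio)
			break;			/* no further boosting needed */
		l->owner->prio = l->top_waiter_prio;
		t = l->owner;			/* walk up to the next owner */
	}
}

int main(void)
{
	struct task a = { "A", 90, NULL }, b = { "B", 110, NULL }, c = { "C", 120, NULL };
	struct lock l1 = { &b, 999 }, l2 = { &c, 999 };

	a.blocked_on = &l1;	/* A (prio 90) blocks on L1, owned by B */
	b.blocked_on = &l2;	/* B blocks on L2, owned by C */
	adjust_prio_chain(&a);
	printf("B=%d C=%d\n", b.prio, c.prio);	/* both boosted to 90 */
	return 0;
}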
288 | |||
289 | /* | ||
290 | * Optimization: check if we can steal the lock from the | ||
291 | * assigned pending owner [which might not have taken the | ||
292 | * lock yet]: | ||
293 | */ | ||
294 | static inline int try_to_steal_lock(struct rt_mutex *lock) | ||
295 | { | ||
296 | struct task_struct *pendowner = rt_mutex_owner(lock); | ||
297 | struct rt_mutex_waiter *next; | ||
298 | unsigned long flags; | ||
299 | |||
300 | if (!rt_mutex_owner_pending(lock)) | ||
301 | return 0; | ||
302 | |||
303 | if (pendowner == current) | ||
304 | return 1; | ||
305 | |||
306 | spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
307 | if (current->prio >= pendowner->prio) { | ||
308 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * Check if a waiter is enqueued on the pending owner's | ||
314 | * pi_waiters list. Remove it and readjust the pending owner's | ||
315 | * priority. | ||
316 | */ | ||
317 | if (likely(!rt_mutex_has_waiters(lock))) { | ||
318 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
319 | return 1; | ||
320 | } | ||
321 | |||
322 | /* No chain handling, pending owner is not blocked on anything: */ | ||
323 | next = rt_mutex_top_waiter(lock); | ||
324 | plist_del(&next->pi_list_entry, &pendowner->pi_waiters); | ||
325 | __rt_mutex_adjust_prio(pendowner); | ||
326 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
327 | |||
328 | /* | ||
329 | * We are going to steal the lock and a waiter was | ||
330 | * enqueued on the pending owner's pi_waiters queue. So | ||
331 | * we have to enqueue this waiter into the | ||
332 | * current->pi_waiters list. This covers the case | ||
333 | * where current is boosted because it holds another | ||
334 | * lock and gets unboosted because the booster is | ||
335 | * interrupted, so we would delay a waiter with higher | ||
336 | * priority than current->normal_prio. | ||
337 | * | ||
338 | * Note: in the rare case of a SCHED_OTHER task changing | ||
339 | * its priority and thus stealing the lock, next->task | ||
340 | * might be current: | ||
341 | */ | ||
342 | if (likely(next->task != current)) { | ||
343 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
344 | plist_add(&next->pi_list_entry, ¤t->pi_waiters); | ||
345 | __rt_mutex_adjust_prio(current); | ||
346 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
347 | } | ||
348 | return 1; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * Try to take an rt-mutex | ||
353 | * | ||
354 | * This fails | ||
355 | * - when the lock has a real owner | ||
356 | * - when a different pending owner exists and has higher priority than current | ||
357 | * | ||
358 | * Must be called with lock->wait_lock held. | ||
359 | */ | ||
360 | static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) | ||
361 | { | ||
362 | /* | ||
363 | * We have to be careful here if the atomic speedups are | ||
364 | * enabled, because when | ||
365 | * - no other waiter is on the lock | ||
366 | * - the lock has been released since we did the cmpxchg | ||
367 | * the lock can be released or taken while we are doing the | ||
368 | * checks and marking the lock with RT_MUTEX_HAS_WAITERS. | ||
369 | * | ||
370 | * The atomic acquire/release aware variant of | ||
371 | * mark_rt_mutex_waiters uses a cmpxchg loop. After setting | ||
372 | * the WAITERS bit, the atomic release / acquire can not | ||
373 | * happen anymore and lock->wait_lock protects us from the | ||
374 | * non-atomic case. | ||
375 | * | ||
376 | * Note that this might set lock->owner = | ||
377 | * RT_MUTEX_HAS_WAITERS in the case the lock is not contended | ||
378 | * any more. This is fixed up when we take the ownership. | ||
379 | * This is the transitional state explained at the top of this file. | ||
380 | */ | ||
381 | mark_rt_mutex_waiters(lock); | ||
382 | |||
383 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) | ||
384 | return 0; | ||
385 | |||
386 | /* We got the lock. */ | ||
387 | debug_rt_mutex_lock(lock __IP__); | ||
388 | |||
389 | rt_mutex_set_owner(lock, current, 0); | ||
390 | |||
391 | rt_mutex_deadlock_account_lock(lock, current); | ||
392 | |||
393 | return 1; | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * Task blocks on lock. | ||
398 | * | ||
399 | * Prepare waiter and propagate pi chain | ||
400 | * | ||
401 | * This must be called with lock->wait_lock held. | ||
402 | */ | ||
403 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | ||
404 | struct rt_mutex_waiter *waiter, | ||
405 | int detect_deadlock | ||
406 | __IP_DECL__) | ||
407 | { | ||
408 | struct rt_mutex_waiter *top_waiter = waiter; | ||
409 | task_t *owner = rt_mutex_owner(lock); | ||
410 | int boost = 0, res; | ||
411 | unsigned long flags; | ||
412 | |||
413 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
414 | __rt_mutex_adjust_prio(current); | ||
415 | waiter->task = current; | ||
416 | waiter->lock = lock; | ||
417 | plist_node_init(&waiter->list_entry, current->prio); | ||
418 | plist_node_init(&waiter->pi_list_entry, current->prio); | ||
419 | |||
420 | /* Get the top priority waiter on the lock */ | ||
421 | if (rt_mutex_has_waiters(lock)) | ||
422 | top_waiter = rt_mutex_top_waiter(lock); | ||
423 | plist_add(&waiter->list_entry, &lock->wait_list); | ||
424 | |||
425 | current->pi_blocked_on = waiter; | ||
426 | |||
427 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
428 | |||
429 | if (waiter == rt_mutex_top_waiter(lock)) { | ||
430 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
431 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | ||
432 | plist_add(&waiter->pi_list_entry, &owner->pi_waiters); | ||
433 | |||
434 | __rt_mutex_adjust_prio(owner); | ||
435 | if (owner->pi_blocked_on) { | ||
436 | boost = 1; | ||
437 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
438 | get_task_struct(owner); | ||
439 | } | ||
440 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
441 | } | ||
442 | else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { | ||
443 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
444 | if (owner->pi_blocked_on) { | ||
445 | boost = 1; | ||
446 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
447 | get_task_struct(owner); | ||
448 | } | ||
449 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
450 | } | ||
451 | if (!boost) | ||
452 | return 0; | ||
453 | |||
454 | spin_unlock(&lock->wait_lock); | ||
455 | |||
456 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, | ||
457 | current __IP__); | ||
458 | |||
459 | spin_lock(&lock->wait_lock); | ||
460 | |||
461 | return res; | ||
462 | } | ||
463 | |||
464 | /* | ||
465 | * Wake up the next waiter on the lock. | ||
466 | * | ||
467 | * Remove the top waiter from the current task's waiter list and from | ||
468 | * the lock waiter list. Set it as pending owner. Then wake it up. | ||
469 | * | ||
470 | * Called with lock->wait_lock held. | ||
471 | */ | ||
472 | static void wakeup_next_waiter(struct rt_mutex *lock) | ||
473 | { | ||
474 | struct rt_mutex_waiter *waiter; | ||
475 | struct task_struct *pendowner; | ||
476 | unsigned long flags; | ||
477 | |||
478 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
479 | |||
480 | waiter = rt_mutex_top_waiter(lock); | ||
481 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
482 | |||
483 | /* | ||
484 | * Remove it from current->pi_waiters. We do not adjust a | ||
485 | * possible priority boost right now. We execute wakeup in the | ||
486 | * boosted mode and go back to normal after releasing | ||
487 | * lock->wait_lock. | ||
488 | */ | ||
489 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); | ||
490 | pendowner = waiter->task; | ||
491 | waiter->task = NULL; | ||
492 | |||
493 | rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); | ||
494 | |||
495 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
496 | |||
497 | /* | ||
498 | * Clear the pi_blocked_on variable and enqueue a possible | ||
499 | * waiter into the pi_waiters list of the pending owner. This | ||
500 | * prevents that, in case the pending owner gets unboosted, a | ||
501 | * waiter with a priority higher than pending-owner->normal_prio | ||
502 | * remains blocked on the unboosted (pending) owner. | ||
503 | */ | ||
504 | spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
505 | |||
506 | WARN_ON(!pendowner->pi_blocked_on); | ||
507 | WARN_ON(pendowner->pi_blocked_on != waiter); | ||
508 | WARN_ON(pendowner->pi_blocked_on->lock != lock); | ||
509 | |||
510 | pendowner->pi_blocked_on = NULL; | ||
511 | |||
512 | if (rt_mutex_has_waiters(lock)) { | ||
513 | struct rt_mutex_waiter *next; | ||
514 | |||
515 | next = rt_mutex_top_waiter(lock); | ||
516 | plist_add(&next->pi_list_entry, &pendowner->pi_waiters); | ||
517 | } | ||
518 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
519 | |||
520 | wake_up_process(pendowner); | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * Remove a waiter from a lock | ||
525 | * | ||
526 | * Must be called with lock->wait_lock held | ||
527 | */ | ||
528 | static void remove_waiter(struct rt_mutex *lock, | ||
529 | struct rt_mutex_waiter *waiter __IP_DECL__) | ||
530 | { | ||
531 | int first = (waiter == rt_mutex_top_waiter(lock)); | ||
532 | int boost = 0; | ||
533 | task_t *owner = rt_mutex_owner(lock); | ||
534 | unsigned long flags; | ||
535 | |||
536 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
537 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
538 | waiter->task = NULL; | ||
539 | current->pi_blocked_on = NULL; | ||
540 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
541 | |||
542 | if (first && owner != current) { | ||
543 | |||
544 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
545 | |||
546 | plist_del(&waiter->pi_list_entry, &owner->pi_waiters); | ||
547 | |||
548 | if (rt_mutex_has_waiters(lock)) { | ||
549 | struct rt_mutex_waiter *next; | ||
550 | |||
551 | next = rt_mutex_top_waiter(lock); | ||
552 | plist_add(&next->pi_list_entry, &owner->pi_waiters); | ||
553 | } | ||
554 | __rt_mutex_adjust_prio(owner); | ||
555 | |||
556 | if (owner->pi_blocked_on) { | ||
557 | boost = 1; | ||
558 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
559 | get_task_struct(owner); | ||
560 | } | ||
561 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
562 | } | ||
563 | |||
564 | WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
565 | |||
566 | if (!boost) | ||
567 | return; | ||
568 | |||
569 | spin_unlock(&lock->wait_lock); | ||
570 | |||
571 | rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__); | ||
572 | |||
573 | spin_lock(&lock->wait_lock); | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * Recheck the pi chain, in case we got a priority setting | ||
578 | * | ||
579 | * Called from sched_setscheduler | ||
580 | */ | ||
581 | void rt_mutex_adjust_pi(struct task_struct *task) | ||
582 | { | ||
583 | struct rt_mutex_waiter *waiter; | ||
584 | unsigned long flags; | ||
585 | |||
586 | spin_lock_irqsave(&task->pi_lock, flags); | ||
587 | |||
588 | waiter = task->pi_blocked_on; | ||
589 | if (!waiter || waiter->list_entry.prio == task->prio) { | ||
590 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
591 | return; | ||
592 | } | ||
593 | |||
594 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
595 | get_task_struct(task); | ||
596 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
597 | |||
598 | rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__); | ||
599 | } | ||
600 | |||
601 | /* | ||
602 | * Slow path lock function: | ||
603 | */ | ||
604 | static int __sched | ||
605 | rt_mutex_slowlock(struct rt_mutex *lock, int state, | ||
606 | struct hrtimer_sleeper *timeout, | ||
607 | int detect_deadlock __IP_DECL__) | ||
608 | { | ||
609 | struct rt_mutex_waiter waiter; | ||
610 | int ret = 0; | ||
611 | |||
612 | debug_rt_mutex_init_waiter(&waiter); | ||
613 | waiter.task = NULL; | ||
614 | |||
615 | spin_lock(&lock->wait_lock); | ||
616 | |||
617 | /* Try to acquire the lock again: */ | ||
618 | if (try_to_take_rt_mutex(lock __IP__)) { | ||
619 | spin_unlock(&lock->wait_lock); | ||
620 | return 0; | ||
621 | } | ||
622 | |||
623 | set_current_state(state); | ||
624 | |||
625 | /* Set up the timer when timeout != NULL */ | ||
626 | if (unlikely(timeout)) | ||
627 | hrtimer_start(&timeout->timer, timeout->timer.expires, | ||
628 | HRTIMER_ABS); | ||
629 | |||
630 | for (;;) { | ||
631 | /* Try to acquire the lock: */ | ||
632 | if (try_to_take_rt_mutex(lock __IP__)) | ||
633 | break; | ||
634 | |||
635 | /* | ||
636 | * TASK_INTERRUPTIBLE checks for signals and | ||
637 | * timeout. Ignored otherwise. | ||
638 | */ | ||
639 | if (unlikely(state == TASK_INTERRUPTIBLE)) { | ||
640 | /* Signal pending? */ | ||
641 | if (signal_pending(current)) | ||
642 | ret = -EINTR; | ||
643 | if (timeout && !timeout->task) | ||
644 | ret = -ETIMEDOUT; | ||
645 | if (ret) | ||
646 | break; | ||
647 | } | ||
648 | |||
649 | /* | ||
650 | * waiter.task is NULL the first time we come here and | ||
651 | * when we have been woken up by the previous owner | ||
652 | * but the lock got stolen by a higher prio task. | ||
653 | */ | ||
654 | if (!waiter.task) { | ||
655 | ret = task_blocks_on_rt_mutex(lock, &waiter, | ||
656 | detect_deadlock __IP__); | ||
657 | /* | ||
658 | * If we got woken up by the owner then start loop | ||
659 | * all over without going into schedule to try | ||
660 | * to get the lock now: | ||
661 | */ | ||
662 | if (unlikely(!waiter.task)) | ||
663 | continue; | ||
664 | |||
665 | if (unlikely(ret)) | ||
666 | break; | ||
667 | } | ||
668 | |||
669 | spin_unlock(&lock->wait_lock); | ||
670 | |||
671 | debug_rt_mutex_print_deadlock(&waiter); | ||
672 | |||
673 | if (waiter.task) | ||
674 | schedule_rt_mutex(lock); | ||
675 | |||
676 | spin_lock(&lock->wait_lock); | ||
677 | set_current_state(state); | ||
678 | } | ||
679 | |||
680 | set_current_state(TASK_RUNNING); | ||
681 | |||
682 | if (unlikely(waiter.task)) | ||
683 | remove_waiter(lock, &waiter __IP__); | ||
684 | |||
685 | /* | ||
686 | * try_to_take_rt_mutex() sets the waiter bit | ||
687 | * unconditionally. We might have to fix that up. | ||
688 | */ | ||
689 | fixup_rt_mutex_waiters(lock); | ||
690 | |||
691 | spin_unlock(&lock->wait_lock); | ||
692 | |||
693 | /* Remove pending timer: */ | ||
694 | if (unlikely(timeout)) | ||
695 | hrtimer_cancel(&timeout->timer); | ||
696 | |||
697 | /* | ||
698 | * Readjust priority, when we did not get the lock. We might | ||
699 | * have been the pending owner and boosted. Since we did not | ||
700 | * take the lock, the PI boost has to go. | ||
701 | */ | ||
702 | if (unlikely(ret)) | ||
703 | rt_mutex_adjust_prio(current); | ||
704 | |||
705 | debug_rt_mutex_free_waiter(&waiter); | ||
706 | |||
707 | return ret; | ||
708 | } | ||
709 | |||
710 | /* | ||
711 | * Slow path try-lock function: | ||
712 | */ | ||
713 | static inline int | ||
714 | rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) | ||
715 | { | ||
716 | int ret = 0; | ||
717 | |||
718 | spin_lock(&lock->wait_lock); | ||
719 | |||
720 | if (likely(rt_mutex_owner(lock) != current)) { | ||
721 | |||
722 | ret = try_to_take_rt_mutex(lock __IP__); | ||
723 | /* | ||
724 | * try_to_take_rt_mutex() sets the lock waiters | ||
725 | * bit unconditionally. Clean this up. | ||
726 | */ | ||
727 | fixup_rt_mutex_waiters(lock); | ||
728 | } | ||
729 | |||
730 | spin_unlock(&lock->wait_lock); | ||
731 | |||
732 | return ret; | ||
733 | } | ||
734 | |||
735 | /* | ||
736 | * Slow path to release a rt-mutex: | ||
737 | */ | ||
738 | static void __sched | ||
739 | rt_mutex_slowunlock(struct rt_mutex *lock) | ||
740 | { | ||
741 | spin_lock(&lock->wait_lock); | ||
742 | |||
743 | debug_rt_mutex_unlock(lock); | ||
744 | |||
745 | rt_mutex_deadlock_account_unlock(current); | ||
746 | |||
747 | if (!rt_mutex_has_waiters(lock)) { | ||
748 | lock->owner = NULL; | ||
749 | spin_unlock(&lock->wait_lock); | ||
750 | return; | ||
751 | } | ||
752 | |||
753 | wakeup_next_waiter(lock); | ||
754 | |||
755 | spin_unlock(&lock->wait_lock); | ||
756 | |||
757 | /* Undo pi boosting if necessary: */ | ||
758 | rt_mutex_adjust_prio(current); | ||
759 | } | ||
760 | |||
761 | /* | ||
762 | * debug aware fast / slowpath lock, trylock, unlock | ||
763 | * | ||
764 | * The atomic acquire/release ops are compiled away, when either the | ||
765 | * architecture does not support cmpxchg or when debugging is enabled. | ||
766 | */ | ||
767 | static inline int | ||
768 | rt_mutex_fastlock(struct rt_mutex *lock, int state, | ||
769 | int detect_deadlock, | ||
770 | int (*slowfn)(struct rt_mutex *lock, int state, | ||
771 | struct hrtimer_sleeper *timeout, | ||
772 | int detect_deadlock __IP_DECL__)) | ||
773 | { | ||
774 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
775 | rt_mutex_deadlock_account_lock(lock, current); | ||
776 | return 0; | ||
777 | } else | ||
778 | return slowfn(lock, state, NULL, detect_deadlock __RET_IP__); | ||
779 | } | ||
780 | |||
781 | static inline int | ||
782 | rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, | ||
783 | struct hrtimer_sleeper *timeout, int detect_deadlock, | ||
784 | int (*slowfn)(struct rt_mutex *lock, int state, | ||
785 | struct hrtimer_sleeper *timeout, | ||
786 | int detect_deadlock __IP_DECL__)) | ||
787 | { | ||
788 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
789 | rt_mutex_deadlock_account_lock(lock, current); | ||
790 | return 0; | ||
791 | } else | ||
792 | return slowfn(lock, state, timeout, detect_deadlock __RET_IP__); | ||
793 | } | ||
794 | |||
795 | static inline int | ||
796 | rt_mutex_fasttrylock(struct rt_mutex *lock, | ||
797 | int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) | ||
798 | { | ||
799 | if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
800 | rt_mutex_deadlock_account_lock(lock, current); | ||
801 | return 1; | ||
802 | } | ||
803 | return slowfn(lock __RET_IP__); | ||
804 | } | ||
805 | |||
806 | static inline void | ||
807 | rt_mutex_fastunlock(struct rt_mutex *lock, | ||
808 | void (*slowfn)(struct rt_mutex *lock)) | ||
809 | { | ||
810 | if (likely(rt_mutex_cmpxchg(lock, current, NULL))) | ||
811 | rt_mutex_deadlock_account_unlock(current); | ||
812 | else | ||
813 | slowfn(lock); | ||
814 | } | ||
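The wrappers above implement a common fast/slow dispatch: try a single compare-and-exchange on the owner word and only fall back to the slow path on contention (or when debugging/deadlock detection forces it). A standalone C11 sketch of the pattern, with illustrative names:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic(void *) owner;		/* NULL means unlocked */

static void slow_lock(void)
{
	puts("contended: taking the slow path");
}

static void fast_lock(void *self)
{
	void *expected = NULL;

	if (atomic_compare_exchange_strong(&owner, &expected, self))
		return;			/* fast path: uncontended acquire */
	slow_lock();
}

int main(void)
{
	int me;

	fast_lock(&me);			/* fast path */
	fast_lock(&me);			/* already owned -> slow path */
	return 0;
}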
815 | |||
816 | /** | ||
817 | * rt_mutex_lock - lock a rt_mutex | ||
818 | * | ||
819 | * @lock: the rt_mutex to be locked | ||
820 | */ | ||
821 | void __sched rt_mutex_lock(struct rt_mutex *lock) | ||
822 | { | ||
823 | might_sleep(); | ||
824 | |||
825 | rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); | ||
826 | } | ||
827 | EXPORT_SYMBOL_GPL(rt_mutex_lock); | ||
828 | |||
829 | /** | ||
830 | * rt_mutex_lock_interruptible - lock a rt_mutex interruptibly | ||
831 | * | ||
832 | * @lock: the rt_mutex to be locked | ||
833 | * @detect_deadlock: deadlock detection on/off | ||
834 | * | ||
835 | * Returns: | ||
836 | * 0 on success | ||
837 | * -EINTR when interrupted by a signal | ||
838 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
839 | */ | ||
840 | int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, | ||
841 | int detect_deadlock) | ||
842 | { | ||
843 | might_sleep(); | ||
844 | |||
845 | return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, | ||
846 | detect_deadlock, rt_mutex_slowlock); | ||
847 | } | ||
848 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); | ||
849 | |||
850 | /** | ||
851 | * rt_mutex_timed_lock - lock a rt_mutex interruptibly; | ||
852 | * the timeout structure is provided | ||
853 | * by the caller | ||
854 | * | ||
855 | * @lock: the rt_mutex to be locked | ||
856 | * @timeout: timeout structure or NULL (no timeout) | ||
857 | * @detect_deadlock: deadlock detection on/off | ||
858 | * | ||
859 | * Returns: | ||
860 | * 0 on success | ||
861 | * -EINTR when interrupted by a signal | ||
862 | * -ETIMEDOUT when the timeout expired | ||
863 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
864 | */ | ||
865 | int | ||
866 | rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, | ||
867 | int detect_deadlock) | ||
868 | { | ||
869 | might_sleep(); | ||
870 | |||
871 | return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | ||
872 | detect_deadlock, rt_mutex_slowlock); | ||
873 | } | ||
874 | EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | ||
875 | |||
876 | /** | ||
877 | * rt_mutex_trylock - try to lock a rt_mutex | ||
878 | * | ||
879 | * @lock: the rt_mutex to be locked | ||
880 | * | ||
881 | * Returns 1 on success and 0 on contention | ||
882 | */ | ||
883 | int __sched rt_mutex_trylock(struct rt_mutex *lock) | ||
884 | { | ||
885 | return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); | ||
886 | } | ||
887 | EXPORT_SYMBOL_GPL(rt_mutex_trylock); | ||
888 | |||
889 | /** | ||
890 | * rt_mutex_unlock - unlock a rt_mutex | ||
891 | * | ||
892 | * @lock: the rt_mutex to be unlocked | ||
893 | */ | ||
894 | void __sched rt_mutex_unlock(struct rt_mutex *lock) | ||
895 | { | ||
896 | rt_mutex_fastunlock(lock, rt_mutex_slowunlock); | ||
897 | } | ||
898 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); | ||
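A hedged sketch of how kernel code would use the API exported above (a module, not a userspace program; it assumes the DEFINE_RT_MUTEX() static initializer from include/linux/rtmutex.h in this series and a kernel built with CONFIG_RT_MUTEXES=y):

#include <linux/module.h>
#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(demo_lock);	/* assumed static initializer */

static int __init rtmutex_demo_init(void)
{
	rt_mutex_lock(&demo_lock);		/* uninterruptible */
	rt_mutex_unlock(&demo_lock);

	if (rt_mutex_trylock(&demo_lock)) {	/* 1 on success, 0 on contention */
		rt_mutex_unlock(&demo_lock);
	}

	/* second argument: deadlock detection off */
	if (rt_mutex_lock_interruptible(&demo_lock, 0) == 0)
		rt_mutex_unlock(&demo_lock);

	return 0;
}

static void __exit rtmutex_demo_exit(void)
{
	rt_mutex_destroy(&demo_lock);
}

module_init(rtmutex_demo_init);
module_exit(rtmutex_demo_exit);
MODULE_LICENSE("GPL");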
899 | |||
900 | /** | ||
901 | * rt_mutex_destroy - mark a mutex unusable | ||
902 | * @lock: the mutex to be destroyed | ||
903 | * | ||
904 | * This function marks the mutex uninitialized, and any subsequent | ||
905 | * use of the mutex is forbidden. The mutex must not be locked when | ||
906 | * this function is called. | ||
907 | */ | ||
908 | void rt_mutex_destroy(struct rt_mutex *lock) | ||
909 | { | ||
910 | WARN_ON(rt_mutex_is_locked(lock)); | ||
911 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
912 | lock->magic = NULL; | ||
913 | #endif | ||
914 | } | ||
915 | |||
916 | EXPORT_SYMBOL_GPL(rt_mutex_destroy); | ||
917 | |||
918 | /** | ||
919 | * __rt_mutex_init - initialize the rt lock | ||
920 | * | ||
921 | * @lock: the rt lock to be initialized | ||
922 | * | ||
923 | * Initialize the rt lock to unlocked state. | ||
924 | * | ||
925 | * Initializing a locked rt lock is not allowed | ||
926 | */ | ||
927 | void __rt_mutex_init(struct rt_mutex *lock, const char *name) | ||
928 | { | ||
929 | lock->owner = NULL; | ||
930 | spin_lock_init(&lock->wait_lock); | ||
931 | plist_head_init(&lock->wait_list, &lock->wait_lock); | ||
932 | |||
933 | debug_rt_mutex_init(lock, name); | ||
934 | } | ||
935 | EXPORT_SYMBOL_GPL(__rt_mutex_init); | ||
936 | |||
937 | /** | ||
938 | * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a | ||
939 | * proxy owner | ||
940 | * | ||
941 | * @lock: the rt_mutex to be locked | ||
942 | * @proxy_owner: the task to set as owner | ||
943 | * | ||
944 | * No locking. Caller has to do serializing itself | ||
945 | * Special API call for PI-futex support | ||
946 | */ | ||
947 | void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | ||
948 | struct task_struct *proxy_owner) | ||
949 | { | ||
950 | __rt_mutex_init(lock, NULL); | ||
951 | debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__); | ||
952 | rt_mutex_set_owner(lock, proxy_owner, 0); | ||
953 | rt_mutex_deadlock_account_lock(lock, proxy_owner); | ||
954 | } | ||
955 | |||
956 | /** | ||
957 | * rt_mutex_proxy_unlock - release a lock on behalf of owner | ||
958 | * | ||
959 | * @lock: the rt_mutex to be unlocked | ||
960 | * | ||
961 | * No locking. Caller has to do serializing itself | ||
962 | * Special API call for PI-futex support | ||
963 | */ | ||
964 | void rt_mutex_proxy_unlock(struct rt_mutex *lock, | ||
965 | struct task_struct *proxy_owner) | ||
966 | { | ||
967 | debug_rt_mutex_proxy_unlock(lock); | ||
968 | rt_mutex_set_owner(lock, NULL, 0); | ||
969 | rt_mutex_deadlock_account_unlock(proxy_owner); | ||
970 | } | ||
971 | |||
972 | /** | ||
973 | * rt_mutex_next_owner - return the next owner of the lock | ||
974 | * | ||
975 | * @lock: the rt lock to query | ||
976 | * | ||
977 | * Returns the next owner of the lock or NULL | ||
978 | * | ||
979 | * Caller has to serialize against other accessors to the lock | ||
980 | * itself. | ||
981 | * | ||
982 | * Special API call for PI-futex support | ||
983 | */ | ||
984 | struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) | ||
985 | { | ||
986 | if (!rt_mutex_has_waiters(lock)) | ||
987 | return NULL; | ||
988 | |||
989 | return rt_mutex_top_waiter(lock)->task; | ||
990 | } | ||
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h new file mode 100644 index 000000000000..1e0fca13ff72 --- /dev/null +++ b/kernel/rtmutex.h | |||
@@ -0,0 +1,29 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains macros used solely by rtmutex.c. | ||
10 | * Non-debug version. | ||
11 | */ | ||
12 | |||
13 | #define __IP_DECL__ | ||
14 | #define __IP__ | ||
15 | #define __RET_IP__ | ||
16 | #define rt_mutex_deadlock_check(l) (0) | ||
17 | #define rt_mutex_deadlock_account_lock(m, t) do { } while (0) | ||
18 | #define rt_mutex_deadlock_account_unlock(l) do { } while (0) | ||
19 | #define debug_rt_mutex_init_waiter(w) do { } while (0) | ||
20 | #define debug_rt_mutex_free_waiter(w) do { } while (0) | ||
21 | #define debug_rt_mutex_lock(l) do { } while (0) | ||
22 | #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) | ||
23 | #define debug_rt_mutex_proxy_unlock(l) do { } while (0) | ||
24 | #define debug_rt_mutex_unlock(l) do { } while (0) | ||
25 | #define debug_rt_mutex_init(m, n) do { } while (0) | ||
26 | #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) | ||
27 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) | ||
28 | #define debug_rt_mutex_detect_deadlock(w,d) (d) | ||
29 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) | ||
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h new file mode 100644 index 000000000000..9c75856e791e --- /dev/null +++ b/kernel/rtmutex_common.h | |||
@@ -0,0 +1,123 @@ | |||
1 | /* | ||
2 | * RT Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains the private data structure and API definitions. | ||
10 | */ | ||
11 | |||
12 | #ifndef __KERNEL_RTMUTEX_COMMON_H | ||
13 | #define __KERNEL_RTMUTEX_COMMON_H | ||
14 | |||
15 | #include <linux/rtmutex.h> | ||
16 | |||
17 | /* | ||
18 | * The rtmutex in-kernel tester is independent of rtmutex debugging. We | ||
19 | * call schedule_rt_mutex_test() instead of schedule() for the tasks which | ||
20 | * belong to the tester. That way we can delay the wakeup path of those | ||
21 | * threads to provoke lock stealing and testing of complex boosting scenarios. | ||
22 | */ | ||
23 | #ifdef CONFIG_RT_MUTEX_TESTER | ||
24 | |||
25 | extern void schedule_rt_mutex_test(struct rt_mutex *lock); | ||
26 | |||
27 | #define schedule_rt_mutex(_lock) \ | ||
28 | do { \ | ||
29 | if (!(current->flags & PF_MUTEX_TESTER)) \ | ||
30 | schedule(); \ | ||
31 | else \ | ||
32 | schedule_rt_mutex_test(_lock); \ | ||
33 | } while (0) | ||
34 | |||
35 | #else | ||
36 | # define schedule_rt_mutex(_lock) schedule() | ||
37 | #endif | ||
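The dispatch above hinges on a per-thread flag: ordinary tasks keep calling schedule(), while tester threads are routed through a hook whose wakeups can be delayed. A trivial userspace sketch of the same pattern (the flag and hook names are illustrative):

#include <stdio.h>

#define PF_TESTER 0x1

struct thread { unsigned int flags; };

static void plain_schedule(void)  { puts("schedule()"); }
static void tester_schedule(void) { puts("schedule_rt_mutex_test()"); }

static void schedule_hook(struct thread *t)
{
	if (!(t->flags & PF_TESTER))
		plain_schedule();
	else
		tester_schedule();
}

int main(void)
{
	struct thread normal = { 0 }, tester = { PF_TESTER };

	schedule_hook(&normal);
	schedule_hook(&tester);
	return 0;
}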
38 | |||
39 | /* | ||
40 | * This is the control structure for tasks blocked on a rt_mutex, | ||
41 | * which is allocated on the kernel stack of the blocked task. | ||
42 | * | ||
43 | * @list_entry: pi node to enqueue into the mutex waiters list | ||
44 | * @pi_list_entry: pi node to enqueue into the mutex owner waiters list | ||
45 | * @task: task reference to the blocked task | ||
46 | */ | ||
47 | struct rt_mutex_waiter { | ||
48 | struct plist_node list_entry; | ||
49 | struct plist_node pi_list_entry; | ||
50 | struct task_struct *task; | ||
51 | struct rt_mutex *lock; | ||
52 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
53 | unsigned long ip; | ||
54 | pid_t deadlock_task_pid; | ||
55 | struct rt_mutex *deadlock_lock; | ||
56 | #endif | ||
57 | }; | ||
58 | |||
59 | /* | ||
60 | * Various helpers to access the waiters-plist: | ||
61 | */ | ||
62 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) | ||
63 | { | ||
64 | return !plist_head_empty(&lock->wait_list); | ||
65 | } | ||
66 | |||
67 | static inline struct rt_mutex_waiter * | ||
68 | rt_mutex_top_waiter(struct rt_mutex *lock) | ||
69 | { | ||
70 | struct rt_mutex_waiter *w; | ||
71 | |||
72 | w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, | ||
73 | list_entry); | ||
74 | BUG_ON(w->lock != lock); | ||
75 | |||
76 | return w; | ||
77 | } | ||
78 | |||
79 | static inline int task_has_pi_waiters(struct task_struct *p) | ||
80 | { | ||
81 | return !plist_head_empty(&p->pi_waiters); | ||
82 | } | ||
83 | |||
84 | static inline struct rt_mutex_waiter * | ||
85 | task_top_pi_waiter(struct task_struct *p) | ||
86 | { | ||
87 | return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, | ||
88 | pi_list_entry); | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * lock->owner state tracking: | ||
93 | */ | ||
94 | #define RT_MUTEX_OWNER_PENDING 1UL | ||
95 | #define RT_MUTEX_HAS_WAITERS 2UL | ||
96 | #define RT_MUTEX_OWNER_MASKALL 3UL | ||
97 | |||
98 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | ||
99 | { | ||
100 | return (struct task_struct *) | ||
101 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); | ||
102 | } | ||
103 | |||
104 | static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) | ||
105 | { | ||
106 | return (struct task_struct *) | ||
107 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
108 | } | ||
109 | |||
110 | static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) | ||
111 | { | ||
112 | return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; | ||
113 | } | ||
114 | |||
115 | /* | ||
116 | * PI-futex support (proxy locking functions, etc.): | ||
117 | */ | ||
118 | extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); | ||
119 | extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | ||
120 | struct task_struct *proxy_owner); | ||
121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | ||
122 | struct task_struct *proxy_owner); | ||
123 | #endif | ||
diff --git a/kernel/sched.c b/kernel/sched.c index c13f1bd2df7d..d5e37072ea54 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -168,15 +168,21 @@ | |||
168 | */ | 168 | */ |
169 | 169 | ||
170 | #define SCALE_PRIO(x, prio) \ | 170 | #define SCALE_PRIO(x, prio) \ |
171 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 171 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
172 | 172 | ||
173 | static unsigned int task_timeslice(task_t *p) | 173 | static unsigned int static_prio_timeslice(int static_prio) |
174 | { | 174 | { |
175 | if (p->static_prio < NICE_TO_PRIO(0)) | 175 | if (static_prio < NICE_TO_PRIO(0)) |
176 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 176 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
177 | else | 177 | else |
178 | return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); | 178 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
179 | } | 179 | } |
180 | |||
181 | static inline unsigned int task_timeslice(task_t *p) | ||
182 | { | ||
183 | return static_prio_timeslice(p->static_prio); | ||
184 | } | ||
185 | |||
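To see what the scaling produces, here is a userspace re-computation of static_prio_timeslice() with the usual O(1)-scheduler constants expressed in milliseconds (in the kernel these are jiffies-based; DEF_TIMESLICE is assumed to be 100 ms, MIN_TIMESLICE 5 ms, MAX_PRIO 140, MAX_USER_PRIO 40, and nice 0 maps to priority 120):

#include <stdio.h>

#define MIN_TIMESLICE	5	/* ms, assumed */
#define DEF_TIMESLICE	100	/* ms, assumed */
#define MAX_PRIO	140
#define MAX_USER_PRIO	40
#define NICE0_PRIO	120

static int scale_prio(int x, int prio)
{
	int slice = x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2);

	return slice > MIN_TIMESLICE ? slice : MIN_TIMESLICE;
}

static int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE0_PRIO)
		return scale_prio(DEF_TIMESLICE * 4, static_prio);
	return scale_prio(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	printf("nice -20: %d ms\n", static_prio_timeslice(100));	/* 800 */
	printf("nice   0: %d ms\n", static_prio_timeslice(120));	/* 100 */
	printf("nice  19: %d ms\n", static_prio_timeslice(139));	/*   5 */
	return 0;
}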
180 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | 186 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ |
181 | < (long long) (sd)->cache_hot_time) | 187 | < (long long) (sd)->cache_hot_time) |
182 | 188 | ||
@@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p) | |||
184 | * These are the runqueue data structures: | 190 | * These are the runqueue data structures: |
185 | */ | 191 | */ |
186 | 192 | ||
187 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | ||
188 | |||
189 | typedef struct runqueue runqueue_t; | 193 | typedef struct runqueue runqueue_t; |
190 | 194 | ||
191 | struct prio_array { | 195 | struct prio_array { |
192 | unsigned int nr_active; | 196 | unsigned int nr_active; |
193 | unsigned long bitmap[BITMAP_SIZE]; | 197 | DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ |
194 | struct list_head queue[MAX_PRIO]; | 198 | struct list_head queue[MAX_PRIO]; |
195 | }; | 199 | }; |
196 | 200 | ||
@@ -209,6 +213,7 @@ struct runqueue { | |||
209 | * remote CPUs use both these fields when doing load calculation. | 213 | * remote CPUs use both these fields when doing load calculation. |
210 | */ | 214 | */ |
211 | unsigned long nr_running; | 215 | unsigned long nr_running; |
216 | unsigned long raw_weighted_load; | ||
212 | #ifdef CONFIG_SMP | 217 | #ifdef CONFIG_SMP |
213 | unsigned long cpu_load[3]; | 218 | unsigned long cpu_load[3]; |
214 | #endif | 219 | #endif |
@@ -239,7 +244,6 @@ struct runqueue { | |||
239 | 244 | ||
240 | task_t *migration_thread; | 245 | task_t *migration_thread; |
241 | struct list_head migration_queue; | 246 | struct list_head migration_queue; |
242 | int cpu; | ||
243 | #endif | 247 | #endif |
244 | 248 | ||
245 | #ifdef CONFIG_SCHEDSTATS | 249 | #ifdef CONFIG_SCHEDSTATS |
@@ -351,11 +355,30 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | |||
351 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 355 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
352 | 356 | ||
353 | /* | 357 | /* |
358 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
359 | * Must be called interrupts disabled. | ||
360 | */ | ||
361 | static inline runqueue_t *__task_rq_lock(task_t *p) | ||
362 | __acquires(rq->lock) | ||
363 | { | ||
364 | struct runqueue *rq; | ||
365 | |||
366 | repeat_lock_task: | ||
367 | rq = task_rq(p); | ||
368 | spin_lock(&rq->lock); | ||
369 | if (unlikely(rq != task_rq(p))) { | ||
370 | spin_unlock(&rq->lock); | ||
371 | goto repeat_lock_task; | ||
372 | } | ||
373 | return rq; | ||
374 | } | ||
375 | |||
376 | /* | ||
354 | * task_rq_lock - lock the runqueue a given task resides on and disable | 377 | * task_rq_lock - lock the runqueue a given task resides on and disable |
355 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 378 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
356 | * explicitly disabling preemption. | 379 | * explicitly disabling preemption. |
357 | */ | 380 | */ |
358 | static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) | 381 | static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) |
359 | __acquires(rq->lock) | 382 | __acquires(rq->lock) |
360 | { | 383 | { |
361 | struct runqueue *rq; | 384 | struct runqueue *rq; |
@@ -371,6 +394,12 @@ repeat_lock_task: | |||
371 | return rq; | 394 | return rq; |
372 | } | 395 | } |
373 | 396 | ||
397 | static inline void __task_rq_unlock(runqueue_t *rq) | ||
398 | __releases(rq->lock) | ||
399 | { | ||
400 | spin_unlock(&rq->lock); | ||
401 | } | ||
402 | |||
374 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | 403 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) |
375 | __releases(rq->lock) | 404 | __releases(rq->lock) |
376 | { | 405 | { |
@@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
634 | } | 663 | } |
635 | 664 | ||
636 | /* | 665 | /* |
637 | * effective_prio - return the priority that is based on the static | 666 | * __normal_prio - return the priority that is based on the static |
638 | * priority but is modified by bonuses/penalties. | 667 | * priority but is modified by bonuses/penalties. |
639 | * | 668 | * |
640 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | 669 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
@@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
647 | * | 676 | * |
648 | * Both properties are important to certain workloads. | 677 | * Both properties are important to certain workloads. |
649 | */ | 678 | */ |
650 | static int effective_prio(task_t *p) | 679 | |
680 | static inline int __normal_prio(task_t *p) | ||
651 | { | 681 | { |
652 | int bonus, prio; | 682 | int bonus, prio; |
653 | 683 | ||
654 | if (rt_task(p)) | ||
655 | return p->prio; | ||
656 | |||
657 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | 684 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
658 | 685 | ||
659 | prio = p->static_prio - bonus; | 686 | prio = p->static_prio - bonus; |
@@ -665,6 +692,106 @@ static int effective_prio(task_t *p) | |||
665 | } | 692 | } |
666 | 693 | ||
667 | /* | 694 | /* |
695 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
696 | * of tasks with abnormal "nice" values across CPUs, the contribution that | ||
697 | * each task makes to its run queue's load is weighted according to its | ||
698 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
699 | * scaled version of the new time slice allocation that they receive on time | ||
700 | * slice expiry etc. | ||
701 | */ | ||
702 | |||
703 | /* | ||
704 | * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE | ||
705 | * If static_prio_timeslice() is ever changed to break this assumption then | ||
706 | * this code will need modification | ||
707 | */ | ||
708 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
709 | #define LOAD_WEIGHT(lp) \ | ||
710 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
711 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
712 | LOAD_WEIGHT(static_prio_timeslice(prio)) | ||
713 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
714 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | ||
715 | |||
716 | static void set_load_weight(task_t *p) | ||
717 | { | ||
718 | if (has_rt_policy(p)) { | ||
719 | #ifdef CONFIG_SMP | ||
720 | if (p == task_rq(p)->migration_thread) | ||
721 | /* | ||
722 | * The migration thread does the actual balancing. | ||
723 | * Giving its load any weight will skew balancing | ||
724 | * adversely. | ||
725 | */ | ||
726 | p->load_weight = 0; | ||
727 | else | ||
728 | #endif | ||
729 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
730 | } else | ||
731 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
732 | } | ||
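To make the weighting concrete: because LOAD_WEIGHT() is normalised to the nice-0 timeslice, a nice-0 task contributes exactly one SCHED_LOAD_SCALE to raw_weighted_load, a nice 19 task roughly 5% of that, and a nice -20 task eight times it. A small sketch (SCHED_LOAD_SCALE is assumed to be 128 here; the ratios follow from the 5/100/800 ms timeslices and hold whatever the scale is):

#include <stdio.h>

#define SCHED_LOAD_SCALE	128UL	/* assumed value */
#define TIME_SLICE_NICE_ZERO	100	/* DEF_TIMESLICE in ms */

#define LOAD_WEIGHT(lp) ((lp) * SCHED_LOAD_SCALE / TIME_SLICE_NICE_ZERO)

int main(void)
{
	printf("nice -20 (800 ms slice): %lu\n", LOAD_WEIGHT(800));	/* 1024 */
	printf("nice   0 (100 ms slice): %lu\n", LOAD_WEIGHT(100));	/*  128 */
	printf("nice  19   (5 ms slice): %lu\n", LOAD_WEIGHT(5));	/*    6 */
	return 0;
}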
733 | |||
734 | static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) | ||
735 | { | ||
736 | rq->raw_weighted_load += p->load_weight; | ||
737 | } | ||
738 | |||
739 | static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p) | ||
740 | { | ||
741 | rq->raw_weighted_load -= p->load_weight; | ||
742 | } | ||
743 | |||
744 | static inline void inc_nr_running(task_t *p, runqueue_t *rq) | ||
745 | { | ||
746 | rq->nr_running++; | ||
747 | inc_raw_weighted_load(rq, p); | ||
748 | } | ||
749 | |||
750 | static inline void dec_nr_running(task_t *p, runqueue_t *rq) | ||
751 | { | ||
752 | rq->nr_running--; | ||
753 | dec_raw_weighted_load(rq, p); | ||
754 | } | ||
755 | |||
756 | /* | ||
757 | * Calculate the expected normal priority: i.e. priority | ||
758 | * without taking RT-inheritance into account. Might be | ||
759 | * boosted by interactivity modifiers. Changes upon fork, | ||
760 | * setprio syscalls, and whenever the interactivity | ||
761 | * estimator recalculates. | ||
762 | */ | ||
763 | static inline int normal_prio(task_t *p) | ||
764 | { | ||
765 | int prio; | ||
766 | |||
767 | if (has_rt_policy(p)) | ||
768 | prio = MAX_RT_PRIO-1 - p->rt_priority; | ||
769 | else | ||
770 | prio = __normal_prio(p); | ||
771 | return prio; | ||
772 | } | ||
773 | |||
774 | /* | ||
775 | * Calculate the current priority, i.e. the priority | ||
776 | * taken into account by the scheduler. This value might | ||
777 | * be boosted by RT tasks, or might be boosted by | ||
778 | * interactivity modifiers. Will be RT if the task got | ||
779 | * RT-boosted. If not then it returns p->normal_prio. | ||
780 | */ | ||
781 | static int effective_prio(task_t *p) | ||
782 | { | ||
783 | p->normal_prio = normal_prio(p); | ||
784 | /* | ||
785 | * If we are RT tasks or we were boosted to RT priority, | ||
786 | * keep the priority unchanged. Otherwise, update priority | ||
787 | * to the normal priority: | ||
788 | */ | ||
789 | if (!rt_prio(p->prio)) | ||
790 | return p->normal_prio; | ||
791 | return p->prio; | ||
792 | } | ||
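A standalone sketch of how normal_prio() and effective_prio() interact (lower value = higher priority, RT range 0-99, normal range 100-139; the interactivity bonus is omitted and all numbers are illustrative):

#include <stdio.h>

#define MAX_RT_PRIO 100

struct task {
	int static_prio;
	int rt_priority;
	int prio;		/* current, possibly PI-boosted */
	int normal_prio;
	int has_rt_policy;
};

static int normal_prio(struct task *p)
{
	if (p->has_rt_policy)
		return MAX_RT_PRIO - 1 - p->rt_priority;
	return p->static_prio;		/* interactivity bonus omitted */
}

static int effective_prio(struct task *p)
{
	p->normal_prio = normal_prio(p);
	/* keep an RT (possibly boosted) priority, otherwise follow normal_prio */
	if (p->prio >= MAX_RT_PRIO)
		return p->normal_prio;
	return p->prio;
}

int main(void)
{
	/* SCHED_NORMAL task currently PI-boosted to RT priority 90 */
	struct task p = { .static_prio = 120, .prio = 90 };

	printf("normal=%d effective=%d\n", normal_prio(&p), effective_prio(&p));
	/* prints normal=120 effective=90: the boost is preserved */
	return 0;
}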
793 | |||
794 | /* | ||
668 | * __activate_task - move a task to the runqueue. | 795 | * __activate_task - move a task to the runqueue. |
669 | */ | 796 | */ |
670 | static void __activate_task(task_t *p, runqueue_t *rq) | 797 | static void __activate_task(task_t *p, runqueue_t *rq) |
@@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq) | |||
674 | if (batch_task(p)) | 801 | if (batch_task(p)) |
675 | target = rq->expired; | 802 | target = rq->expired; |
676 | enqueue_task(p, target); | 803 | enqueue_task(p, target); |
677 | rq->nr_running++; | 804 | inc_nr_running(p, rq); |
678 | } | 805 | } |
679 | 806 | ||
680 | /* | 807 | /* |
@@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq) | |||
683 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 810 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) |
684 | { | 811 | { |
685 | enqueue_task_head(p, rq->active); | 812 | enqueue_task_head(p, rq->active); |
686 | rq->nr_running++; | 813 | inc_nr_running(p, rq); |
687 | } | 814 | } |
688 | 815 | ||
816 | /* | ||
817 | * Recalculate p->normal_prio and p->prio after having slept, | ||
818 | * updating the sleep-average too: | ||
819 | */ | ||
689 | static int recalc_task_prio(task_t *p, unsigned long long now) | 820 | static int recalc_task_prio(task_t *p, unsigned long long now) |
690 | { | 821 | { |
691 | /* Caller must always ensure 'now >= p->timestamp' */ | 822 | /* Caller must always ensure 'now >= p->timestamp' */ |
692 | unsigned long long __sleep_time = now - p->timestamp; | 823 | unsigned long sleep_time = now - p->timestamp; |
693 | unsigned long sleep_time; | ||
694 | 824 | ||
695 | if (batch_task(p)) | 825 | if (batch_task(p)) |
696 | sleep_time = 0; | 826 | sleep_time = 0; |
697 | else { | ||
698 | if (__sleep_time > NS_MAX_SLEEP_AVG) | ||
699 | sleep_time = NS_MAX_SLEEP_AVG; | ||
700 | else | ||
701 | sleep_time = (unsigned long)__sleep_time; | ||
702 | } | ||
703 | 827 | ||
704 | if (likely(sleep_time > 0)) { | 828 | if (likely(sleep_time > 0)) { |
705 | /* | 829 | /* |
706 | * User tasks that sleep a long time are categorised as | 830 | * This ceiling is set to the lowest priority that would allow |
707 | * idle. They will only have their sleep_avg increased to a | 831 | * a task to be reinserted into the active array on timeslice |
708 | * level that makes them just interactive priority to stay | 832 | * completion. |
709 | * active yet prevent them suddenly becoming cpu hogs and | ||
710 | * starving other processes. | ||
711 | */ | 833 | */ |
712 | if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { | 834 | unsigned long ceiling = INTERACTIVE_SLEEP(p); |
713 | unsigned long ceiling; | ||
714 | 835 | ||
715 | ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - | 836 | if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { |
716 | DEF_TIMESLICE); | 837 | /* |
717 | if (p->sleep_avg < ceiling) | 838 | * Prevents user tasks from achieving best priority |
718 | p->sleep_avg = ceiling; | 839 | * with one single large enough sleep. |
840 | */ | ||
841 | p->sleep_avg = ceiling; | ||
842 | /* | ||
843 | * Using INTERACTIVE_SLEEP() as a ceiling places a | ||
844 | * nice(0) task 1ms sleep away from promotion, and | ||
845 | * gives it 700ms to round-robin with no chance of | ||
846 | * being demoted. This is more than generous, so | ||
847 | * mark this sleep as non-interactive to prevent the | ||
848 | * on-runqueue bonus logic from intervening should | ||
849 | * this task not receive cpu immediately. | ||
850 | */ | ||
851 | p->sleep_type = SLEEP_NONINTERACTIVE; | ||
719 | } else { | 852 | } else { |
720 | /* | 853 | /* |
721 | * Tasks waking from uninterruptible sleep are | 854 | * Tasks waking from uninterruptible sleep are |
@@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
723 | * are likely to be waiting on I/O | 856 | * are likely to be waiting on I/O |
724 | */ | 857 | */ |
725 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | 858 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
726 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | 859 | if (p->sleep_avg >= ceiling) |
727 | sleep_time = 0; | 860 | sleep_time = 0; |
728 | else if (p->sleep_avg + sleep_time >= | 861 | else if (p->sleep_avg + sleep_time >= |
729 | INTERACTIVE_SLEEP(p)) { | 862 | ceiling) { |
730 | p->sleep_avg = INTERACTIVE_SLEEP(p); | 863 | p->sleep_avg = ceiling; |
731 | sleep_time = 0; | 864 | sleep_time = 0; |
732 | } | 865 | } |
733 | } | 866 | } |
734 | 867 | ||
@@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
742 | */ | 875 | */ |
743 | p->sleep_avg += sleep_time; | 876 | p->sleep_avg += sleep_time; |
744 | 877 | ||
745 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
746 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
747 | } | 878 | } |
879 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
880 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
748 | } | 881 | } |
749 | 882 | ||
750 | return effective_prio(p); | 883 | return effective_prio(p); |
@@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
805 | */ | 938 | */ |
806 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 939 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) |
807 | { | 940 | { |
808 | rq->nr_running--; | 941 | dec_nr_running(p, rq); |
809 | dequeue_task(p, p->array); | 942 | dequeue_task(p, p->array); |
810 | p->array = NULL; | 943 | p->array = NULL; |
811 | } | 944 | } |
@@ -818,6 +951,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) | |||
818 | * the target CPU. | 951 | * the target CPU. |
819 | */ | 952 | */ |
820 | #ifdef CONFIG_SMP | 953 | #ifdef CONFIG_SMP |
954 | |||
955 | #ifndef tsk_is_polling | ||
956 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | ||
957 | #endif | ||
958 | |||
821 | static void resched_task(task_t *p) | 959 | static void resched_task(task_t *p) |
822 | { | 960 | { |
823 | int cpu; | 961 | int cpu; |
@@ -833,9 +971,9 @@ static void resched_task(task_t *p) | |||
833 | if (cpu == smp_processor_id()) | 971 | if (cpu == smp_processor_id()) |
834 | return; | 972 | return; |
835 | 973 | ||
836 | /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ | 974 | /* NEED_RESCHED must be visible before we test polling */ |
837 | smp_mb(); | 975 | smp_mb(); |
838 | if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) | 976 | if (!tsk_is_polling(p)) |
839 | smp_send_reschedule(cpu); | 977 | smp_send_reschedule(cpu); |
840 | } | 978 | } |
841 | #else | 979 | #else |
@@ -855,6 +993,12 @@ inline int task_curr(const task_t *p) | |||
855 | return cpu_curr(task_cpu(p)) == p; | 993 | return cpu_curr(task_cpu(p)) == p; |
856 | } | 994 | } |
857 | 995 | ||
996 | /* Used instead of source_load when we know the type == 0 */ | ||
997 | unsigned long weighted_cpuload(const int cpu) | ||
998 | { | ||
999 | return cpu_rq(cpu)->raw_weighted_load; | ||
1000 | } | ||
1001 | |||
858 | #ifdef CONFIG_SMP | 1002 | #ifdef CONFIG_SMP |
859 | typedef struct { | 1003 | typedef struct { |
860 | struct list_head list; | 1004 | struct list_head list; |
@@ -944,7 +1088,8 @@ void kick_process(task_t *p) | |||
944 | } | 1088 | } |
945 | 1089 | ||
946 | /* | 1090 | /* |
947 | * Return a low guess at the load of a migration-source cpu. | 1091 | * Return a low guess at the load of a migration-source cpu weighted |
1092 | * according to the scheduling class and "nice" value. | ||
948 | * | 1093 | * |
949 | * We want to under-estimate the load of migration sources, to | 1094 | * We want to under-estimate the load of migration sources, to |
950 | * balance conservatively. | 1095 | * balance conservatively. |
@@ -952,24 +1097,36 @@ void kick_process(task_t *p) | |||
952 | static inline unsigned long source_load(int cpu, int type) | 1097 | static inline unsigned long source_load(int cpu, int type) |
953 | { | 1098 | { |
954 | runqueue_t *rq = cpu_rq(cpu); | 1099 | runqueue_t *rq = cpu_rq(cpu); |
955 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1100 | |
956 | if (type == 0) | 1101 | if (type == 0) |
957 | return load_now; | 1102 | return rq->raw_weighted_load; |
958 | 1103 | ||
959 | return min(rq->cpu_load[type-1], load_now); | 1104 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); |
960 | } | 1105 | } |
961 | 1106 | ||
962 | /* | 1107 | /* |
963 | * Return a high guess at the load of a migration-target cpu | 1108 | * Return a high guess at the load of a migration-target cpu weighted |
1109 | * according to the scheduling class and "nice" value. | ||
964 | */ | 1110 | */ |
965 | static inline unsigned long target_load(int cpu, int type) | 1111 | static inline unsigned long target_load(int cpu, int type) |
966 | { | 1112 | { |
967 | runqueue_t *rq = cpu_rq(cpu); | 1113 | runqueue_t *rq = cpu_rq(cpu); |
968 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1114 | |
969 | if (type == 0) | 1115 | if (type == 0) |
970 | return load_now; | 1116 | return rq->raw_weighted_load; |
971 | 1117 | ||
972 | return max(rq->cpu_load[type-1], load_now); | 1118 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); |
1119 | } | ||
1120 | |||
1121 | /* | ||
1122 | * Return the average load per task on the cpu's run queue | ||
1123 | */ | ||
1124 | static inline unsigned long cpu_avg_load_per_task(int cpu) | ||
1125 | { | ||
1126 | runqueue_t *rq = cpu_rq(cpu); | ||
1127 | unsigned long n = rq->nr_running; | ||
1128 | |||
1129 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; | ||
973 | } | 1130 | } |
974 | 1131 | ||
975 | /* | 1132 | /* |
@@ -1042,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
1042 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1199 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
1043 | 1200 | ||
1044 | for_each_cpu_mask(i, tmp) { | 1201 | for_each_cpu_mask(i, tmp) { |
1045 | load = source_load(i, 0); | 1202 | load = weighted_cpuload(i); |
1046 | 1203 | ||
1047 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1204 | if (load < min_load || (load == min_load && i == this_cpu)) { |
1048 | min_load = load; | 1205 | min_load = load; |
@@ -1069,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag) | |||
1069 | struct task_struct *t = current; | 1226 | struct task_struct *t = current; |
1070 | struct sched_domain *tmp, *sd = NULL; | 1227 | struct sched_domain *tmp, *sd = NULL; |
1071 | 1228 | ||
1072 | for_each_domain(cpu, tmp) | 1229 | for_each_domain(cpu, tmp) { |
1230 | /* | ||
1231 | * If power savings logic is enabled for a domain, stop there. | ||
1232 | */ | ||
1233 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
1234 | break; | ||
1073 | if (tmp->flags & flag) | 1235 | if (tmp->flags & flag) |
1074 | sd = tmp; | 1236 | sd = tmp; |
1237 | } | ||
1075 | 1238 | ||
1076 | while (sd) { | 1239 | while (sd) { |
1077 | cpumask_t span; | 1240 | cpumask_t span; |
@@ -1221,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
1221 | 1384 | ||
1222 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1385 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1223 | unsigned long tl = this_load; | 1386 | unsigned long tl = this_load; |
1387 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1388 | |||
1224 | /* | 1389 | /* |
1225 | * If sync wakeup then subtract the (maximum possible) | 1390 | * If sync wakeup then subtract the (maximum possible) |
1226 | * effect of the currently running task from the load | 1391 | * effect of the currently running task from the load |
1227 | * of the current CPU: | 1392 | * of the current CPU: |
1228 | */ | 1393 | */ |
1229 | if (sync) | 1394 | if (sync) |
1230 | tl -= SCHED_LOAD_SCALE; | 1395 | tl -= current->load_weight; |
1231 | 1396 | ||
1232 | if ((tl <= load && | 1397 | if ((tl <= load && |
1233 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | 1398 | tl + target_load(cpu, idx) <= tl_per_task) || |
1234 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | 1399 | 100*(tl + p->load_weight) <= imbalance*load) { |
1235 | /* | 1400 | /* |
1236 | * This domain has SD_WAKE_AFFINE and | 1401 | * This domain has SD_WAKE_AFFINE and |
1237 | * p is cache cold in this domain, and | 1402 | * p is cache cold in this domain, and |
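The affine-wakeup test in the hunk above now works in weighted-load units: the waking CPU is preferred either when it is lighter than the task's previous CPU and adding that CPU's load still fits under the local average load per task, or when the imbalance_pct margin is met. A sketch of just that predicate; the parameter names mirror the patch, the numbers are made up.

#include <stdio.h>
#include <stdbool.h>

static bool wake_affine(unsigned long this_load,     /* tl, waker's cpu        */
			unsigned long prev_load,     /* load of p's old cpu    */
			unsigned long target,        /* target_load(old cpu)   */
			unsigned long tl_per_task,   /* avg load per task here */
			unsigned long p_weight,      /* p->load_weight         */
			unsigned int imbalance_pct)  /* sd->imbalance_pct      */
{
	return (this_load <= prev_load &&
		this_load + target <= tl_per_task) ||
	       100 * (this_load + p_weight) <= imbalance_pct * prev_load;
}

int main(void)
{
	/* idle waker, busy previous cpu: pull the task to this cpu */
	printf("%d\n", wake_affine(0, 384, 384, 128, 128, 125));   /* 1 */
	/* both cpus loaded: leave the task where it was */
	printf("%d\n", wake_affine(256, 256, 256, 128, 128, 125)); /* 0 */
	return 0;
}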
@@ -1348,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1348 | * event cannot wake it up and insert it on the runqueue either. | 1513 | * event cannot wake it up and insert it on the runqueue either. |
1349 | */ | 1514 | */ |
1350 | p->state = TASK_RUNNING; | 1515 | p->state = TASK_RUNNING; |
1516 | |||
1517 | /* | ||
1518 | * Make sure we do not leak PI boosting priority to the child: | ||
1519 | */ | ||
1520 | p->prio = current->normal_prio; | ||
1521 | |||
1351 | INIT_LIST_HEAD(&p->run_list); | 1522 | INIT_LIST_HEAD(&p->run_list); |
1352 | p->array = NULL; | 1523 | p->array = NULL; |
1353 | #ifdef CONFIG_SCHEDSTATS | 1524 | #ifdef CONFIG_SCHEDSTATS |

@@ -1427,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
1427 | __activate_task(p, rq); | 1598 | __activate_task(p, rq); |
1428 | else { | 1599 | else { |
1429 | p->prio = current->prio; | 1600 | p->prio = current->prio; |
1601 | p->normal_prio = current->normal_prio; | ||
1430 | list_add_tail(&p->run_list, ¤t->run_list); | 1602 | list_add_tail(&p->run_list, ¤t->run_list); |
1431 | p->array = current->array; | 1603 | p->array = current->array; |
1432 | p->array->nr_active++; | 1604 | p->array->nr_active++; |
1433 | rq->nr_running++; | 1605 | inc_nr_running(p, rq); |
1434 | } | 1606 | } |
1435 | set_need_resched(); | 1607 | set_need_resched(); |
1436 | } else | 1608 | } else |
@@ -1648,7 +1820,8 @@ unsigned long nr_uninterruptible(void) | |||
1648 | 1820 | ||
1649 | unsigned long long nr_context_switches(void) | 1821 | unsigned long long nr_context_switches(void) |
1650 | { | 1822 | { |
1651 | unsigned long long i, sum = 0; | 1823 | int i; |
1824 | unsigned long long sum = 0; | ||
1652 | 1825 | ||
1653 | for_each_possible_cpu(i) | 1826 | for_each_possible_cpu(i) |
1654 | sum += cpu_rq(i)->nr_switches; | 1827 | sum += cpu_rq(i)->nr_switches; |
@@ -1686,9 +1859,6 @@ unsigned long nr_active(void) | |||
1686 | /* | 1859 | /* |
1687 | * double_rq_lock - safely lock two runqueues | 1860 | * double_rq_lock - safely lock two runqueues |
1688 | * | 1861 | * |
1689 | * We must take them in cpu order to match code in | ||
1690 | * dependent_sleeper and wake_dependent_sleeper. | ||
1691 | * | ||
1692 | * Note this does not disable interrupts like task_rq_lock, | 1862 | * Note this does not disable interrupts like task_rq_lock, |
1693 | * you need to do so manually before calling. | 1863 | * you need to do so manually before calling. |
1694 | */ | 1864 | */ |
@@ -1700,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
1700 | spin_lock(&rq1->lock); | 1870 | spin_lock(&rq1->lock); |
1701 | __acquire(rq2->lock); /* Fake it out ;) */ | 1871 | __acquire(rq2->lock); /* Fake it out ;) */ |
1702 | } else { | 1872 | } else { |
1703 | if (rq1->cpu < rq2->cpu) { | 1873 | if (rq1 < rq2) { |
1704 | spin_lock(&rq1->lock); | 1874 | spin_lock(&rq1->lock); |
1705 | spin_lock(&rq2->lock); | 1875 | spin_lock(&rq2->lock); |
1706 | } else { | 1876 | } else { |
@@ -1736,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
1736 | __acquires(this_rq->lock) | 1906 | __acquires(this_rq->lock) |
1737 | { | 1907 | { |
1738 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1908 | if (unlikely(!spin_trylock(&busiest->lock))) { |
1739 | if (busiest->cpu < this_rq->cpu) { | 1909 | if (busiest < this_rq) { |
1740 | spin_unlock(&this_rq->lock); | 1910 | spin_unlock(&this_rq->lock); |
1741 | spin_lock(&busiest->lock); | 1911 | spin_lock(&busiest->lock); |
1742 | spin_lock(&this_rq->lock); | 1912 | spin_lock(&this_rq->lock); |
@@ -1799,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1799 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 1969 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) |
1800 | { | 1970 | { |
1801 | dequeue_task(p, src_array); | 1971 | dequeue_task(p, src_array); |
1802 | src_rq->nr_running--; | 1972 | dec_nr_running(p, src_rq); |
1803 | set_task_cpu(p, this_cpu); | 1973 | set_task_cpu(p, this_cpu); |
1804 | this_rq->nr_running++; | 1974 | inc_nr_running(p, this_rq); |
1805 | enqueue_task(p, this_array); | 1975 | enqueue_task(p, this_array); |
1806 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 1976 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1807 | + this_rq->timestamp_last_tick; | 1977 | + this_rq->timestamp_last_tick; |
@@ -1848,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
1848 | return 1; | 2018 | return 1; |
1849 | } | 2019 | } |
1850 | 2020 | ||
2021 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | ||
1851 | /* | 2022 | /* |
1852 | * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, | 2023 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted |
1853 | * as part of a balancing operation within "domain". Returns the number of | 2024 | * load from busiest to this_rq, as part of a balancing operation within |
1854 | * tasks moved. | 2025 | * "domain". Returns the number of tasks moved. |
1855 | * | 2026 | * |
1856 | * Called with both runqueues locked. | 2027 | * Called with both runqueues locked. |
1857 | */ | 2028 | */ |
1858 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 2029 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, |
1859 | unsigned long max_nr_move, struct sched_domain *sd, | 2030 | unsigned long max_nr_move, unsigned long max_load_move, |
1860 | enum idle_type idle, int *all_pinned) | 2031 | struct sched_domain *sd, enum idle_type idle, |
2032 | int *all_pinned) | ||
1861 | { | 2033 | { |
1862 | prio_array_t *array, *dst_array; | 2034 | prio_array_t *array, *dst_array; |
1863 | struct list_head *head, *curr; | 2035 | struct list_head *head, *curr; |
1864 | int idx, pulled = 0, pinned = 0; | 2036 | int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio; |
2037 | int busiest_best_prio_seen; | ||
2038 | int skip_for_load; /* skip the task based on weighted load issues */ | ||
2039 | long rem_load_move; | ||
1865 | task_t *tmp; | 2040 | task_t *tmp; |
1866 | 2041 | ||
1867 | if (max_nr_move == 0) | 2042 | if (max_nr_move == 0 || max_load_move == 0) |
1868 | goto out; | 2043 | goto out; |
1869 | 2044 | ||
2045 | rem_load_move = max_load_move; | ||
1870 | pinned = 1; | 2046 | pinned = 1; |
2047 | this_best_prio = rq_best_prio(this_rq); | ||
2048 | busiest_best_prio = rq_best_prio(busiest); | ||
2049 | /* | ||
2050 | * Enable handling of the case where there is more than one task | ||
2051 | * with the best priority. If the current running task is one | ||
2052 | * of those with prio==busiest_best_prio we know it won't be moved | ||
2053 | * and therefore it's safe to override the skip (based on load) of | ||
2054 | * any task we find with that prio. | ||
2055 | */ | ||
2056 | busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio; | ||
1871 | 2057 | ||
1872 | /* | 2058 | /* |
1873 | * We first consider expired tasks. Those will likely not be | 2059 | * We first consider expired tasks. Those will likely not be |
@@ -1907,7 +2093,17 @@ skip_queue: | |||
1907 | 2093 | ||
1908 | curr = curr->prev; | 2094 | curr = curr->prev; |
1909 | 2095 | ||
1910 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2096 | /* |
2097 | * To help distribute high priority tasks across CPUs we don't | ||
2098 | * skip a task if it will be the highest priority task (i.e. smallest | ||
2099 | * prio value) on its new queue regardless of its load weight | ||
2100 | */ | ||
2101 | skip_for_load = tmp->load_weight > rem_load_move; | ||
2102 | if (skip_for_load && idx < this_best_prio) | ||
2103 | skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio; | ||
2104 | if (skip_for_load || | ||
2105 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | ||
2106 | busiest_best_prio_seen |= idx == busiest_best_prio; | ||
1911 | if (curr != head) | 2107 | if (curr != head) |
1912 | goto skip_queue; | 2108 | goto skip_queue; |
1913 | idx++; | 2109 | idx++; |
@@ -1921,9 +2117,15 @@ skip_queue: | |||
1921 | 2117 | ||
1922 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2118 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
1923 | pulled++; | 2119 | pulled++; |
2120 | rem_load_move -= tmp->load_weight; | ||
1924 | 2121 | ||
1925 | /* We only want to steal up to the prescribed number of tasks. */ | 2122 | /* |
1926 | if (pulled < max_nr_move) { | 2123 | * We only want to steal up to the prescribed number of tasks |
2124 | * and the prescribed amount of weighted load. | ||
2125 | */ | ||
2126 | if (pulled < max_nr_move && rem_load_move > 0) { | ||
2127 | if (idx < this_best_prio) | ||
2128 | this_best_prio = idx; | ||
1927 | if (curr != head) | 2129 | if (curr != head) |
1928 | goto skip_queue; | 2130 | goto skip_queue; |
1929 | idx++; | 2131 | idx++; |
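move_tasks() now keeps two budgets: a task count and a remaining weighted-load allowance (rem_load_move), and it skips a task whose weight alone would exceed the allowance unless that task would still be the highest-priority task on the destination. A simplified sketch of that bookkeeping over a hypothetical queue (the skip rule is condensed; the busiest_best_prio_seen refinement is left out).

#include <stdio.h>

struct fake_task {
	const char *name;
	unsigned long load_weight;
	int prio;                       /* lower number = higher priority */
};

int main(void)
{
	struct fake_task queue[] = {
		{ "heavy",  256, 120 },
		{ "medium", 128, 118 },
		{ "light",   64, 115 },
	};
	long rem_load_move = 200;       /* max_load_move budget        */
	unsigned long max_nr_move = 2;  /* task-count budget           */
	unsigned long pulled = 0;
	int this_best_prio = 119;       /* best prio on the dest queue */

	for (int i = 0; i < 3 && pulled < max_nr_move && rem_load_move > 0; i++) {
		int skip_for_load = queue[i].load_weight > (unsigned long)rem_load_move;

		/* don't skip a task that would still be the best prio on
		 * its new queue -- mirrors the idx < this_best_prio escape */
		if (skip_for_load && queue[i].prio >= this_best_prio) {
			printf("skip %s (weight %lu > budget %ld)\n",
			       queue[i].name, queue[i].load_weight, rem_load_move);
			continue;
		}
		rem_load_move -= queue[i].load_weight;
		pulled++;
		printf("pull %s, budget left %ld\n", queue[i].name, rem_load_move);
	}
	printf("moved %lu tasks\n", pulled);
	return 0;
}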
@@ -1944,7 +2146,7 @@ out: | |||
1944 | 2146 | ||
1945 | /* | 2147 | /* |
1946 | * find_busiest_group finds and returns the busiest CPU group within the | 2148 | * find_busiest_group finds and returns the busiest CPU group within the |
1947 | * domain. It calculates and returns the number of tasks which should be | 2149 | * domain. It calculates and returns the amount of weighted load which should be |
1948 | * moved to restore balance via the imbalance parameter. | 2150 | * moved to restore balance via the imbalance parameter. |
1949 | */ | 2151 | */ |
1950 | static struct sched_group * | 2152 | static struct sched_group * |
@@ -1954,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1954 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2156 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
1955 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2157 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
1956 | unsigned long max_pull; | 2158 | unsigned long max_pull; |
2159 | unsigned long busiest_load_per_task, busiest_nr_running; | ||
2160 | unsigned long this_load_per_task, this_nr_running; | ||
1957 | int load_idx; | 2161 | int load_idx; |
2162 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2163 | int power_savings_balance = 1; | ||
2164 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | ||
2165 | unsigned long min_nr_running = ULONG_MAX; | ||
2166 | struct sched_group *group_min = NULL, *group_leader = NULL; | ||
2167 | #endif | ||
1958 | 2168 | ||
1959 | max_load = this_load = total_load = total_pwr = 0; | 2169 | max_load = this_load = total_load = total_pwr = 0; |
2170 | busiest_load_per_task = busiest_nr_running = 0; | ||
2171 | this_load_per_task = this_nr_running = 0; | ||
1960 | if (idle == NOT_IDLE) | 2172 | if (idle == NOT_IDLE) |
1961 | load_idx = sd->busy_idx; | 2173 | load_idx = sd->busy_idx; |
1962 | else if (idle == NEWLY_IDLE) | 2174 | else if (idle == NEWLY_IDLE) |
@@ -1965,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1965 | load_idx = sd->idle_idx; | 2177 | load_idx = sd->idle_idx; |
1966 | 2178 | ||
1967 | do { | 2179 | do { |
1968 | unsigned long load; | 2180 | unsigned long load, group_capacity; |
1969 | int local_group; | 2181 | int local_group; |
1970 | int i; | 2182 | int i; |
2183 | unsigned long sum_nr_running, sum_weighted_load; | ||
1971 | 2184 | ||
1972 | local_group = cpu_isset(this_cpu, group->cpumask); | 2185 | local_group = cpu_isset(this_cpu, group->cpumask); |
1973 | 2186 | ||
1974 | /* Tally up the load of all CPUs in the group */ | 2187 | /* Tally up the load of all CPUs in the group */ |
1975 | avg_load = 0; | 2188 | sum_weighted_load = sum_nr_running = avg_load = 0; |
1976 | 2189 | ||
1977 | for_each_cpu_mask(i, group->cpumask) { | 2190 | for_each_cpu_mask(i, group->cpumask) { |
2191 | runqueue_t *rq = cpu_rq(i); | ||
2192 | |||
1978 | if (*sd_idle && !idle_cpu(i)) | 2193 | if (*sd_idle && !idle_cpu(i)) |
1979 | *sd_idle = 0; | 2194 | *sd_idle = 0; |
1980 | 2195 | ||
@@ -1985,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1985 | load = source_load(i, load_idx); | 2200 | load = source_load(i, load_idx); |
1986 | 2201 | ||
1987 | avg_load += load; | 2202 | avg_load += load; |
2203 | sum_nr_running += rq->nr_running; | ||
2204 | sum_weighted_load += rq->raw_weighted_load; | ||
1988 | } | 2205 | } |
1989 | 2206 | ||
1990 | total_load += avg_load; | 2207 | total_load += avg_load; |
@@ -1993,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1993 | /* Adjust by relative CPU power of the group */ | 2210 | /* Adjust by relative CPU power of the group */ |
1994 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2211 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
1995 | 2212 | ||
2213 | group_capacity = group->cpu_power / SCHED_LOAD_SCALE; | ||
2214 | |||
1996 | if (local_group) { | 2215 | if (local_group) { |
1997 | this_load = avg_load; | 2216 | this_load = avg_load; |
1998 | this = group; | 2217 | this = group; |
1999 | } else if (avg_load > max_load) { | 2218 | this_nr_running = sum_nr_running; |
2219 | this_load_per_task = sum_weighted_load; | ||
2220 | } else if (avg_load > max_load && | ||
2221 | sum_nr_running > group_capacity) { | ||
2000 | max_load = avg_load; | 2222 | max_load = avg_load; |
2001 | busiest = group; | 2223 | busiest = group; |
2224 | busiest_nr_running = sum_nr_running; | ||
2225 | busiest_load_per_task = sum_weighted_load; | ||
2002 | } | 2226 | } |
2227 | |||
2228 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2229 | /* | ||
2230 | * Busy processors will not participate in power savings | ||
2231 | * balance. | ||
2232 | */ | ||
2233 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2234 | goto group_next; | ||
2235 | |||
2236 | /* | ||
2237 | * If the local group is idle or completely loaded | ||
2238 | * no need to do power savings balance at this domain | ||
2239 | */ | ||
2240 | if (local_group && (this_nr_running >= group_capacity || | ||
2241 | !this_nr_running)) | ||
2242 | power_savings_balance = 0; | ||
2243 | |||
2244 | /* | ||
2245 | * If a group is already running at full capacity or idle, | ||
2246 | * don't include that group in power savings calculations | ||
2247 | */ | ||
2248 | if (!power_savings_balance || sum_nr_running >= group_capacity | ||
2249 | || !sum_nr_running) | ||
2250 | goto group_next; | ||
2251 | |||
2252 | /* | ||
2253 | * Calculate the group which has the least non-idle load. | ||
2254 | * This is the group from which we need to pick up the load | ||
2255 | * for saving power | ||
2256 | */ | ||
2257 | if ((sum_nr_running < min_nr_running) || | ||
2258 | (sum_nr_running == min_nr_running && | ||
2259 | first_cpu(group->cpumask) < | ||
2260 | first_cpu(group_min->cpumask))) { | ||
2261 | group_min = group; | ||
2262 | min_nr_running = sum_nr_running; | ||
2263 | min_load_per_task = sum_weighted_load / | ||
2264 | sum_nr_running; | ||
2265 | } | ||
2266 | |||
2267 | /* | ||
2268 | * Calculate the group which is almost near its | ||
2269 | * capacity but still has some space to pick up some load | ||
2270 | * from other group and save more power | ||
2271 | */ | ||
2272 | if (sum_nr_running <= group_capacity - 1) | ||
2273 | if (sum_nr_running > leader_nr_running || | ||
2274 | (sum_nr_running == leader_nr_running && | ||
2275 | first_cpu(group->cpumask) > | ||
2276 | first_cpu(group_leader->cpumask))) { | ||
2277 | group_leader = group; | ||
2278 | leader_nr_running = sum_nr_running; | ||
2279 | } | ||
2280 | |||
2281 | group_next: | ||
2282 | #endif | ||
2003 | group = group->next; | 2283 | group = group->next; |
2004 | } while (group != sd->groups); | 2284 | } while (group != sd->groups); |
2005 | 2285 | ||
2006 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) | 2286 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) |
2007 | goto out_balanced; | 2287 | goto out_balanced; |
2008 | 2288 | ||
2009 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 2289 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
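Within the loop above, a group can only become the busiest candidate if it is both more loaded than the current maximum and running more tasks than its capacity (one task per SCHED_LOAD_SCALE of cpu_power). A short sketch with two invented groups shows why a heavily loaded but under-capacity group is passed over.

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

struct fake_group {
	const char *name;
	unsigned long cpu_power;       /* sum of member cpu powers     */
	unsigned long sum_load;        /* sum of members' source_load  */
	unsigned long sum_nr_running;
};

int main(void)
{
	struct fake_group groups[] = {
		{ "A", 2 * SCHED_LOAD_SCALE, 3 * SCHED_LOAD_SCALE, 3 },
		{ "B", 2 * SCHED_LOAD_SCALE, 4 * SCHED_LOAD_SCALE, 2 },
	};
	unsigned long max_load = 0;
	const char *busiest = NULL;

	for (int i = 0; i < 2; i++) {
		unsigned long avg_load = groups[i].sum_load * SCHED_LOAD_SCALE /
					 groups[i].cpu_power;
		unsigned long capacity = groups[i].cpu_power / SCHED_LOAD_SCALE;

		if (avg_load > max_load && groups[i].sum_nr_running > capacity) {
			max_load = avg_load;
			busiest = groups[i].name;
		}
	}
	/* B is more loaded but not over capacity, so A stays busiest */
	printf("busiest group: %s (max_load %lu)\n", busiest, max_load);
	return 0;
}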
@@ -2012,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2012 | 100*max_load <= sd->imbalance_pct*this_load) | 2292 | 100*max_load <= sd->imbalance_pct*this_load) |
2013 | goto out_balanced; | 2293 | goto out_balanced; |
2014 | 2294 | ||
2295 | busiest_load_per_task /= busiest_nr_running; | ||
2015 | /* | 2296 | /* |
2016 | * We're trying to get all the cpus to the average_load, so we don't | 2297 | * We're trying to get all the cpus to the average_load, so we don't |
2017 | * want to push ourselves above the average load, nor do we wish to | 2298 | * want to push ourselves above the average load, nor do we wish to |
@@ -2023,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2023 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2304 | * by pulling tasks to us. Be careful of negative numbers as they'll |
2024 | * appear as very large values with unsigned longs. | 2305 | * appear as very large values with unsigned longs. |
2025 | */ | 2306 | */ |
2307 | if (max_load <= busiest_load_per_task) | ||
2308 | goto out_balanced; | ||
2309 | |||
2310 | /* | ||
2311 | * In the presence of smp nice balancing, certain scenarios can have | ||
2312 | * max load less than avg load(as we skip the groups at or below | ||
2313 | * its cpu_power, while calculating max_load..) | ||
2314 | */ | ||
2315 | if (max_load < avg_load) { | ||
2316 | *imbalance = 0; | ||
2317 | goto small_imbalance; | ||
2318 | } | ||
2026 | 2319 | ||
2027 | /* Don't want to pull so many tasks that a group would go idle */ | 2320 | /* Don't want to pull so many tasks that a group would go idle */ |
2028 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); | 2321 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
2029 | 2322 | ||
2030 | /* How much load to actually move to equalise the imbalance */ | 2323 | /* How much load to actually move to equalise the imbalance */ |
2031 | *imbalance = min(max_pull * busiest->cpu_power, | 2324 | *imbalance = min(max_pull * busiest->cpu_power, |
2032 | (avg_load - this_load) * this->cpu_power) | 2325 | (avg_load - this_load) * this->cpu_power) |
2033 | / SCHED_LOAD_SCALE; | 2326 | / SCHED_LOAD_SCALE; |
2034 | 2327 | ||
2035 | if (*imbalance < SCHED_LOAD_SCALE) { | 2328 | /* |
2036 | unsigned long pwr_now = 0, pwr_move = 0; | 2329 | * if *imbalance is less than the average load per runnable task |
2330 | * there is no guarantee that any tasks will be moved so we'll have | ||
2331 | * a think about bumping its value to force at least one task to be | ||
2332 | * moved | ||
2333 | */ | ||
2334 | if (*imbalance < busiest_load_per_task) { | ||
2335 | unsigned long pwr_now, pwr_move; | ||
2037 | unsigned long tmp; | 2336 | unsigned long tmp; |
2337 | unsigned int imbn; | ||
2338 | |||
2339 | small_imbalance: | ||
2340 | pwr_move = pwr_now = 0; | ||
2341 | imbn = 2; | ||
2342 | if (this_nr_running) { | ||
2343 | this_load_per_task /= this_nr_running; | ||
2344 | if (busiest_load_per_task > this_load_per_task) | ||
2345 | imbn = 1; | ||
2346 | } else | ||
2347 | this_load_per_task = SCHED_LOAD_SCALE; | ||
2038 | 2348 | ||
2039 | if (max_load - this_load >= SCHED_LOAD_SCALE*2) { | 2349 | if (max_load - this_load >= busiest_load_per_task * imbn) { |
2040 | *imbalance = 1; | 2350 | *imbalance = busiest_load_per_task; |
2041 | return busiest; | 2351 | return busiest; |
2042 | } | 2352 | } |
2043 | 2353 | ||
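The imbalance itself is now expressed in weighted-load units: pull at most the busiest group's excess over the domain average, capped so the busiest group is not pulled below one task's worth of load, and scaled by the groups' cpu_power. A worked sketch with invented loads (one plain CPU per group, so cpu_power equals SCHED_LOAD_SCALE):

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

int main(void)
{
	unsigned long max_load  = 300;  /* busiest group, power-scaled   */
	unsigned long avg_load  = 200;  /* domain-wide average           */
	unsigned long this_load = 100;  /* local group, power-scaled     */
	unsigned long busiest_load_per_task = 150;
	unsigned long busiest_power = SCHED_LOAD_SCALE;
	unsigned long this_power    = SCHED_LOAD_SCALE;

	/* don't pull so much that the busiest group drops below one task */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (avg_load - this_load) * this_power)
				  / SCHED_LOAD_SCALE;

	printf("max_pull = %lu, imbalance = %lu\n", max_pull, imbalance);
	/* imbalance is in weighted-load units, so move_tasks() can compare
	 * it directly against each candidate's p->load_weight */
	return 0;
}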
@@ -2047,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2047 | * moving them. | 2357 | * moving them. |
2048 | */ | 2358 | */ |
2049 | 2359 | ||
2050 | pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); | 2360 | pwr_now += busiest->cpu_power * |
2051 | pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); | 2361 | min(busiest_load_per_task, max_load); |
2362 | pwr_now += this->cpu_power * | ||
2363 | min(this_load_per_task, this_load); | ||
2052 | pwr_now /= SCHED_LOAD_SCALE; | 2364 | pwr_now /= SCHED_LOAD_SCALE; |
2053 | 2365 | ||
2054 | /* Amount of load we'd subtract */ | 2366 | /* Amount of load we'd subtract */ |
2055 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; | 2367 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; |
2056 | if (max_load > tmp) | 2368 | if (max_load > tmp) |
2057 | pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, | 2369 | pwr_move += busiest->cpu_power * |
2058 | max_load - tmp); | 2370 | min(busiest_load_per_task, max_load - tmp); |
2059 | 2371 | ||
2060 | /* Amount of load we'd add */ | 2372 | /* Amount of load we'd add */ |
2061 | if (max_load*busiest->cpu_power < | 2373 | if (max_load*busiest->cpu_power < |
2062 | SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) | 2374 | busiest_load_per_task*SCHED_LOAD_SCALE) |
2063 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2375 | tmp = max_load*busiest->cpu_power/this->cpu_power; |
2064 | else | 2376 | else |
2065 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; | 2377 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; |
2066 | pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); | 2378 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); |
2067 | pwr_move /= SCHED_LOAD_SCALE; | 2379 | pwr_move /= SCHED_LOAD_SCALE; |
2068 | 2380 | ||
2069 | /* Move if we gain throughput */ | 2381 | /* Move if we gain throughput */ |
2070 | if (pwr_move <= pwr_now) | 2382 | if (pwr_move <= pwr_now) |
2071 | goto out_balanced; | 2383 | goto out_balanced; |
2072 | 2384 | ||
2073 | *imbalance = 1; | 2385 | *imbalance = busiest_load_per_task; |
2074 | return busiest; | ||
2075 | } | 2386 | } |
2076 | 2387 | ||
2077 | /* Get rid of the scaling factor, rounding down as we divide */ | ||
2078 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | ||
2079 | return busiest; | 2388 | return busiest; |
2080 | 2389 | ||
2081 | out_balanced: | 2390 | out_balanced: |
2391 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2392 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2393 | goto ret; | ||
2082 | 2394 | ||
2395 | if (this == group_leader && group_leader != group_min) { | ||
2396 | *imbalance = min_load_per_task; | ||
2397 | return group_min; | ||
2398 | } | ||
2399 | ret: | ||
2400 | #endif | ||
2083 | *imbalance = 0; | 2401 | *imbalance = 0; |
2084 | return NULL; | 2402 | return NULL; |
2085 | } | 2403 | } |
@@ -2088,18 +2406,21 @@ out_balanced: | |||
2088 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2406 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2089 | */ | 2407 | */ |
2090 | static runqueue_t *find_busiest_queue(struct sched_group *group, | 2408 | static runqueue_t *find_busiest_queue(struct sched_group *group, |
2091 | enum idle_type idle) | 2409 | enum idle_type idle, unsigned long imbalance) |
2092 | { | 2410 | { |
2093 | unsigned long load, max_load = 0; | 2411 | unsigned long max_load = 0; |
2094 | runqueue_t *busiest = NULL; | 2412 | runqueue_t *busiest = NULL, *rqi; |
2095 | int i; | 2413 | int i; |
2096 | 2414 | ||
2097 | for_each_cpu_mask(i, group->cpumask) { | 2415 | for_each_cpu_mask(i, group->cpumask) { |
2098 | load = source_load(i, 0); | 2416 | rqi = cpu_rq(i); |
2099 | 2417 | ||
2100 | if (load > max_load) { | 2418 | if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance) |
2101 | max_load = load; | 2419 | continue; |
2102 | busiest = cpu_rq(i); | 2420 | |
2421 | if (rqi->raw_weighted_load > max_load) { | ||
2422 | max_load = rqi->raw_weighted_load; | ||
2423 | busiest = rqi; | ||
2103 | } | 2424 | } |
2104 | } | 2425 | } |
2105 | 2426 | ||
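find_busiest_queue() now picks by raw_weighted_load and, given the imbalance, skips any CPU whose single runnable task is heavier than the requested transfer, since moving that one task could not help. A small sketch of that filter with invented runqueues:

#include <stdio.h>

struct fake_rq {
	const char *name;
	unsigned long nr_running;
	unsigned long raw_weighted_load;
};

int main(void)
{
	struct fake_rq rqs[] = {
		{ "cpu0", 1, 512 },   /* one very heavy task */
		{ "cpu1", 3, 384 },
	};
	unsigned long imbalance = 256;
	unsigned long max_load = 0;
	const char *busiest = NULL;

	for (int i = 0; i < 2; i++) {
		if (rqs[i].nr_running == 1 &&
		    rqs[i].raw_weighted_load > imbalance)
			continue;
		if (rqs[i].raw_weighted_load > max_load) {
			max_load = rqs[i].raw_weighted_load;
			busiest = rqs[i].name;
		}
	}
	printf("busiest queue: %s\n", busiest);   /* cpu1 */
	return 0;
}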
@@ -2112,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, | |||
2112 | */ | 2433 | */ |
2113 | #define MAX_PINNED_INTERVAL 512 | 2434 | #define MAX_PINNED_INTERVAL 512 |
2114 | 2435 | ||
2436 | #define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0) | ||
2115 | /* | 2437 | /* |
2116 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2438 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2117 | * tasks if there is an imbalance. | 2439 | * tasks if there is an imbalance. |
@@ -2128,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2128 | int active_balance = 0; | 2450 | int active_balance = 0; |
2129 | int sd_idle = 0; | 2451 | int sd_idle = 0; |
2130 | 2452 | ||
2131 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) | 2453 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
2454 | !sched_smt_power_savings) | ||
2132 | sd_idle = 1; | 2455 | sd_idle = 1; |
2133 | 2456 | ||
2134 | schedstat_inc(sd, lb_cnt[idle]); | 2457 | schedstat_inc(sd, lb_cnt[idle]); |
@@ -2139,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2139 | goto out_balanced; | 2462 | goto out_balanced; |
2140 | } | 2463 | } |
2141 | 2464 | ||
2142 | busiest = find_busiest_queue(group, idle); | 2465 | busiest = find_busiest_queue(group, idle, imbalance); |
2143 | if (!busiest) { | 2466 | if (!busiest) { |
2144 | schedstat_inc(sd, lb_nobusyq[idle]); | 2467 | schedstat_inc(sd, lb_nobusyq[idle]); |
2145 | goto out_balanced; | 2468 | goto out_balanced; |
@@ -2159,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2159 | */ | 2482 | */ |
2160 | double_rq_lock(this_rq, busiest); | 2483 | double_rq_lock(this_rq, busiest); |
2161 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2484 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2485 | minus_1_or_zero(busiest->nr_running), | ||
2162 | imbalance, sd, idle, &all_pinned); | 2486 | imbalance, sd, idle, &all_pinned); |
2163 | double_rq_unlock(this_rq, busiest); | 2487 | double_rq_unlock(this_rq, busiest); |
2164 | 2488 | ||
@@ -2216,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2216 | sd->balance_interval *= 2; | 2540 | sd->balance_interval *= 2; |
2217 | } | 2541 | } |
2218 | 2542 | ||
2219 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2543 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2544 | !sched_smt_power_savings) | ||
2220 | return -1; | 2545 | return -1; |
2221 | return nr_moved; | 2546 | return nr_moved; |
2222 | 2547 | ||
@@ -2231,7 +2556,7 @@ out_one_pinned: | |||
2231 | (sd->balance_interval < sd->max_interval)) | 2556 | (sd->balance_interval < sd->max_interval)) |
2232 | sd->balance_interval *= 2; | 2557 | sd->balance_interval *= 2; |
2233 | 2558 | ||
2234 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2559 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2235 | return -1; | 2560 | return -1; |
2236 | return 0; | 2561 | return 0; |
2237 | } | 2562 | } |
@@ -2252,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2252 | int nr_moved = 0; | 2577 | int nr_moved = 0; |
2253 | int sd_idle = 0; | 2578 | int sd_idle = 0; |
2254 | 2579 | ||
2255 | if (sd->flags & SD_SHARE_CPUPOWER) | 2580 | if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2256 | sd_idle = 1; | 2581 | sd_idle = 1; |
2257 | 2582 | ||
2258 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2583 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
@@ -2262,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2262 | goto out_balanced; | 2587 | goto out_balanced; |
2263 | } | 2588 | } |
2264 | 2589 | ||
2265 | busiest = find_busiest_queue(group, NEWLY_IDLE); | 2590 | busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); |
2266 | if (!busiest) { | 2591 | if (!busiest) { |
2267 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2592 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2268 | goto out_balanced; | 2593 | goto out_balanced; |
@@ -2277,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2277 | /* Attempt to move tasks */ | 2602 | /* Attempt to move tasks */ |
2278 | double_lock_balance(this_rq, busiest); | 2603 | double_lock_balance(this_rq, busiest); |
2279 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2604 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2605 | minus_1_or_zero(busiest->nr_running), | ||
2280 | imbalance, sd, NEWLY_IDLE, NULL); | 2606 | imbalance, sd, NEWLY_IDLE, NULL); |
2281 | spin_unlock(&busiest->lock); | 2607 | spin_unlock(&busiest->lock); |
2282 | } | 2608 | } |
@@ -2292,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2292 | 2618 | ||
2293 | out_balanced: | 2619 | out_balanced: |
2294 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2620 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
2295 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2621 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2296 | return -1; | 2622 | return -1; |
2297 | sd->nr_balance_failed = 0; | 2623 | sd->nr_balance_failed = 0; |
2298 | return 0; | 2624 | return 0; |
@@ -2347,17 +2673,19 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | |||
2347 | double_lock_balance(busiest_rq, target_rq); | 2673 | double_lock_balance(busiest_rq, target_rq); |
2348 | 2674 | ||
2349 | /* Search for an sd spanning us and the target CPU. */ | 2675 | /* Search for an sd spanning us and the target CPU. */ |
2350 | for_each_domain(target_cpu, sd) | 2676 | for_each_domain(target_cpu, sd) { |
2351 | if ((sd->flags & SD_LOAD_BALANCE) && | 2677 | if ((sd->flags & SD_LOAD_BALANCE) && |
2352 | cpu_isset(busiest_cpu, sd->span)) | 2678 | cpu_isset(busiest_cpu, sd->span)) |
2353 | break; | 2679 | break; |
2680 | } | ||
2354 | 2681 | ||
2355 | if (unlikely(sd == NULL)) | 2682 | if (unlikely(sd == NULL)) |
2356 | goto out; | 2683 | goto out; |
2357 | 2684 | ||
2358 | schedstat_inc(sd, alb_cnt); | 2685 | schedstat_inc(sd, alb_cnt); |
2359 | 2686 | ||
2360 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | 2687 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, |
2688 | RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL)) | ||
2361 | schedstat_inc(sd, alb_pushed); | 2689 | schedstat_inc(sd, alb_pushed); |
2362 | else | 2690 | else |
2363 | schedstat_inc(sd, alb_failed); | 2691 | schedstat_inc(sd, alb_failed); |
@@ -2385,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
2385 | struct sched_domain *sd; | 2713 | struct sched_domain *sd; |
2386 | int i; | 2714 | int i; |
2387 | 2715 | ||
2388 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | 2716 | this_load = this_rq->raw_weighted_load; |
2389 | /* Update our load */ | 2717 | /* Update our load */ |
2390 | for (i = 0; i < 3; i++) { | 2718 | for (i = 0; i < 3; i++) { |
2391 | unsigned long new_load = this_load; | 2719 | unsigned long new_load = this_load; |
@@ -2686,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq) | |||
2686 | resched_task(rq->idle); | 3014 | resched_task(rq->idle); |
2687 | } | 3015 | } |
2688 | 3016 | ||
2689 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3017 | /* |
3018 | * Called with interrupts disabled and this_rq's runqueue locked. | ||
3019 | */ | ||
3020 | static void wake_sleeping_dependent(int this_cpu) | ||
2690 | { | 3021 | { |
2691 | struct sched_domain *tmp, *sd = NULL; | 3022 | struct sched_domain *tmp, *sd = NULL; |
2692 | cpumask_t sibling_map; | ||
2693 | int i; | 3023 | int i; |
2694 | 3024 | ||
2695 | for_each_domain(this_cpu, tmp) | 3025 | for_each_domain(this_cpu, tmp) { |
2696 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3026 | if (tmp->flags & SD_SHARE_CPUPOWER) { |
2697 | sd = tmp; | 3027 | sd = tmp; |
3028 | break; | ||
3029 | } | ||
3030 | } | ||
2698 | 3031 | ||
2699 | if (!sd) | 3032 | if (!sd) |
2700 | return; | 3033 | return; |
2701 | 3034 | ||
2702 | /* | 3035 | for_each_cpu_mask(i, sd->span) { |
2703 | * Unlock the current runqueue because we have to lock in | ||
2704 | * CPU order to avoid deadlocks. Caller knows that we might | ||
2705 | * unlock. We keep IRQs disabled. | ||
2706 | */ | ||
2707 | spin_unlock(&this_rq->lock); | ||
2708 | |||
2709 | sibling_map = sd->span; | ||
2710 | |||
2711 | for_each_cpu_mask(i, sibling_map) | ||
2712 | spin_lock(&cpu_rq(i)->lock); | ||
2713 | /* | ||
2714 | * We clear this CPU from the mask. This both simplifies the | ||
2715 | * inner loop and keeps this_rq locked when we exit: | ||
2716 | */ | ||
2717 | cpu_clear(this_cpu, sibling_map); | ||
2718 | |||
2719 | for_each_cpu_mask(i, sibling_map) { | ||
2720 | runqueue_t *smt_rq = cpu_rq(i); | 3036 | runqueue_t *smt_rq = cpu_rq(i); |
2721 | 3037 | ||
3038 | if (i == this_cpu) | ||
3039 | continue; | ||
3040 | if (unlikely(!spin_trylock(&smt_rq->lock))) | ||
3041 | continue; | ||
3042 | |||
2722 | wakeup_busy_runqueue(smt_rq); | 3043 | wakeup_busy_runqueue(smt_rq); |
3044 | spin_unlock(&smt_rq->lock); | ||
2723 | } | 3045 | } |
2724 | |||
2725 | for_each_cpu_mask(i, sibling_map) | ||
2726 | spin_unlock(&cpu_rq(i)->lock); | ||
2727 | /* | ||
2728 | * We exit with this_cpu's rq still held and IRQs | ||
2729 | * still disabled: | ||
2730 | */ | ||
2731 | } | 3046 | } |
2732 | 3047 | ||
2733 | /* | 3048 | /* |
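The rewritten wake_sleeping_dependent() above no longer drops this_rq's lock and takes every sibling lock in CPU order; it simply trylocks each sibling and bypasses it on contention, which is why double_rq_lock()'s old CPU-ordering comment could go. A userspace sketch of that trylock-and-skip pattern; the pthread spinlocks and the printed "poke" are stand-ins for the kernel's runqueue locks and wakeup_busy_runqueue(), not kernel API.

#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 4

static pthread_spinlock_t sibling_lock[NR_SIBLINGS];

static void visit_siblings(int this_cpu)
{
	for (int i = 0; i < NR_SIBLINGS; i++) {
		if (i == this_cpu)
			continue;
		if (pthread_spin_trylock(&sibling_lock[i]) != 0) {
			/* contended: bypass, no strict lock ordering needed */
			printf("cpu%d: sibling %d busy, skipping\n", this_cpu, i);
			continue;
		}
		printf("cpu%d: poking sibling %d\n", this_cpu, i);
		pthread_spin_unlock(&sibling_lock[i]);
	}
}

int main(void)
{
	for (int i = 0; i < NR_SIBLINGS; i++)
		pthread_spin_init(&sibling_lock[i], PTHREAD_PROCESS_PRIVATE);

	pthread_spin_lock(&sibling_lock[2]);  /* pretend sibling 2 is contended */
	visit_siblings(0);
	pthread_spin_unlock(&sibling_lock[2]);
	return 0;
}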
@@ -2740,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | |||
2740 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 3055 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
2741 | } | 3056 | } |
2742 | 3057 | ||
2743 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3058 | /* |
3059 | * To minimise lock contention and not have to drop this_rq's runlock we only | ||
3060 | * trylock the sibling runqueues and bypass those runqueues if we fail to | ||
3061 | * acquire their lock. As we only trylock the normal locking order does not | ||
3062 | * need to be obeyed. | ||
3063 | */ | ||
3064 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) | ||
2744 | { | 3065 | { |
2745 | struct sched_domain *tmp, *sd = NULL; | 3066 | struct sched_domain *tmp, *sd = NULL; |
2746 | cpumask_t sibling_map; | ||
2747 | prio_array_t *array; | ||
2748 | int ret = 0, i; | 3067 | int ret = 0, i; |
2749 | task_t *p; | ||
2750 | 3068 | ||
2751 | for_each_domain(this_cpu, tmp) | 3069 | /* kernel/rt threads do not participate in dependent sleeping */ |
2752 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3070 | if (!p->mm || rt_task(p)) |
3071 | return 0; | ||
3072 | |||
3073 | for_each_domain(this_cpu, tmp) { | ||
3074 | if (tmp->flags & SD_SHARE_CPUPOWER) { | ||
2753 | sd = tmp; | 3075 | sd = tmp; |
3076 | break; | ||
3077 | } | ||
3078 | } | ||
2754 | 3079 | ||
2755 | if (!sd) | 3080 | if (!sd) |
2756 | return 0; | 3081 | return 0; |
2757 | 3082 | ||
2758 | /* | 3083 | for_each_cpu_mask(i, sd->span) { |
2759 | * The same locking rules and details apply as for | 3084 | runqueue_t *smt_rq; |
2760 | * wake_sleeping_dependent(): | 3085 | task_t *smt_curr; |
2761 | */ | ||
2762 | spin_unlock(&this_rq->lock); | ||
2763 | sibling_map = sd->span; | ||
2764 | for_each_cpu_mask(i, sibling_map) | ||
2765 | spin_lock(&cpu_rq(i)->lock); | ||
2766 | cpu_clear(this_cpu, sibling_map); | ||
2767 | 3086 | ||
2768 | /* | 3087 | if (i == this_cpu) |
2769 | * Establish next task to be run - it might have gone away because | 3088 | continue; |
2770 | * we released the runqueue lock above: | ||
2771 | */ | ||
2772 | if (!this_rq->nr_running) | ||
2773 | goto out_unlock; | ||
2774 | array = this_rq->active; | ||
2775 | if (!array->nr_active) | ||
2776 | array = this_rq->expired; | ||
2777 | BUG_ON(!array->nr_active); | ||
2778 | 3089 | ||
2779 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | 3090 | smt_rq = cpu_rq(i); |
2780 | task_t, run_list); | 3091 | if (unlikely(!spin_trylock(&smt_rq->lock))) |
3092 | continue; | ||
2781 | 3093 | ||
2782 | for_each_cpu_mask(i, sibling_map) { | 3094 | smt_curr = smt_rq->curr; |
2783 | runqueue_t *smt_rq = cpu_rq(i); | ||
2784 | task_t *smt_curr = smt_rq->curr; | ||
2785 | 3095 | ||
2786 | /* Kernel threads do not participate in dependent sleeping */ | 3096 | if (!smt_curr->mm) |
2787 | if (!p->mm || !smt_curr->mm || rt_task(p)) | 3097 | goto unlock; |
2788 | goto check_smt_task; | ||
2789 | 3098 | ||
2790 | /* | 3099 | /* |
2791 | * If a user task with lower static priority than the | 3100 | * If a user task with lower static priority than the |
@@ -2803,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
2803 | if ((jiffies % DEF_TIMESLICE) > | 3112 | if ((jiffies % DEF_TIMESLICE) > |
2804 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 3113 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
2805 | ret = 1; | 3114 | ret = 1; |
2806 | } else | 3115 | } else { |
2807 | if (smt_curr->static_prio < p->static_prio && | 3116 | if (smt_curr->static_prio < p->static_prio && |
2808 | !TASK_PREEMPTS_CURR(p, smt_rq) && | 3117 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
2809 | smt_slice(smt_curr, sd) > task_timeslice(p)) | 3118 | smt_slice(smt_curr, sd) > task_timeslice(p)) |
2810 | ret = 1; | 3119 | ret = 1; |
2811 | |||
2812 | check_smt_task: | ||
2813 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
2814 | rt_task(smt_curr)) | ||
2815 | continue; | ||
2816 | if (!p->mm) { | ||
2817 | wakeup_busy_runqueue(smt_rq); | ||
2818 | continue; | ||
2819 | } | ||
2820 | |||
2821 | /* | ||
2822 | * Reschedule a lower priority task on the SMT sibling for | ||
2823 | * it to be put to sleep, or wake it up if it has been put to | ||
2824 | * sleep for priority reasons to see if it should run now. | ||
2825 | */ | ||
2826 | if (rt_task(p)) { | ||
2827 | if ((jiffies % DEF_TIMESLICE) > | ||
2828 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
2829 | resched_task(smt_curr); | ||
2830 | } else { | ||
2831 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
2832 | smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
2833 | resched_task(smt_curr); | ||
2834 | else | ||
2835 | wakeup_busy_runqueue(smt_rq); | ||
2836 | } | 3120 | } |
3121 | unlock: | ||
3122 | spin_unlock(&smt_rq->lock); | ||
2837 | } | 3123 | } |
2838 | out_unlock: | ||
2839 | for_each_cpu_mask(i, sibling_map) | ||
2840 | spin_unlock(&cpu_rq(i)->lock); | ||
2841 | return ret; | 3124 | return ret; |
2842 | } | 3125 | } |
2843 | #else | 3126 | #else |
2844 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3127 | static inline void wake_sleeping_dependent(int this_cpu) |
2845 | { | 3128 | { |
2846 | } | 3129 | } |
2847 | 3130 | ||
2848 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3131 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, |
3132 | task_t *p) | ||
2849 | { | 3133 | { |
2850 | return 0; | 3134 | return 0; |
2851 | } | 3135 | } |
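dependent_sleeper() uses the same trylock approach and keeps the existing smt_slice() arithmetic: a sibling's task is charged only the share of its timeslice that SMT contention does not already cost it (100 - per_cpu_gain percent), and the candidate is held back while a higher-static-priority sibling task still has more of that scaled slice left than the candidate's whole timeslice. A sketch of the arithmetic; per_cpu_gain and the timeslices are example values, not the real sched_domain parameters.

#include <stdio.h>

static unsigned int smt_slice(unsigned int time_slice, unsigned int per_cpu_gain)
{
	return time_slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
	unsigned int per_cpu_gain = 25;     /* assumed sd->per_cpu_gain     */
	unsigned int sibling_slice = 100;   /* higher-priority sibling task */
	unsigned int candidate_slice = 60;  /* task we want to run here     */

	unsigned int scaled = smt_slice(sibling_slice, per_cpu_gain);

	printf("sibling's smt_slice = %u\n", scaled);          /* 75 */
	printf("hold candidate back: %s\n",
	       scaled > candidate_slice ? "yes" : "no");       /* yes */
	return 0;
}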
@@ -2967,32 +3251,13 @@ need_resched_nonpreemptible: | |||
2967 | 3251 | ||
2968 | cpu = smp_processor_id(); | 3252 | cpu = smp_processor_id(); |
2969 | if (unlikely(!rq->nr_running)) { | 3253 | if (unlikely(!rq->nr_running)) { |
2970 | go_idle: | ||
2971 | idle_balance(cpu, rq); | 3254 | idle_balance(cpu, rq); |
2972 | if (!rq->nr_running) { | 3255 | if (!rq->nr_running) { |
2973 | next = rq->idle; | 3256 | next = rq->idle; |
2974 | rq->expired_timestamp = 0; | 3257 | rq->expired_timestamp = 0; |
2975 | wake_sleeping_dependent(cpu, rq); | 3258 | wake_sleeping_dependent(cpu); |
2976 | /* | ||
2977 | * wake_sleeping_dependent() might have released | ||
2978 | * the runqueue, so break out if we got new | ||
2979 | * tasks meanwhile: | ||
2980 | */ | ||
2981 | if (!rq->nr_running) | ||
2982 | goto switch_tasks; | ||
2983 | } | ||
2984 | } else { | ||
2985 | if (dependent_sleeper(cpu, rq)) { | ||
2986 | next = rq->idle; | ||
2987 | goto switch_tasks; | 3259 | goto switch_tasks; |
2988 | } | 3260 | } |
2989 | /* | ||
2990 | * dependent_sleeper() releases and reacquires the runqueue | ||
2991 | * lock, hence go into the idle loop if the rq went | ||
2992 | * empty meanwhile: | ||
2993 | */ | ||
2994 | if (unlikely(!rq->nr_running)) | ||
2995 | goto go_idle; | ||
2996 | } | 3261 | } |
2997 | 3262 | ||
2998 | array = rq->active; | 3263 | array = rq->active; |
@@ -3030,6 +3295,8 @@ go_idle: | |||
3030 | } | 3295 | } |
3031 | } | 3296 | } |
3032 | next->sleep_type = SLEEP_NORMAL; | 3297 | next->sleep_type = SLEEP_NORMAL; |
3298 | if (dependent_sleeper(cpu, rq, next)) | ||
3299 | next = rq->idle; | ||
3033 | switch_tasks: | 3300 | switch_tasks: |
3034 | if (next == rq->idle) | 3301 | if (next == rq->idle) |
3035 | schedstat_inc(rq, sched_goidle); | 3302 | schedstat_inc(rq, sched_goidle); |
@@ -3473,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
3473 | 3740 | ||
3474 | EXPORT_SYMBOL(sleep_on_timeout); | 3741 | EXPORT_SYMBOL(sleep_on_timeout); |
3475 | 3742 | ||
3743 | #ifdef CONFIG_RT_MUTEXES | ||
3744 | |||
3745 | /* | ||
3746 | * rt_mutex_setprio - set the current priority of a task | ||
3747 | * @p: task | ||
3748 | * @prio: prio value (kernel-internal form) | ||
3749 | * | ||
3750 | * This function changes the 'effective' priority of a task. It does | ||
3751 | * not touch ->normal_prio like __setscheduler(). | ||
3752 | * | ||
3753 | * Used by the rt_mutex code to implement priority inheritance logic. | ||
3754 | */ | ||
3755 | void rt_mutex_setprio(task_t *p, int prio) | ||
3756 | { | ||
3757 | unsigned long flags; | ||
3758 | prio_array_t *array; | ||
3759 | runqueue_t *rq; | ||
3760 | int oldprio; | ||
3761 | |||
3762 | BUG_ON(prio < 0 || prio > MAX_PRIO); | ||
3763 | |||
3764 | rq = task_rq_lock(p, &flags); | ||
3765 | |||
3766 | oldprio = p->prio; | ||
3767 | array = p->array; | ||
3768 | if (array) | ||
3769 | dequeue_task(p, array); | ||
3770 | p->prio = prio; | ||
3771 | |||
3772 | if (array) { | ||
3773 | /* | ||
3774 | * If changing to an RT priority then queue it | ||
3775 | * in the active array! | ||
3776 | */ | ||
3777 | if (rt_task(p)) | ||
3778 | array = rq->active; | ||
3779 | enqueue_task(p, array); | ||
3780 | /* | ||
3781 | * Reschedule if we are currently running on this runqueue and | ||
3782 | * our priority decreased, or if we are not currently running on | ||
3783 | * this runqueue and our priority is higher than the current's | ||
3784 | */ | ||
3785 | if (task_running(rq, p)) { | ||
3786 | if (p->prio > oldprio) | ||
3787 | resched_task(rq->curr); | ||
3788 | } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
3789 | resched_task(rq->curr); | ||
3790 | } | ||
3791 | task_rq_unlock(rq, &flags); | ||
3792 | } | ||
3793 | |||
3794 | #endif | ||
3795 | |||
3476 | void set_user_nice(task_t *p, long nice) | 3796 | void set_user_nice(task_t *p, long nice) |
3477 | { | 3797 | { |
3478 | unsigned long flags; | 3798 | unsigned long flags; |
3479 | prio_array_t *array; | 3799 | prio_array_t *array; |
3480 | runqueue_t *rq; | 3800 | runqueue_t *rq; |
3481 | int old_prio, new_prio, delta; | 3801 | int old_prio, delta; |
3482 | 3802 | ||
3483 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3803 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
3484 | return; | 3804 | return; |
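rt_mutex_setprio() above requeues the task at its inherited priority and then decides whether a reschedule is needed: a running task that was deboosted (numerically larger prio) should yield, while a queued task that was boosted should preempt if it now beats the current task. A small sketch of that decision; the booleans stand in for task_running() and TASK_PREEMPTS_CURR(), and the priority values are invented.

#include <stdio.h>
#include <stdbool.h>

static bool needs_resched(bool task_running, int oldprio, int newprio,
			  int curr_prio)
{
	if (task_running)
		/* deboosted while on the cpu: let someone else in */
		return newprio > oldprio;
	/* boosted while queued: preempt if we now beat the current task */
	return newprio < curr_prio;
}

int main(void)
{
	/* PI boost of a queued task from 120 to 90, current runs at 100 */
	printf("%d\n", needs_resched(false, 120, 90, 100));  /* 1: preempt */
	/* deboost of the running lock holder from 90 back to 120 */
	printf("%d\n", needs_resched(true, 90, 120, 100));   /* 1: resched */
	/* boost of a queued task that still doesn't beat current */
	printf("%d\n", needs_resched(false, 120, 110, 100)); /* 0 */
	return 0;
}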
@@ -3493,22 +3813,25 @@ void set_user_nice(task_t *p, long nice) | |||
3493 | * it won't have any effect on scheduling until the task is | 3813 | * it won't have any effect on scheduling until the task is |
3494 | * not SCHED_NORMAL/SCHED_BATCH: | 3814 | * not SCHED_NORMAL/SCHED_BATCH: |
3495 | */ | 3815 | */ |
3496 | if (rt_task(p)) { | 3816 | if (has_rt_policy(p)) { |
3497 | p->static_prio = NICE_TO_PRIO(nice); | 3817 | p->static_prio = NICE_TO_PRIO(nice); |
3498 | goto out_unlock; | 3818 | goto out_unlock; |
3499 | } | 3819 | } |
3500 | array = p->array; | 3820 | array = p->array; |
3501 | if (array) | 3821 | if (array) { |
3502 | dequeue_task(p, array); | 3822 | dequeue_task(p, array); |
3823 | dec_raw_weighted_load(rq, p); | ||
3824 | } | ||
3503 | 3825 | ||
3504 | old_prio = p->prio; | ||
3505 | new_prio = NICE_TO_PRIO(nice); | ||
3506 | delta = new_prio - old_prio; | ||
3507 | p->static_prio = NICE_TO_PRIO(nice); | 3826 | p->static_prio = NICE_TO_PRIO(nice); |
3508 | p->prio += delta; | 3827 | set_load_weight(p); |
3828 | old_prio = p->prio; | ||
3829 | p->prio = effective_prio(p); | ||
3830 | delta = p->prio - old_prio; | ||
3509 | 3831 | ||
3510 | if (array) { | 3832 | if (array) { |
3511 | enqueue_task(p, array); | 3833 | enqueue_task(p, array); |
3834 | inc_raw_weighted_load(rq, p); | ||
3512 | /* | 3835 | /* |
3513 | * If the task increased its priority or is running and | 3836 | * If the task increased its priority or is running and |
3514 | * lowered its priority, then reschedule its CPU: | 3837 | * lowered its priority, then reschedule its CPU: |
@@ -3519,7 +3842,6 @@ void set_user_nice(task_t *p, long nice) | |||
3519 | out_unlock: | 3842 | out_unlock: |
3520 | task_rq_unlock(rq, &flags); | 3843 | task_rq_unlock(rq, &flags); |
3521 | } | 3844 | } |
3522 | |||
3523 | EXPORT_SYMBOL(set_user_nice); | 3845 | EXPORT_SYMBOL(set_user_nice); |
3524 | 3846 | ||
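set_user_nice() now keeps the runqueue's weighted load in step with the nice value: the task's old contribution is removed with dec_raw_weighted_load() before static_prio changes, set_load_weight() recomputes the weight, the dynamic priority is rederived through effective_prio() instead of applying a raw delta, and the new contribution is added back on requeue. From userspace this path is reached through the ordinary nice/setpriority interface; a minimal caller using only standard libc looks like this:

/* Userspace view of the nice value that set_user_nice() maintains.
 * Standard libc calls only; nothing here is specific to this patch. */
#include <sys/time.h>
#include <sys/resource.h>
#include <stdio.h>
#include <errno.h>

int main(void)
{
	int old;

	errno = 0;
	old = getpriority(PRIO_PROCESS, 0);	/* 0 means "this process" */
	if (old == -1 && errno) {
		perror("getpriority");
		return 1;
	}
	if (setpriority(PRIO_PROCESS, 0, 10) == -1) {	/* be nicer: nice 10 */
		perror("setpriority");
		return 1;
	}
	printf("nice changed from %d to %d\n", old,
	       getpriority(PRIO_PROCESS, 0));
	return 0;
}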
3525 | /* | 3847 | /* |
@@ -3634,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
3634 | BUG_ON(p->array); | 3956 | BUG_ON(p->array); |
3635 | p->policy = policy; | 3957 | p->policy = policy; |
3636 | p->rt_priority = prio; | 3958 | p->rt_priority = prio; |
3637 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { | 3959 | p->normal_prio = normal_prio(p); |
3638 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; | 3960 | /* we are holding p->pi_lock already */ |
3639 | } else { | 3961 | p->prio = rt_mutex_getprio(p); |
3640 | p->prio = p->static_prio; | 3962 | /* |
3641 | /* | 3963 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
3642 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | 3964 | */ |
3643 | */ | 3965 | if (policy == SCHED_BATCH) |
3644 | if (policy == SCHED_BATCH) | 3966 | p->sleep_avg = 0; |
3645 | p->sleep_avg = 0; | 3967 | set_load_weight(p); |
3646 | } | ||
3647 | } | 3968 | } |
3648 | 3969 | ||
3649 | /** | 3970 | /** |
@@ -3662,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy, | |||
3662 | unsigned long flags; | 3983 | unsigned long flags; |
3663 | runqueue_t *rq; | 3984 | runqueue_t *rq; |
3664 | 3985 | ||
3986 | /* may grab non-irq protected spin_locks */ | ||
3987 | BUG_ON(in_interrupt()); | ||
3665 | recheck: | 3988 | recheck: |
3666 | /* double check policy once rq lock held */ | 3989 | /* double check policy once rq lock held */ |
3667 | if (policy < 0) | 3990 | if (policy < 0) |
@@ -3710,14 +4033,20 @@ recheck: | |||
3710 | if (retval) | 4033 | if (retval) |
3711 | return retval; | 4034 | return retval; |
3712 | /* | 4035 | /* |
4036 | * make sure no PI-waiters arrive (or leave) while we are | ||
4037 | * changing the priority of the task: | ||
4038 | */ | ||
4039 | spin_lock_irqsave(&p->pi_lock, flags); | ||
4040 | /* | ||
3713 | * To be able to change p->policy safely, the appropriate | 4041 | * To be able to change p->policy safely, the appropriate |
3714 | * runqueue lock must be held. | 4042 | * runqueue lock must be held. |
3715 | */ | 4043 | */ |
3716 | rq = task_rq_lock(p, &flags); | 4044 | rq = __task_rq_lock(p); |
3717 | /* recheck policy now with rq lock held */ | 4045 | /* recheck policy now with rq lock held */ |
3718 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4046 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
3719 | policy = oldpolicy = -1; | 4047 | policy = oldpolicy = -1; |
3720 | task_rq_unlock(rq, &flags); | 4048 | __task_rq_unlock(rq); |
4049 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
3721 | goto recheck; | 4050 | goto recheck; |
3722 | } | 4051 | } |
3723 | array = p->array; | 4052 | array = p->array; |
@@ -3738,7 +4067,11 @@ recheck: | |||
3738 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4067 | } else if (TASK_PREEMPTS_CURR(p, rq)) |
3739 | resched_task(rq->curr); | 4068 | resched_task(rq->curr); |
3740 | } | 4069 | } |
3741 | task_rq_unlock(rq, &flags); | 4070 | __task_rq_unlock(rq); |
4071 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4072 | |||
4073 | rt_mutex_adjust_pi(p); | ||
4074 | |||
3742 | return 0; | 4075 | return 0; |
3743 | } | 4076 | } |
3744 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 4077 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
@@ -3760,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
3760 | read_unlock_irq(&tasklist_lock); | 4093 | read_unlock_irq(&tasklist_lock); |
3761 | return -ESRCH; | 4094 | return -ESRCH; |
3762 | } | 4095 | } |
3763 | retval = sched_setscheduler(p, policy, &lparam); | 4096 | get_task_struct(p); |
3764 | read_unlock_irq(&tasklist_lock); | 4097 | read_unlock_irq(&tasklist_lock); |
4098 | retval = sched_setscheduler(p, policy, &lparam); | ||
4099 | put_task_struct(p); | ||
3765 | return retval; | 4100 | return retval; |
3766 | } | 4101 | } |
3767 | 4102 | ||
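do_sched_setscheduler() now takes a reference on the task and drops tasklist_lock before calling sched_setscheduler(), which in turn documents (via the new BUG_ON(in_interrupt())) that it may take non-irq-protected locks and finishes with rt_mutex_adjust_pi(); the reference keeps the task from disappearing once the lock is gone. The syscall interface itself is unchanged; a minimal userspace caller of this path (RT policies need CAP_SYS_NICE) is sketched below.

/* Minimal userspace caller of the sched_setscheduler() path patched
 * above.  Switching to SCHED_FIFO normally requires root. */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");
		return 1;
	}
	printf("policy is now %d (SCHED_FIFO is %d)\n",
	       sched_getscheduler(0), SCHED_FIFO);
	return 0;
}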
@@ -3886,6 +4221,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
3886 | !capable(CAP_SYS_NICE)) | 4221 | !capable(CAP_SYS_NICE)) |
3887 | goto out_unlock; | 4222 | goto out_unlock; |
3888 | 4223 | ||
4224 | retval = security_task_setscheduler(p, 0, NULL); | ||
4225 | if (retval) | ||
4226 | goto out_unlock; | ||
4227 | |||
3889 | cpus_allowed = cpuset_cpus_allowed(p); | 4228 | cpus_allowed = cpuset_cpus_allowed(p); |
3890 | cpus_and(new_mask, new_mask, cpus_allowed); | 4229 | cpus_and(new_mask, new_mask, cpus_allowed); |
3891 | retval = set_cpus_allowed(p, new_mask); | 4230 | retval = set_cpus_allowed(p, new_mask); |
@@ -3954,7 +4293,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
3954 | if (!p) | 4293 | if (!p) |
3955 | goto out_unlock; | 4294 | goto out_unlock; |
3956 | 4295 | ||
3957 | retval = 0; | 4296 | retval = security_task_getscheduler(p); |
4297 | if (retval) | ||
4298 | goto out_unlock; | ||
4299 | |||
3958 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); | 4300 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); |
3959 | 4301 | ||
3960 | out_unlock: | 4302 | out_unlock: |
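Both affinity paths now consult the LSM hooks, security_task_setscheduler() before changing a mask and security_task_getscheduler() before reporting one, while the userspace interface stays the same. The short program below exercises that interface with the ordinary glibc wrappers; binding to CPU 0 is just an example choice.

/* Userspace affinity calls that land in the paths audited above:
 * pin this process to CPU 0, then read the mask back. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);			/* assumes CPU 0 is usable */
	if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
		perror("sched_setaffinity");
		return 1;
	}
	if (sched_getaffinity(0, sizeof(mask), &mask) == -1) {
		perror("sched_getaffinity");
		return 1;
	}
	printf("bound to CPU 0: %s\n", CPU_ISSET(0, &mask) ? "yes" : "no");
	return 0;
}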
@@ -4044,17 +4386,25 @@ asmlinkage long sys_sched_yield(void) | |||
4044 | return 0; | 4386 | return 0; |
4045 | } | 4387 | } |
4046 | 4388 | ||
4047 | static inline void __cond_resched(void) | 4389 | static inline int __resched_legal(void) |
4390 | { | ||
4391 | if (unlikely(preempt_count())) | ||
4392 | return 0; | ||
4393 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
4394 | return 0; | ||
4395 | return 1; | ||
4396 | } | ||
4397 | |||
4398 | static void __cond_resched(void) | ||
4048 | { | 4399 | { |
4400 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | ||
4401 | __might_sleep(__FILE__, __LINE__); | ||
4402 | #endif | ||
4049 | /* | 4403 | /* |
4050 | * The BKS might be reacquired before we have dropped | 4404 | * The BKS might be reacquired before we have dropped |
4051 | * PREEMPT_ACTIVE, which could trigger a second | 4405 | * PREEMPT_ACTIVE, which could trigger a second |
4052 | * cond_resched() call. | 4406 | * cond_resched() call. |
4053 | */ | 4407 | */ |
4054 | if (unlikely(preempt_count())) | ||
4055 | return; | ||
4056 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
4057 | return; | ||
4058 | do { | 4408 | do { |
4059 | add_preempt_count(PREEMPT_ACTIVE); | 4409 | add_preempt_count(PREEMPT_ACTIVE); |
4060 | schedule(); | 4410 | schedule(); |
@@ -4064,13 +4414,12 @@ static inline void __cond_resched(void) | |||
4064 | 4414 | ||
4065 | int __sched cond_resched(void) | 4415 | int __sched cond_resched(void) |
4066 | { | 4416 | { |
4067 | if (need_resched()) { | 4417 | if (need_resched() && __resched_legal()) { |
4068 | __cond_resched(); | 4418 | __cond_resched(); |
4069 | return 1; | 4419 | return 1; |
4070 | } | 4420 | } |
4071 | return 0; | 4421 | return 0; |
4072 | } | 4422 | } |
4073 | |||
4074 | EXPORT_SYMBOL(cond_resched); | 4423 | EXPORT_SYMBOL(cond_resched); |
4075 | 4424 | ||
4076 | /* | 4425 | /* |
@@ -4091,7 +4440,7 @@ int cond_resched_lock(spinlock_t *lock) | |||
4091 | ret = 1; | 4440 | ret = 1; |
4092 | spin_lock(lock); | 4441 | spin_lock(lock); |
4093 | } | 4442 | } |
4094 | if (need_resched()) { | 4443 | if (need_resched() && __resched_legal()) { |
4095 | _raw_spin_unlock(lock); | 4444 | _raw_spin_unlock(lock); |
4096 | preempt_enable_no_resched(); | 4445 | preempt_enable_no_resched(); |
4097 | __cond_resched(); | 4446 | __cond_resched(); |
@@ -4100,14 +4449,13 @@ int cond_resched_lock(spinlock_t *lock) | |||
4100 | } | 4449 | } |
4101 | return ret; | 4450 | return ret; |
4102 | } | 4451 | } |
4103 | |||
4104 | EXPORT_SYMBOL(cond_resched_lock); | 4452 | EXPORT_SYMBOL(cond_resched_lock); |
4105 | 4453 | ||
4106 | int __sched cond_resched_softirq(void) | 4454 | int __sched cond_resched_softirq(void) |
4107 | { | 4455 | { |
4108 | BUG_ON(!in_softirq()); | 4456 | BUG_ON(!in_softirq()); |
4109 | 4457 | ||
4110 | if (need_resched()) { | 4458 | if (need_resched() && __resched_legal()) { |
4111 | __local_bh_enable(); | 4459 | __local_bh_enable(); |
4112 | __cond_resched(); | 4460 | __cond_resched(); |
4113 | local_bh_disable(); | 4461 | local_bh_disable(); |
@@ -4115,10 +4463,8 @@ int __sched cond_resched_softirq(void) | |||
4115 | } | 4463 | } |
4116 | return 0; | 4464 | return 0; |
4117 | } | 4465 | } |
4118 | |||
4119 | EXPORT_SYMBOL(cond_resched_softirq); | 4466 | EXPORT_SYMBOL(cond_resched_softirq); |
4120 | 4467 | ||
4121 | |||
4122 | /** | 4468 | /** |
4123 | * yield - yield the current processor to other threads. | 4469 | * yield - yield the current processor to other threads. |
4124 | * | 4470 | * |
@@ -4142,7 +4488,7 @@ EXPORT_SYMBOL(yield); | |||
4142 | */ | 4488 | */ |
4143 | void __sched io_schedule(void) | 4489 | void __sched io_schedule(void) |
4144 | { | 4490 | { |
4145 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4491 | struct runqueue *rq = &__raw_get_cpu_var(runqueues); |
4146 | 4492 | ||
4147 | atomic_inc(&rq->nr_iowait); | 4493 | atomic_inc(&rq->nr_iowait); |
4148 | schedule(); | 4494 | schedule(); |
@@ -4153,7 +4499,7 @@ EXPORT_SYMBOL(io_schedule); | |||
4153 | 4499 | ||
4154 | long __sched io_schedule_timeout(long timeout) | 4500 | long __sched io_schedule_timeout(long timeout) |
4155 | { | 4501 | { |
4156 | struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); | 4502 | struct runqueue *rq = &__raw_get_cpu_var(runqueues); |
4157 | long ret; | 4503 | long ret; |
4158 | 4504 | ||
4159 | atomic_inc(&rq->nr_iowait); | 4505 | atomic_inc(&rq->nr_iowait); |
@@ -4237,7 +4583,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
4237 | if (retval) | 4583 | if (retval) |
4238 | goto out_unlock; | 4584 | goto out_unlock; |
4239 | 4585 | ||
4240 | jiffies_to_timespec(p->policy & SCHED_FIFO ? | 4586 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
4241 | 0 : task_timeslice(p), &t); | 4587 | 0 : task_timeslice(p), &t); |
4242 | read_unlock(&tasklist_lock); | 4588 | read_unlock(&tasklist_lock); |
4243 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4589 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
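The timeslice fix above replaces a bitwise test with an equality test: SCHED_FIFO is 1, so "p->policy & SCHED_FIFO" was also true for SCHED_BATCH (3 in this tree), making those tasks wrongly report a zero interval. Userspace sees the result through sched_rr_get_interval(2); a minimal query looks like this.

/* Query the timeslice reported by the fixed code path above. */
#include <sched.h>
#include <time.h>
#include <stdio.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == -1) {	/* 0 means this process */
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("timeslice: %ld.%09ld seconds\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}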
@@ -4363,7 +4709,7 @@ void __devinit init_idle(task_t *idle, int cpu) | |||
4363 | idle->timestamp = sched_clock(); | 4709 | idle->timestamp = sched_clock(); |
4364 | idle->sleep_avg = 0; | 4710 | idle->sleep_avg = 0; |
4365 | idle->array = NULL; | 4711 | idle->array = NULL; |
4366 | idle->prio = MAX_PRIO; | 4712 | idle->prio = idle->normal_prio = MAX_PRIO; |
4367 | idle->state = TASK_RUNNING; | 4713 | idle->state = TASK_RUNNING; |
4368 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4714 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
4369 | set_task_cpu(idle, cpu); | 4715 | set_task_cpu(idle, cpu); |
@@ -4459,13 +4805,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); | |||
4459 | * | 4805 | * |
4460 | * So we race with normal scheduler movements, but that's OK, as long | 4806 | * So we race with normal scheduler movements, but that's OK, as long |
4461 | * as the task is no longer on this CPU. | 4807 | * as the task is no longer on this CPU. |
4808 | * | ||
4809 | * Returns non-zero if task was successfully migrated. | ||
4462 | */ | 4810 | */ |
4463 | static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4811 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
4464 | { | 4812 | { |
4465 | runqueue_t *rq_dest, *rq_src; | 4813 | runqueue_t *rq_dest, *rq_src; |
4814 | int ret = 0; | ||
4466 | 4815 | ||
4467 | if (unlikely(cpu_is_offline(dest_cpu))) | 4816 | if (unlikely(cpu_is_offline(dest_cpu))) |
4468 | return; | 4817 | return ret; |
4469 | 4818 | ||
4470 | rq_src = cpu_rq(src_cpu); | 4819 | rq_src = cpu_rq(src_cpu); |
4471 | rq_dest = cpu_rq(dest_cpu); | 4820 | rq_dest = cpu_rq(dest_cpu); |
@@ -4493,9 +4842,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
4493 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 4842 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
4494 | resched_task(rq_dest->curr); | 4843 | resched_task(rq_dest->curr); |
4495 | } | 4844 | } |
4496 | 4845 | ret = 1; | |
4497 | out: | 4846 | out: |
4498 | double_rq_unlock(rq_src, rq_dest); | 4847 | double_rq_unlock(rq_src, rq_dest); |
4848 | return ret; | ||
4499 | } | 4849 | } |
4500 | 4850 | ||
4501 | /* | 4851 | /* |
@@ -4565,9 +4915,12 @@ wait_to_die: | |||
4565 | /* Figure out where task on dead CPU should go, use force if necessary. */ | 4915 | /* Figure out where task on dead CPU should go, use force if necessary. */ |
4566 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | 4916 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) |
4567 | { | 4917 | { |
4918 | runqueue_t *rq; | ||
4919 | unsigned long flags; | ||
4568 | int dest_cpu; | 4920 | int dest_cpu; |
4569 | cpumask_t mask; | 4921 | cpumask_t mask; |
4570 | 4922 | ||
4923 | restart: | ||
4571 | /* On same node? */ | 4924 | /* On same node? */ |
4572 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 4925 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
4573 | cpus_and(mask, mask, tsk->cpus_allowed); | 4926 | cpus_and(mask, mask, tsk->cpus_allowed); |
@@ -4579,8 +4932,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | |||
4579 | 4932 | ||
4580 | /* No more Mr. Nice Guy. */ | 4933 | /* No more Mr. Nice Guy. */ |
4581 | if (dest_cpu == NR_CPUS) { | 4934 | if (dest_cpu == NR_CPUS) { |
4935 | rq = task_rq_lock(tsk, &flags); | ||
4582 | cpus_setall(tsk->cpus_allowed); | 4936 | cpus_setall(tsk->cpus_allowed); |
4583 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 4937 | dest_cpu = any_online_cpu(tsk->cpus_allowed); |
4938 | task_rq_unlock(rq, &flags); | ||
4584 | 4939 | ||
4585 | /* | 4940 | /* |
4586 | * Don't tell them about moving exiting tasks or | 4941 | * Don't tell them about moving exiting tasks or |
@@ -4592,7 +4947,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | |||
4592 | "longer affine to cpu%d\n", | 4947 | "longer affine to cpu%d\n", |
4593 | tsk->pid, tsk->comm, dead_cpu); | 4948 | tsk->pid, tsk->comm, dead_cpu); |
4594 | } | 4949 | } |
4595 | __migrate_task(tsk, dead_cpu, dest_cpu); | 4950 | if (!__migrate_task(tsk, dead_cpu, dest_cpu)) |
4951 | goto restart; | ||
4596 | } | 4952 | } |
4597 | 4953 | ||
4598 | /* | 4954 | /* |
@@ -4719,8 +5075,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
4719 | * migration_call - callback that gets triggered when a CPU is added. | 5075 | * migration_call - callback that gets triggered when a CPU is added. |
4720 | * Here we can start up the necessary migration thread for the new CPU. | 5076 | * Here we can start up the necessary migration thread for the new CPU. |
4721 | */ | 5077 | */ |
4722 | static int migration_call(struct notifier_block *nfb, unsigned long action, | 5078 | static int __cpuinit migration_call(struct notifier_block *nfb, |
4723 | void *hcpu) | 5079 | unsigned long action, |
5080 | void *hcpu) | ||
4724 | { | 5081 | { |
4725 | int cpu = (long)hcpu; | 5082 | int cpu = (long)hcpu; |
4726 | struct task_struct *p; | 5083 | struct task_struct *p; |
@@ -4746,6 +5103,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4746 | break; | 5103 | break; |
4747 | #ifdef CONFIG_HOTPLUG_CPU | 5104 | #ifdef CONFIG_HOTPLUG_CPU |
4748 | case CPU_UP_CANCELED: | 5105 | case CPU_UP_CANCELED: |
5106 | if (!cpu_rq(cpu)->migration_thread) | ||
5107 | break; | ||
4749 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 5108 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
4750 | kthread_bind(cpu_rq(cpu)->migration_thread, | 5109 | kthread_bind(cpu_rq(cpu)->migration_thread, |
4751 | any_online_cpu(cpu_online_map)); | 5110 | any_online_cpu(cpu_online_map)); |
@@ -4788,7 +5147,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4788 | /* Register at highest priority so that task migration (migrate_all_tasks) | 5147 | /* Register at highest priority so that task migration (migrate_all_tasks) |
4789 | * happens before everything else. | 5148 | * happens before everything else. |
4790 | */ | 5149 | */ |
4791 | static struct notifier_block migration_notifier = { | 5150 | static struct notifier_block __cpuinitdata migration_notifier = { |
4792 | .notifier_call = migration_call, | 5151 | .notifier_call = migration_call, |
4793 | .priority = 10 | 5152 | .priority = 10 |
4794 | }; | 5153 | }; |
@@ -5589,6 +5948,7 @@ static cpumask_t sched_domain_node_span(int node) | |||
5589 | } | 5948 | } |
5590 | #endif | 5949 | #endif |
5591 | 5950 | ||
5951 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
5592 | /* | 5952 | /* |
5593 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | 5953 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we |
5594 | * can switch it on easily if needed. | 5954 | * can switch it on easily if needed. |
@@ -5604,7 +5964,7 @@ static int cpu_to_cpu_group(int cpu) | |||
5604 | 5964 | ||
5605 | #ifdef CONFIG_SCHED_MC | 5965 | #ifdef CONFIG_SCHED_MC |
5606 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 5966 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
5607 | static struct sched_group sched_group_core[NR_CPUS]; | 5967 | static struct sched_group *sched_group_core_bycpu[NR_CPUS]; |
5608 | #endif | 5968 | #endif |
5609 | 5969 | ||
5610 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 5970 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
@@ -5620,7 +5980,7 @@ static int cpu_to_core_group(int cpu) | |||
5620 | #endif | 5980 | #endif |
5621 | 5981 | ||
5622 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 5982 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
5623 | static struct sched_group sched_group_phys[NR_CPUS]; | 5983 | static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; |
5624 | static int cpu_to_phys_group(int cpu) | 5984 | static int cpu_to_phys_group(int cpu) |
5625 | { | 5985 | { |
5626 | #if defined(CONFIG_SCHED_MC) | 5986 | #if defined(CONFIG_SCHED_MC) |
@@ -5677,13 +6037,74 @@ next_sg: | |||
5677 | } | 6037 | } |
5678 | #endif | 6038 | #endif |
5679 | 6039 | ||
6040 | /* Free memory allocated for various sched_group structures */ | ||
6041 | static void free_sched_groups(const cpumask_t *cpu_map) | ||
6042 | { | ||
6043 | int cpu; | ||
6044 | #ifdef CONFIG_NUMA | ||
6045 | int i; | ||
6046 | |||
6047 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6048 | struct sched_group *sched_group_allnodes | ||
6049 | = sched_group_allnodes_bycpu[cpu]; | ||
6050 | struct sched_group **sched_group_nodes | ||
6051 | = sched_group_nodes_bycpu[cpu]; | ||
6052 | |||
6053 | if (sched_group_allnodes) { | ||
6054 | kfree(sched_group_allnodes); | ||
6055 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
6056 | } | ||
6057 | |||
6058 | if (!sched_group_nodes) | ||
6059 | continue; | ||
6060 | |||
6061 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
6062 | cpumask_t nodemask = node_to_cpumask(i); | ||
6063 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
6064 | |||
6065 | cpus_and(nodemask, nodemask, *cpu_map); | ||
6066 | if (cpus_empty(nodemask)) | ||
6067 | continue; | ||
6068 | |||
6069 | if (sg == NULL) | ||
6070 | continue; | ||
6071 | sg = sg->next; | ||
6072 | next_sg: | ||
6073 | oldsg = sg; | ||
6074 | sg = sg->next; | ||
6075 | kfree(oldsg); | ||
6076 | if (oldsg != sched_group_nodes[i]) | ||
6077 | goto next_sg; | ||
6078 | } | ||
6079 | kfree(sched_group_nodes); | ||
6080 | sched_group_nodes_bycpu[cpu] = NULL; | ||
6081 | } | ||
6082 | #endif | ||
6083 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6084 | if (sched_group_phys_bycpu[cpu]) { | ||
6085 | kfree(sched_group_phys_bycpu[cpu]); | ||
6086 | sched_group_phys_bycpu[cpu] = NULL; | ||
6087 | } | ||
6088 | #ifdef CONFIG_SCHED_MC | ||
6089 | if (sched_group_core_bycpu[cpu]) { | ||
6090 | kfree(sched_group_core_bycpu[cpu]); | ||
6091 | sched_group_core_bycpu[cpu] = NULL; | ||
6092 | } | ||
6093 | #endif | ||
6094 | } | ||
6095 | } | ||
6096 | |||
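free_sched_groups() tears down the per-node sched_group rings that build_sched_domains() now allocates: each ring is circular and singly linked, so the walk starts at head->next, frees nodes as it goes, stops once the head itself has been freed, and finally clears the per-cpu pointers so a later rebuild starts clean. The standalone sketch below shows just that teardown pattern with a made-up 'struct group'; it is ordinary C, not kernel code.

/* Standalone sketch of the circular-list teardown in free_sched_groups():
 * every node is freed exactly once and the walk stops when it has freed
 * the head it started from. */
#include <stdlib.h>
#include <stdio.h>

struct group {
	int id;
	struct group *next;		/* circular: last->next == head */
};

static void free_ring(struct group *head)
{
	struct group *sg, *oldsg;

	if (!head)
		return;
	sg = head->next;
	do {
		oldsg = sg;		/* remember the node ... */
		sg = sg->next;		/* ... step past it ... */
		free(oldsg);		/* ... then free it */
	} while (oldsg != head);	/* head is freed last */
}

int main(void)
{
	struct group *a = malloc(sizeof(*a));
	struct group *b = malloc(sizeof(*b));
	struct group *c = malloc(sizeof(*c));

	a->id = 0; b->id = 1; c->id = 2;
	a->next = b; b->next = c; c->next = a;	/* close the ring */

	free_ring(a);
	puts("ring freed");
	return 0;
}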
5680 | /* | 6097 | /* |
5681 | * Build sched domains for a given set of cpus and attach the sched domains | 6098 | * Build sched domains for a given set of cpus and attach the sched domains |
5682 | * to the individual cpus | 6099 | * to the individual cpus |
5683 | */ | 6100 | */ |
5684 | void build_sched_domains(const cpumask_t *cpu_map) | 6101 | static int build_sched_domains(const cpumask_t *cpu_map) |
5685 | { | 6102 | { |
5686 | int i; | 6103 | int i; |
6104 | struct sched_group *sched_group_phys = NULL; | ||
6105 | #ifdef CONFIG_SCHED_MC | ||
6106 | struct sched_group *sched_group_core = NULL; | ||
6107 | #endif | ||
5687 | #ifdef CONFIG_NUMA | 6108 | #ifdef CONFIG_NUMA |
5688 | struct sched_group **sched_group_nodes = NULL; | 6109 | struct sched_group **sched_group_nodes = NULL; |
5689 | struct sched_group *sched_group_allnodes = NULL; | 6110 | struct sched_group *sched_group_allnodes = NULL; |
@@ -5691,11 +6112,11 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5691 | /* | 6112 | /* |
5692 | * Allocate the per-node list of sched groups | 6113 | * Allocate the per-node list of sched groups |
5693 | */ | 6114 | */ |
5694 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 6115 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, |
5695 | GFP_ATOMIC); | 6116 | GFP_KERNEL); |
5696 | if (!sched_group_nodes) { | 6117 | if (!sched_group_nodes) { |
5697 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6118 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
5698 | return; | 6119 | return -ENOMEM; |
5699 | } | 6120 | } |
5700 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6121 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
5701 | #endif | 6122 | #endif |
@@ -5721,7 +6142,7 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5721 | if (!sched_group_allnodes) { | 6142 | if (!sched_group_allnodes) { |
5722 | printk(KERN_WARNING | 6143 | printk(KERN_WARNING |
5723 | "Can not alloc allnodes sched group\n"); | 6144 | "Can not alloc allnodes sched group\n"); |
5724 | break; | 6145 | goto error; |
5725 | } | 6146 | } |
5726 | sched_group_allnodes_bycpu[i] | 6147 | sched_group_allnodes_bycpu[i] |
5727 | = sched_group_allnodes; | 6148 | = sched_group_allnodes; |
@@ -5742,6 +6163,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5742 | cpus_and(sd->span, sd->span, *cpu_map); | 6163 | cpus_and(sd->span, sd->span, *cpu_map); |
5743 | #endif | 6164 | #endif |
5744 | 6165 | ||
6166 | if (!sched_group_phys) { | ||
6167 | sched_group_phys | ||
6168 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
6169 | GFP_KERNEL); | ||
6170 | if (!sched_group_phys) { | ||
6171 | printk (KERN_WARNING "Can not alloc phys sched" | ||
6172 | "group\n"); | ||
6173 | goto error; | ||
6174 | } | ||
6175 | sched_group_phys_bycpu[i] = sched_group_phys; | ||
6176 | } | ||
6177 | |||
5745 | p = sd; | 6178 | p = sd; |
5746 | sd = &per_cpu(phys_domains, i); | 6179 | sd = &per_cpu(phys_domains, i); |
5747 | group = cpu_to_phys_group(i); | 6180 | group = cpu_to_phys_group(i); |
@@ -5751,6 +6184,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5751 | sd->groups = &sched_group_phys[group]; | 6184 | sd->groups = &sched_group_phys[group]; |
5752 | 6185 | ||
5753 | #ifdef CONFIG_SCHED_MC | 6186 | #ifdef CONFIG_SCHED_MC |
6187 | if (!sched_group_core) { | ||
6188 | sched_group_core | ||
6189 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
6190 | GFP_KERNEL); | ||
6191 | if (!sched_group_core) { | ||
6192 | printk (KERN_WARNING "Can not alloc core sched" | ||
6193 | "group\n"); | ||
6194 | goto error; | ||
6195 | } | ||
6196 | sched_group_core_bycpu[i] = sched_group_core; | ||
6197 | } | ||
6198 | |||
5754 | p = sd; | 6199 | p = sd; |
5755 | sd = &per_cpu(core_domains, i); | 6200 | sd = &per_cpu(core_domains, i); |
5756 | group = cpu_to_core_group(i); | 6201 | group = cpu_to_core_group(i); |
@@ -5834,24 +6279,21 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5834 | domainspan = sched_domain_node_span(i); | 6279 | domainspan = sched_domain_node_span(i); |
5835 | cpus_and(domainspan, domainspan, *cpu_map); | 6280 | cpus_and(domainspan, domainspan, *cpu_map); |
5836 | 6281 | ||
5837 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6282 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
6283 | if (!sg) { | ||
6284 | printk(KERN_WARNING "Can not alloc domain group for " | ||
6285 | "node %d\n", i); | ||
6286 | goto error; | ||
6287 | } | ||
5838 | sched_group_nodes[i] = sg; | 6288 | sched_group_nodes[i] = sg; |
5839 | for_each_cpu_mask(j, nodemask) { | 6289 | for_each_cpu_mask(j, nodemask) { |
5840 | struct sched_domain *sd; | 6290 | struct sched_domain *sd; |
5841 | sd = &per_cpu(node_domains, j); | 6291 | sd = &per_cpu(node_domains, j); |
5842 | sd->groups = sg; | 6292 | sd->groups = sg; |
5843 | if (sd->groups == NULL) { | ||
5844 | /* Turn off balancing if we have no groups */ | ||
5845 | sd->flags = 0; | ||
5846 | } | ||
5847 | } | ||
5848 | if (!sg) { | ||
5849 | printk(KERN_WARNING | ||
5850 | "Can not alloc domain group for node %d\n", i); | ||
5851 | continue; | ||
5852 | } | 6293 | } |
5853 | sg->cpu_power = 0; | 6294 | sg->cpu_power = 0; |
5854 | sg->cpumask = nodemask; | 6295 | sg->cpumask = nodemask; |
6296 | sg->next = sg; | ||
5855 | cpus_or(covered, covered, nodemask); | 6297 | cpus_or(covered, covered, nodemask); |
5856 | prev = sg; | 6298 | prev = sg; |
5857 | 6299 | ||
@@ -5870,54 +6312,90 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5870 | if (cpus_empty(tmp)) | 6312 | if (cpus_empty(tmp)) |
5871 | continue; | 6313 | continue; |
5872 | 6314 | ||
5873 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6315 | sg = kmalloc_node(sizeof(struct sched_group), |
6316 | GFP_KERNEL, i); | ||
5874 | if (!sg) { | 6317 | if (!sg) { |
5875 | printk(KERN_WARNING | 6318 | printk(KERN_WARNING |
5876 | "Can not alloc domain group for node %d\n", j); | 6319 | "Can not alloc domain group for node %d\n", j); |
5877 | break; | 6320 | goto error; |
5878 | } | 6321 | } |
5879 | sg->cpu_power = 0; | 6322 | sg->cpu_power = 0; |
5880 | sg->cpumask = tmp; | 6323 | sg->cpumask = tmp; |
6324 | sg->next = prev->next; | ||
5881 | cpus_or(covered, covered, tmp); | 6325 | cpus_or(covered, covered, tmp); |
5882 | prev->next = sg; | 6326 | prev->next = sg; |
5883 | prev = sg; | 6327 | prev = sg; |
5884 | } | 6328 | } |
5885 | prev->next = sched_group_nodes[i]; | ||
5886 | } | 6329 | } |
5887 | #endif | 6330 | #endif |
5888 | 6331 | ||
5889 | /* Calculate CPU power for physical packages and nodes */ | 6332 | /* Calculate CPU power for physical packages and nodes */ |
6333 | #ifdef CONFIG_SCHED_SMT | ||
5890 | for_each_cpu_mask(i, *cpu_map) { | 6334 | for_each_cpu_mask(i, *cpu_map) { |
5891 | int power; | ||
5892 | struct sched_domain *sd; | 6335 | struct sched_domain *sd; |
5893 | #ifdef CONFIG_SCHED_SMT | ||
5894 | sd = &per_cpu(cpu_domains, i); | 6336 | sd = &per_cpu(cpu_domains, i); |
5895 | power = SCHED_LOAD_SCALE; | 6337 | sd->groups->cpu_power = SCHED_LOAD_SCALE; |
5896 | sd->groups->cpu_power = power; | 6338 | } |
5897 | #endif | 6339 | #endif |
5898 | #ifdef CONFIG_SCHED_MC | 6340 | #ifdef CONFIG_SCHED_MC |
6341 | for_each_cpu_mask(i, *cpu_map) { | ||
6342 | int power; | ||
6343 | struct sched_domain *sd; | ||
5899 | sd = &per_cpu(core_domains, i); | 6344 | sd = &per_cpu(core_domains, i); |
5900 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | 6345 | if (sched_smt_power_savings) |
6346 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); | ||
6347 | else | ||
6348 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | ||
5901 | * SCHED_LOAD_SCALE / 10; | 6349 | * SCHED_LOAD_SCALE / 10; |
5902 | sd->groups->cpu_power = power; | 6350 | sd->groups->cpu_power = power; |
6351 | } | ||
6352 | #endif | ||
5903 | 6353 | ||
6354 | for_each_cpu_mask(i, *cpu_map) { | ||
6355 | struct sched_domain *sd; | ||
6356 | #ifdef CONFIG_SCHED_MC | ||
5904 | sd = &per_cpu(phys_domains, i); | 6357 | sd = &per_cpu(phys_domains, i); |
6358 | if (i != first_cpu(sd->groups->cpumask)) | ||
6359 | continue; | ||
5905 | 6360 | ||
5906 | /* | 6361 | sd->groups->cpu_power = 0; |
5907 | * This has to be < 2 * SCHED_LOAD_SCALE | 6362 | if (sched_mc_power_savings || sched_smt_power_savings) { |
5908 | * Lets keep it SCHED_LOAD_SCALE, so that | 6363 | int j; |
5909 | * while calculating NUMA group's cpu_power | 6364 | |
5910 | * we can simply do | 6365 | for_each_cpu_mask(j, sd->groups->cpumask) { |
5911 | * numa_group->cpu_power += phys_group->cpu_power; | 6366 | struct sched_domain *sd1; |
5912 | * | 6367 | sd1 = &per_cpu(core_domains, j); |
5913 | * See "only add power once for each physical pkg" | 6368 | /* |
5914 | * comment below | 6369 | * for each core we will add once |
5915 | */ | 6370 | * to the group in physical domain |
5916 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | 6371 | */ |
6372 | if (j != first_cpu(sd1->groups->cpumask)) | ||
6373 | continue; | ||
6374 | |||
6375 | if (sched_smt_power_savings) | ||
6376 | sd->groups->cpu_power += sd1->groups->cpu_power; | ||
6377 | else | ||
6378 | sd->groups->cpu_power += SCHED_LOAD_SCALE; | ||
6379 | } | ||
6380 | } else | ||
6381 | /* | ||
6382 | * This has to be < 2 * SCHED_LOAD_SCALE | ||
6383 | * Lets keep it SCHED_LOAD_SCALE, so that | ||
6384 | * while calculating NUMA group's cpu_power | ||
6385 | * we can simply do | ||
6386 | * numa_group->cpu_power += phys_group->cpu_power; | ||
6387 | * | ||
6388 | * See "only add power once for each physical pkg" | ||
6389 | * comment below | ||
6390 | */ | ||
6391 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | ||
5917 | #else | 6392 | #else |
6393 | int power; | ||
5918 | sd = &per_cpu(phys_domains, i); | 6394 | sd = &per_cpu(phys_domains, i); |
5919 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 6395 | if (sched_smt_power_savings) |
5920 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 6396 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); |
6397 | else | ||
6398 | power = SCHED_LOAD_SCALE; | ||
5921 | sd->groups->cpu_power = power; | 6399 | sd->groups->cpu_power = power; |
5922 | #endif | 6400 | #endif |
5923 | } | 6401 | } |
@@ -5945,13 +6423,20 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5945 | * Tune cache-hot values: | 6423 | * Tune cache-hot values: |
5946 | */ | 6424 | */ |
5947 | calibrate_migration_costs(cpu_map); | 6425 | calibrate_migration_costs(cpu_map); |
6426 | |||
6427 | return 0; | ||
6428 | |||
6429 | error: | ||
6430 | free_sched_groups(cpu_map); | ||
6431 | return -ENOMEM; | ||
5948 | } | 6432 | } |
5949 | /* | 6433 | /* |
5950 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6434 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
5951 | */ | 6435 | */ |
5952 | static void arch_init_sched_domains(const cpumask_t *cpu_map) | 6436 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
5953 | { | 6437 | { |
5954 | cpumask_t cpu_default_map; | 6438 | cpumask_t cpu_default_map; |
6439 | int err; | ||
5955 | 6440 | ||
5956 | /* | 6441 | /* |
5957 | * Setup mask for cpus without special case scheduling requirements. | 6442 | * Setup mask for cpus without special case scheduling requirements. |
@@ -5960,51 +6445,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map) | |||
5960 | */ | 6445 | */ |
5961 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | 6446 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); |
5962 | 6447 | ||
5963 | build_sched_domains(&cpu_default_map); | 6448 | err = build_sched_domains(&cpu_default_map); |
6449 | |||
6450 | return err; | ||
5964 | } | 6451 | } |
5965 | 6452 | ||
5966 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6453 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
5967 | { | 6454 | { |
5968 | #ifdef CONFIG_NUMA | 6455 | free_sched_groups(cpu_map); |
5969 | int i; | ||
5970 | int cpu; | ||
5971 | |||
5972 | for_each_cpu_mask(cpu, *cpu_map) { | ||
5973 | struct sched_group *sched_group_allnodes | ||
5974 | = sched_group_allnodes_bycpu[cpu]; | ||
5975 | struct sched_group **sched_group_nodes | ||
5976 | = sched_group_nodes_bycpu[cpu]; | ||
5977 | |||
5978 | if (sched_group_allnodes) { | ||
5979 | kfree(sched_group_allnodes); | ||
5980 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
5981 | } | ||
5982 | |||
5983 | if (!sched_group_nodes) | ||
5984 | continue; | ||
5985 | |||
5986 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
5987 | cpumask_t nodemask = node_to_cpumask(i); | ||
5988 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
5989 | |||
5990 | cpus_and(nodemask, nodemask, *cpu_map); | ||
5991 | if (cpus_empty(nodemask)) | ||
5992 | continue; | ||
5993 | |||
5994 | if (sg == NULL) | ||
5995 | continue; | ||
5996 | sg = sg->next; | ||
5997 | next_sg: | ||
5998 | oldsg = sg; | ||
5999 | sg = sg->next; | ||
6000 | kfree(oldsg); | ||
6001 | if (oldsg != sched_group_nodes[i]) | ||
6002 | goto next_sg; | ||
6003 | } | ||
6004 | kfree(sched_group_nodes); | ||
6005 | sched_group_nodes_bycpu[cpu] = NULL; | ||
6006 | } | ||
6007 | #endif | ||
6008 | } | 6456 | } |
6009 | 6457 | ||
6010 | /* | 6458 | /* |
@@ -6029,9 +6477,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6029 | * correct sched domains | 6477 | * correct sched domains |
6030 | * Call with hotplug lock held | 6478 | * Call with hotplug lock held |
6031 | */ | 6479 | */ |
6032 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | 6480 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) |
6033 | { | 6481 | { |
6034 | cpumask_t change_map; | 6482 | cpumask_t change_map; |
6483 | int err = 0; | ||
6035 | 6484 | ||
6036 | cpus_and(*partition1, *partition1, cpu_online_map); | 6485 | cpus_and(*partition1, *partition1, cpu_online_map); |
6037 | cpus_and(*partition2, *partition2, cpu_online_map); | 6486 | cpus_and(*partition2, *partition2, cpu_online_map); |
@@ -6040,10 +6489,86 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
6040 | /* Detach sched domains from all of the affected cpus */ | 6489 | /* Detach sched domains from all of the affected cpus */ |
6041 | detach_destroy_domains(&change_map); | 6490 | detach_destroy_domains(&change_map); |
6042 | if (!cpus_empty(*partition1)) | 6491 | if (!cpus_empty(*partition1)) |
6043 | build_sched_domains(partition1); | 6492 | err = build_sched_domains(partition1); |
6044 | if (!cpus_empty(*partition2)) | 6493 | if (!err && !cpus_empty(*partition2)) |
6045 | build_sched_domains(partition2); | 6494 | err = build_sched_domains(partition2); |
6495 | |||
6496 | return err; | ||
6497 | } | ||
6498 | |||
6499 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6500 | int arch_reinit_sched_domains(void) | ||
6501 | { | ||
6502 | int err; | ||
6503 | |||
6504 | lock_cpu_hotplug(); | ||
6505 | detach_destroy_domains(&cpu_online_map); | ||
6506 | err = arch_init_sched_domains(&cpu_online_map); | ||
6507 | unlock_cpu_hotplug(); | ||
6508 | |||
6509 | return err; | ||
6510 | } | ||
6511 | |||
6512 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6513 | { | ||
6514 | int ret; | ||
6515 | |||
6516 | if (buf[0] != '0' && buf[0] != '1') | ||
6517 | return -EINVAL; | ||
6518 | |||
6519 | if (smt) | ||
6520 | sched_smt_power_savings = (buf[0] == '1'); | ||
6521 | else | ||
6522 | sched_mc_power_savings = (buf[0] == '1'); | ||
6523 | |||
6524 | ret = arch_reinit_sched_domains(); | ||
6525 | |||
6526 | return ret ? ret : count; | ||
6527 | } | ||
6528 | |||
6529 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | ||
6530 | { | ||
6531 | int err = 0; | ||
6532 | #ifdef CONFIG_SCHED_SMT | ||
6533 | if (smt_capable()) | ||
6534 | err = sysfs_create_file(&cls->kset.kobj, | ||
6535 | &attr_sched_smt_power_savings.attr); | ||
6536 | #endif | ||
6537 | #ifdef CONFIG_SCHED_MC | ||
6538 | if (!err && mc_capable()) | ||
6539 | err = sysfs_create_file(&cls->kset.kobj, | ||
6540 | &attr_sched_mc_power_savings.attr); | ||
6541 | #endif | ||
6542 | return err; | ||
6543 | } | ||
6544 | #endif | ||
6545 | |||
6546 | #ifdef CONFIG_SCHED_MC | ||
6547 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | ||
6548 | { | ||
6549 | return sprintf(page, "%u\n", sched_mc_power_savings); | ||
6550 | } | ||
6551 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count) | ||
6552 | { | ||
6553 | return sched_power_savings_store(buf, count, 0); | ||
6046 | } | 6554 | } |
6555 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | ||
6556 | sched_mc_power_savings_store); | ||
6557 | #endif | ||
6558 | |||
6559 | #ifdef CONFIG_SCHED_SMT | ||
6560 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | ||
6561 | { | ||
6562 | return sprintf(page, "%u\n", sched_smt_power_savings); | ||
6563 | } | ||
6564 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count) | ||
6565 | { | ||
6566 | return sched_power_savings_store(buf, count, 1); | ||
6567 | } | ||
6568 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | ||
6569 | sched_smt_power_savings_store); | ||
6570 | #endif | ||
6571 | |||
6047 | 6572 | ||
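The new SYSDEV_ATTR entries expose sched_mc_power_savings and sched_smt_power_savings as writable 0/1 files, and the store handler rebuilds the scheduler domains through arch_reinit_sched_domains() under the hotplug lock. The snippet below flips the multi-core knob from userspace; the sysfs path is the usual location for the cpu sysdev class and may differ on other layouts, and writing it needs root.

/* Toggle the new multi-core power-savings knob from userspace.
 * Path is an assumption about the cpu sysdev layout; adjust if needed. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/cpu/sched_mc_power_savings";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1", f);		/* the store handler accepts only "0" or "1" */
	if (fclose(f) != 0) {
		perror("write");
		return 1;
	}
	return 0;
}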
6048 | #ifdef CONFIG_HOTPLUG_CPU | 6573 | #ifdef CONFIG_HOTPLUG_CPU |
6049 | /* | 6574 | /* |
@@ -6126,7 +6651,6 @@ void __init sched_init(void) | |||
6126 | rq->push_cpu = 0; | 6651 | rq->push_cpu = 0; |
6127 | rq->migration_thread = NULL; | 6652 | rq->migration_thread = NULL; |
6128 | INIT_LIST_HEAD(&rq->migration_queue); | 6653 | INIT_LIST_HEAD(&rq->migration_queue); |
6129 | rq->cpu = i; | ||
6130 | #endif | 6654 | #endif |
6131 | atomic_set(&rq->nr_iowait, 0); | 6655 | atomic_set(&rq->nr_iowait, 0); |
6132 | 6656 | ||
@@ -6141,6 +6665,7 @@ void __init sched_init(void) | |||
6141 | } | 6665 | } |
6142 | } | 6666 | } |
6143 | 6667 | ||
6668 | set_load_weight(&init_task); | ||
6144 | /* | 6669 | /* |
6145 | * The boot idle thread does lazy MMU switching as well: | 6670 | * The boot idle thread does lazy MMU switching as well: |
6146 | */ | 6671 | */ |
@@ -6187,11 +6712,12 @@ void normalize_rt_tasks(void) | |||
6187 | runqueue_t *rq; | 6712 | runqueue_t *rq; |
6188 | 6713 | ||
6189 | read_lock_irq(&tasklist_lock); | 6714 | read_lock_irq(&tasklist_lock); |
6190 | for_each_process (p) { | 6715 | for_each_process(p) { |
6191 | if (!rt_task(p)) | 6716 | if (!rt_task(p)) |
6192 | continue; | 6717 | continue; |
6193 | 6718 | ||
6194 | rq = task_rq_lock(p, &flags); | 6719 | spin_lock_irqsave(&p->pi_lock, flags); |
6720 | rq = __task_rq_lock(p); | ||
6195 | 6721 | ||
6196 | array = p->array; | 6722 | array = p->array; |
6197 | if (array) | 6723 | if (array) |
@@ -6202,7 +6728,8 @@ void normalize_rt_tasks(void) | |||
6202 | resched_task(rq->curr); | 6728 | resched_task(rq->curr); |
6203 | } | 6729 | } |
6204 | 6730 | ||
6205 | task_rq_unlock(rq, &flags); | 6731 | __task_rq_unlock(rq); |
6732 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
6206 | } | 6733 | } |
6207 | read_unlock_irq(&tasklist_lock); | 6734 | read_unlock_irq(&tasklist_lock); |
6208 | } | 6735 | } |
diff --git a/kernel/signal.c b/kernel/signal.c index e5f8aea78ffe..7fe874d12fae 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -10,7 +10,6 @@ | |||
10 | * to allow signals to be sent reliably. | 10 | * to allow signals to be sent reliably. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/config.h> | ||
14 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
15 | #include <linux/module.h> | 14 | #include <linux/module.h> |
16 | #include <linux/smp_lock.h> | 15 | #include <linux/smp_lock.h> |
@@ -23,12 +22,12 @@ | |||
23 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
24 | #include <linux/ptrace.h> | 23 | #include <linux/ptrace.h> |
25 | #include <linux/signal.h> | 24 | #include <linux/signal.h> |
26 | #include <linux/audit.h> | ||
27 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
28 | #include <asm/param.h> | 26 | #include <asm/param.h> |
29 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
30 | #include <asm/unistd.h> | 28 | #include <asm/unistd.h> |
31 | #include <asm/siginfo.h> | 29 | #include <asm/siginfo.h> |
30 | #include "audit.h" /* audit_signal_info() */ | ||
32 | 31 | ||
33 | /* | 32 | /* |
34 | * SLAB caches for signal bits. | 33 | * SLAB caches for signal bits. |
@@ -584,7 +583,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
584 | && !capable(CAP_KILL)) | 583 | && !capable(CAP_KILL)) |
585 | return error; | 584 | return error; |
586 | 585 | ||
587 | error = security_task_kill(t, info, sig); | 586 | error = security_task_kill(t, info, sig, 0); |
588 | if (!error) | 587 | if (!error) |
589 | audit_signal_info(sig, t); /* Let audit system see the signal */ | 588 | audit_signal_info(sig, t); /* Let audit system see the signal */ |
590 | return error; | 589 | return error; |
@@ -1107,7 +1106,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) | |||
1107 | 1106 | ||
1108 | /* like kill_proc_info(), but doesn't use uid/euid of "current" */ | 1107 | /* like kill_proc_info(), but doesn't use uid/euid of "current" */ |
1109 | int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | 1108 | int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, |
1110 | uid_t uid, uid_t euid) | 1109 | uid_t uid, uid_t euid, u32 secid) |
1111 | { | 1110 | { |
1112 | int ret = -EINVAL; | 1111 | int ret = -EINVAL; |
1113 | struct task_struct *p; | 1112 | struct task_struct *p; |
@@ -1127,6 +1126,9 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | |||
1127 | ret = -EPERM; | 1126 | ret = -EPERM; |
1128 | goto out_unlock; | 1127 | goto out_unlock; |
1129 | } | 1128 | } |
1129 | ret = security_task_kill(p, info, sig, secid); | ||
1130 | if (ret) | ||
1131 | goto out_unlock; | ||
1130 | if (sig && p->sighand) { | 1132 | if (sig && p->sighand) { |
1131 | unsigned long flags; | 1133 | unsigned long flags; |
1132 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1134 | spin_lock_irqsave(&p->sighand->siglock, flags); |
@@ -1531,6 +1533,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) | |||
1531 | spin_unlock_irqrestore(&sighand->siglock, flags); | 1533 | spin_unlock_irqrestore(&sighand->siglock, flags); |
1532 | } | 1534 | } |
1533 | 1535 | ||
1536 | static inline int may_ptrace_stop(void) | ||
1537 | { | ||
1538 | if (!likely(current->ptrace & PT_PTRACED)) | ||
1539 | return 0; | ||
1540 | |||
1541 | if (unlikely(current->parent == current->real_parent && | ||
1542 | (current->ptrace & PT_ATTACHED))) | ||
1543 | return 0; | ||
1544 | |||
1545 | if (unlikely(current->signal == current->parent->signal) && | ||
1546 | unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) | ||
1547 | return 0; | ||
1548 | |||
1549 | /* | ||
1550 | * Are we in the middle of do_coredump? | ||
1551 | * If so and our tracer is also part of the coredump stopping | ||
1552 | * is a deadlock situation, and pointless because our tracer | ||
1553 | * is dead so don't allow us to stop. | ||
1554 | * If SIGKILL was already sent before the caller unlocked | ||
1555 | * ->siglock we must see ->core_waiters != 0. Otherwise it | ||
1556 | * is safe to enter schedule(). | ||
1557 | */ | ||
1558 | if (unlikely(current->mm->core_waiters) && | ||
1559 | unlikely(current->mm == current->parent->mm)) | ||
1560 | return 0; | ||
1561 | |||
1562 | return 1; | ||
1563 | } | ||
1564 | |||
1534 | /* | 1565 | /* |
1535 | * This must be called with current->sighand->siglock held. | 1566 | * This must be called with current->sighand->siglock held. |
1536 | * | 1567 | * |
@@ -1559,11 +1590,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) | |||
1559 | spin_unlock_irq(&current->sighand->siglock); | 1590 | spin_unlock_irq(&current->sighand->siglock); |
1560 | try_to_freeze(); | 1591 | try_to_freeze(); |
1561 | read_lock(&tasklist_lock); | 1592 | read_lock(&tasklist_lock); |
1562 | if (likely(current->ptrace & PT_PTRACED) && | 1593 | if (may_ptrace_stop()) { |
1563 | likely(current->parent != current->real_parent || | ||
1564 | !(current->ptrace & PT_ATTACHED)) && | ||
1565 | (likely(current->parent->signal != current->signal) || | ||
1566 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { | ||
1567 | do_notify_parent_cldstop(current, CLD_TRAPPED); | 1594 | do_notify_parent_cldstop(current, CLD_TRAPPED); |
1568 | read_unlock(&tasklist_lock); | 1595 | read_unlock(&tasklist_lock); |
1569 | schedule(); | 1596 | schedule(); |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 336f92d64e2e..8f03e3b89b55 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu) | |||
446 | } | 446 | } |
447 | #endif /* CONFIG_HOTPLUG_CPU */ | 447 | #endif /* CONFIG_HOTPLUG_CPU */ |
448 | 448 | ||
449 | static int cpu_callback(struct notifier_block *nfb, | 449 | static int __devinit cpu_callback(struct notifier_block *nfb, |
450 | unsigned long action, | 450 | unsigned long action, |
451 | void *hcpu) | 451 | void *hcpu) |
452 | { | 452 | { |
@@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb, | |||
470 | break; | 470 | break; |
471 | #ifdef CONFIG_HOTPLUG_CPU | 471 | #ifdef CONFIG_HOTPLUG_CPU |
472 | case CPU_UP_CANCELED: | 472 | case CPU_UP_CANCELED: |
473 | if (!per_cpu(ksoftirqd, hotcpu)) | ||
474 | break; | ||
473 | /* Unbind so it can run. Fall thru. */ | 475 | /* Unbind so it can run. Fall thru. */ |
474 | kthread_bind(per_cpu(ksoftirqd, hotcpu), | 476 | kthread_bind(per_cpu(ksoftirqd, hotcpu), |
475 | any_online_cpu(cpu_online_map)); | 477 | any_online_cpu(cpu_online_map)); |
@@ -484,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb, | |||
484 | return NOTIFY_OK; | 486 | return NOTIFY_OK; |
485 | } | 487 | } |
486 | 488 | ||
487 | static struct notifier_block cpu_nfb = { | 489 | static struct notifier_block __devinitdata cpu_nfb = { |
488 | .notifier_call = cpu_callback | 490 | .notifier_call = cpu_callback |
489 | }; | 491 | }; |
490 | 492 | ||
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 14c7faf02909..6b76caa22981 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = { | |||
36 | 36 | ||
37 | void touch_softlockup_watchdog(void) | 37 | void touch_softlockup_watchdog(void) |
38 | { | 38 | { |
39 | per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; | 39 | __raw_get_cpu_var(touch_timestamp) = jiffies; |
40 | } | 40 | } |
41 | EXPORT_SYMBOL(touch_softlockup_watchdog); | 41 | EXPORT_SYMBOL(touch_softlockup_watchdog); |
42 | 42 | ||
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) | |||
104 | /* | 104 | /* |
105 | * Create/destroy watchdog threads as CPUs come and go: | 105 | * Create/destroy watchdog threads as CPUs come and go: |
106 | */ | 106 | */ |
107 | static int | 107 | static int __devinit |
108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
109 | { | 109 | { |
110 | int hotcpu = (unsigned long)hcpu; | 110 | int hotcpu = (unsigned long)hcpu; |
@@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
127 | break; | 127 | break; |
128 | #ifdef CONFIG_HOTPLUG_CPU | 128 | #ifdef CONFIG_HOTPLUG_CPU |
129 | case CPU_UP_CANCELED: | 129 | case CPU_UP_CANCELED: |
130 | if (!per_cpu(watchdog_task, hotcpu)) | ||
131 | break; | ||
130 | /* Unbind so it can run. Fall thru. */ | 132 | /* Unbind so it can run. Fall thru. */ |
131 | kthread_bind(per_cpu(watchdog_task, hotcpu), | 133 | kthread_bind(per_cpu(watchdog_task, hotcpu), |
132 | any_online_cpu(cpu_online_map)); | 134 | any_online_cpu(cpu_online_map)); |
@@ -140,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
140 | return NOTIFY_OK; | 142 | return NOTIFY_OK; |
141 | } | 143 | } |
142 | 144 | ||
143 | static struct notifier_block cpu_nfb = { | 145 | static struct notifier_block __devinitdata cpu_nfb = { |
144 | .notifier_call = cpu_callback | 146 | .notifier_call = cpu_callback |
145 | }; | 147 | }; |
146 | 148 | ||
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index d1b810782bc4..b31e54eadf56 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -9,7 +9,6 @@ | |||
9 | * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) | 9 | * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/config.h> | ||
13 | #include <linux/linkage.h> | 12 | #include <linux/linkage.h> |
14 | #include <linux/preempt.h> | 13 | #include <linux/preempt.h> |
15 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index dcfb5d731466..2c0aacc37c55 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/cpu.h> | 4 | #include <linux/cpu.h> |
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/syscalls.h> | 6 | #include <linux/syscalls.h> |
7 | #include <linux/kthread.h> | ||
7 | #include <asm/atomic.h> | 8 | #include <asm/atomic.h> |
8 | #include <asm/semaphore.h> | 9 | #include <asm/semaphore.h> |
9 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> |
@@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads; | |||
25 | static atomic_t stopmachine_thread_ack; | 26 | static atomic_t stopmachine_thread_ack; |
26 | static DECLARE_MUTEX(stopmachine_mutex); | 27 | static DECLARE_MUTEX(stopmachine_mutex); |
27 | 28 | ||
28 | static int stopmachine(void *cpu) | 29 | static int stopmachine(void *unused) |
29 | { | 30 | { |
30 | int irqs_disabled = 0; | 31 | int irqs_disabled = 0; |
31 | int prepared = 0; | 32 | int prepared = 0; |
32 | 33 | ||
33 | set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); | ||
34 | |||
35 | /* Ack: we are alive */ | 34 | /* Ack: we are alive */ |
36 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ | 35 | smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ |
37 | atomic_inc(&stopmachine_thread_ack); | 36 | atomic_inc(&stopmachine_thread_ack); |
@@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state) | |||
85 | 84 | ||
86 | static int stop_machine(void) | 85 | static int stop_machine(void) |
87 | { | 86 | { |
88 | int i, ret = 0; | 87 | int ret = 0; |
88 | unsigned int i; | ||
89 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 89 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
90 | 90 | ||
91 | /* One high-prio thread per cpu. We'll do this one. */ | 91 | /* One high-prio thread per cpu. We'll do this one. */ |
@@ -96,11 +96,16 @@ static int stop_machine(void) | |||
96 | stopmachine_state = STOPMACHINE_WAIT; | 96 | stopmachine_state = STOPMACHINE_WAIT; |
97 | 97 | ||
98 | for_each_online_cpu(i) { | 98 | for_each_online_cpu(i) { |
99 | struct task_struct *tsk; | ||
99 | if (i == raw_smp_processor_id()) | 100 | if (i == raw_smp_processor_id()) |
100 | continue; | 101 | continue; |
101 | ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); | 102 | tsk = kthread_create(stopmachine, NULL, "stopmachine"); |
102 | if (ret < 0) | 103 | if (IS_ERR(tsk)) { |
104 | ret = PTR_ERR(tsk); | ||
103 | break; | 105 | break; |
106 | } | ||
107 | kthread_bind(tsk, i); | ||
108 | wake_up_process(tsk); | ||
104 | stopmachine_num_threads++; | 109 | stopmachine_num_threads++; |
105 | } | 110 | } |
106 | 111 | ||
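stop_machine() now creates its per-CPU helpers with kthread_create(), pins each one with kthread_bind() and only then wakes it, instead of spawning a kernel_thread() that called set_cpus_allowed() on itself, so a helper can never run on the wrong CPU first. A loose userspace analogue of "fix the CPU before the thread starts" is the GNU pthread affinity attribute shown below; this is illustration only, not kernel API.

/* Loose userspace analogue of kthread_create() + kthread_bind(): decide
 * the thread's CPU before it ever runs.  Uses the GNU extension
 * pthread_attr_setaffinity_np(); build with -pthread. */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static void *worker(void *arg)
{
	(void)arg;
	printf("worker started on CPU %d\n", sched_getcpu());
	return NULL;
}

int main(void)
{
	pthread_t tid;
	pthread_attr_t attr;
	cpu_set_t cpus;
	int err;

	CPU_ZERO(&cpus);
	CPU_SET(0, &cpus);			/* assumes CPU 0 exists */

	pthread_attr_init(&attr);
	pthread_attr_setaffinity_np(&attr, sizeof(cpus), &cpus);

	err = pthread_create(&tid, &attr, worker, NULL);
	if (err) {
		fprintf(stderr, "pthread_create: %d\n", err);
		return 1;
	}
	pthread_join(tid, NULL);
	pthread_attr_destroy(&attr);
	return 0;
}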
diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936f..dbb3b9c7ea64 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -4,7 +4,6 @@ | |||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/config.h> | ||
8 | #include <linux/module.h> | 7 | #include <linux/module.h> |
9 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
10 | #include <linux/utsname.h> | 9 | #include <linux/utsname.h> |
@@ -13,7 +12,6 @@ | |||
13 | #include <linux/notifier.h> | 12 | #include <linux/notifier.h> |
14 | #include <linux/reboot.h> | 13 | #include <linux/reboot.h> |
15 | #include <linux/prctl.h> | 14 | #include <linux/prctl.h> |
16 | #include <linux/init.h> | ||
17 | #include <linux/highuid.h> | 15 | #include <linux/highuid.h> |
18 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
19 | #include <linux/kernel.h> | 17 | #include <linux/kernel.h> |
@@ -57,6 +55,12 @@ | |||
57 | #ifndef GET_FPEXC_CTL | 55 | #ifndef GET_FPEXC_CTL |
58 | # define GET_FPEXC_CTL(a,b) (-EINVAL) | 56 | # define GET_FPEXC_CTL(a,b) (-EINVAL) |
59 | #endif | 57 | #endif |
58 | #ifndef GET_ENDIAN | ||
59 | # define GET_ENDIAN(a,b) (-EINVAL) | ||
60 | #endif | ||
61 | #ifndef SET_ENDIAN | ||
62 | # define SET_ENDIAN(a,b) (-EINVAL) | ||
63 | #endif | ||
60 | 64 | ||
61 | /* | 65 | /* |
62 | * this is where the system-wide overflow UID and GID are defined, for | 66 | * this is where the system-wide overflow UID and GID are defined, for |
@@ -132,14 +136,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, | |||
132 | unsigned long val, void *v) | 136 | unsigned long val, void *v) |
133 | { | 137 | { |
134 | int ret = NOTIFY_DONE; | 138 | int ret = NOTIFY_DONE; |
135 | struct notifier_block *nb; | 139 | struct notifier_block *nb, *next_nb; |
136 | 140 | ||
137 | nb = rcu_dereference(*nl); | 141 | nb = rcu_dereference(*nl); |
138 | while (nb) { | 142 | while (nb) { |
143 | next_nb = rcu_dereference(nb->next); | ||
139 | ret = nb->notifier_call(nb, val, v); | 144 | ret = nb->notifier_call(nb, val, v); |
140 | if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) | 145 | if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) |
141 | break; | 146 | break; |
142 | nb = rcu_dereference(nb->next); | 147 | nb = next_nb; |
143 | } | 148 | } |
144 | return ret; | 149 | return ret; |
145 | } | 150 | } |
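notifier_call_chain() now fetches nb->next before invoking the callback, so a callback that unregisters and frees its own notifier_block no longer leaves the walker holding a dangling pointer. The standalone sketch below shows the same idiom with a made-up cb_node type and no RCU; it is not the kernel implementation.

/* Standalone sketch of the "fetch next before calling back" idiom added
 * to notifier_call_chain(): a callback may free its own node without
 * breaking the traversal. */
#include <stdlib.h>
#include <stdio.h>

struct cb_node {
	int (*fn)(struct cb_node *self, void *data);
	struct cb_node *next;
};

static int run_chain(struct cb_node *head, void *data)
{
	struct cb_node *nb = head, *next_nb;
	int ret = 0;

	while (nb) {
		next_nb = nb->next;	/* grab it before fn() may free nb */
		ret = nb->fn(nb, data);
		nb = next_nb;
	}
	return ret;
}

static int selfdestruct(struct cb_node *self, void *data)
{
	printf("callback %p saw \"%s\"\n", (void *)self, (char *)data);
	free(self);			/* node is gone once we return */
	return 0;
}

int main(void)
{
	struct cb_node *a = malloc(sizeof(*a));
	struct cb_node *b = malloc(sizeof(*b));

	a->fn = selfdestruct; a->next = b;
	b->fn = selfdestruct; b->next = NULL;

	run_chain(a, "hello");		/* safe: next pointer saved up front */
	return 0;
}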
@@ -583,7 +588,7 @@ void emergency_restart(void) | |||
583 | } | 588 | } |
584 | EXPORT_SYMBOL_GPL(emergency_restart); | 589 | EXPORT_SYMBOL_GPL(emergency_restart); |
585 | 590 | ||
586 | void kernel_restart_prepare(char *cmd) | 591 | static void kernel_restart_prepare(char *cmd) |
587 | { | 592 | { |
588 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 593 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
589 | system_state = SYSTEM_RESTART; | 594 | system_state = SYSTEM_RESTART; |
@@ -617,7 +622,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); | |||
617 | * Move into place and start executing a preloaded standalone | 622 | * Move into place and start executing a preloaded standalone |
618 | * executable. If nothing was preloaded return an error. | 623 | * executable. If nothing was preloaded return an error. |
619 | */ | 624 | */ |
620 | void kernel_kexec(void) | 625 | static void kernel_kexec(void) |
621 | { | 626 | { |
622 | #ifdef CONFIG_KEXEC | 627 | #ifdef CONFIG_KEXEC |
623 | struct kimage *image; | 628 | struct kimage *image; |
@@ -631,7 +636,6 @@ void kernel_kexec(void) | |||
631 | machine_kexec(image); | 636 | machine_kexec(image); |
632 | #endif | 637 | #endif |
633 | } | 638 | } |
634 | EXPORT_SYMBOL_GPL(kernel_kexec); | ||
635 | 639 | ||
636 | void kernel_shutdown_prepare(enum system_states state) | 640 | void kernel_shutdown_prepare(enum system_states state) |
637 | { | 641 | { |
@@ -1860,23 +1864,20 @@ out: | |||
1860 | * fields when reaping, so a sample either gets all the additions of a | 1864 | * fields when reaping, so a sample either gets all the additions of a |
1861 | * given child after it's reaped, or none so this sample is before reaping. | 1865 | * given child after it's reaped, or none so this sample is before reaping. |
1862 | * | 1866 | * |
1863 | * tasklist_lock locking optimisation: | 1867 | * Locking: |
1864 | * If we are current and single threaded, we do not need to take the tasklist | 1868 | * We need to take the siglock for CHILDREN, SELF and BOTH |
1865 | * lock or the siglock. No one else can take our signal_struct away, | 1869 | * for the cases current multithreaded, non-current single threaded |
1866 | * no one else can reap the children to update signal->c* counters, and | 1870 | * non-current multithreaded. Thread traversal is now safe with |
1867 | * no one else can race with the signal-> fields. | 1871 | * the siglock held. |
1868 | * If we do not take the tasklist_lock, the signal-> fields could be read | 1872 | * Strictly speaking, we do not need to take the siglock if we are current and |
1869 | * out of order while another thread was just exiting. So we place a | 1873 | * single threaded, as no one else can take our signal_struct away, no one |
1870 | * read memory barrier when we avoid the lock. On the writer side, | 1874 | * else can reap the children to update signal->c* counters, and no one else |
1871 | * write memory barrier is implied in __exit_signal as __exit_signal releases | 1875 | * can race with the signal-> fields. If we do not take any lock, the |
1872 | * the siglock spinlock after updating the signal-> fields. | 1876 | * signal-> fields could be read out of order while another thread was just |
1873 | * | 1877 | * exiting. So we should place a read memory barrier when we avoid the lock. |
1874 | * We don't really need the siglock when we access the non c* fields | 1878 | * On the writer side, write memory barrier is implied in __exit_signal |
1875 | * of the signal_struct (for RUSAGE_SELF) even in multithreaded | 1879 | * as __exit_signal releases the siglock spinlock after updating the signal-> |
1876 | * case, since we take the tasklist lock for read and the non c* signal-> | 1880 | * fields. But we don't do this yet to keep things simple. |
1877 | * fields are updated only in __exit_signal, which is called with | ||
1878 | * tasklist_lock taken for write, hence these two threads cannot execute | ||
1879 | * concurrently. | ||
1880 | * | 1881 | * |
1881 | */ | 1882 | */ |
1882 | 1883 | ||
@@ -1885,35 +1886,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1885 | struct task_struct *t; | 1886 | struct task_struct *t; |
1886 | unsigned long flags; | 1887 | unsigned long flags; |
1887 | cputime_t utime, stime; | 1888 | cputime_t utime, stime; |
1888 | int need_lock = 0; | ||
1889 | 1889 | ||
1890 | memset((char *) r, 0, sizeof *r); | 1890 | memset((char *) r, 0, sizeof *r); |
1891 | utime = stime = cputime_zero; | 1891 | utime = stime = cputime_zero; |
1892 | 1892 | ||
1893 | if (p != current || !thread_group_empty(p)) | 1893 | rcu_read_lock(); |
1894 | need_lock = 1; | 1894 | if (!lock_task_sighand(p, &flags)) { |
1895 | 1895 | rcu_read_unlock(); | |
1896 | if (need_lock) { | 1896 | return; |
1897 | read_lock(&tasklist_lock); | 1897 | } |
1898 | if (unlikely(!p->signal)) { | ||
1899 | read_unlock(&tasklist_lock); | ||
1900 | return; | ||
1901 | } | ||
1902 | } else | ||
1903 | /* See locking comments above */ | ||
1904 | smp_rmb(); | ||
1905 | 1898 | ||
1906 | switch (who) { | 1899 | switch (who) { |
1907 | case RUSAGE_BOTH: | 1900 | case RUSAGE_BOTH: |
1908 | case RUSAGE_CHILDREN: | 1901 | case RUSAGE_CHILDREN: |
1909 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1910 | utime = p->signal->cutime; | 1902 | utime = p->signal->cutime; |
1911 | stime = p->signal->cstime; | 1903 | stime = p->signal->cstime; |
1912 | r->ru_nvcsw = p->signal->cnvcsw; | 1904 | r->ru_nvcsw = p->signal->cnvcsw; |
1913 | r->ru_nivcsw = p->signal->cnivcsw; | 1905 | r->ru_nivcsw = p->signal->cnivcsw; |
1914 | r->ru_minflt = p->signal->cmin_flt; | 1906 | r->ru_minflt = p->signal->cmin_flt; |
1915 | r->ru_majflt = p->signal->cmaj_flt; | 1907 | r->ru_majflt = p->signal->cmaj_flt; |
1916 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1917 | 1908 | ||
1918 | if (who == RUSAGE_CHILDREN) | 1909 | if (who == RUSAGE_CHILDREN) |
1919 | break; | 1910 | break; |
@@ -1941,8 +1932,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1941 | BUG(); | 1932 | BUG(); |
1942 | } | 1933 | } |
1943 | 1934 | ||
1944 | if (need_lock) | 1935 | unlock_task_sighand(p, &flags); |
1945 | read_unlock(&tasklist_lock); | 1936 | rcu_read_unlock(); |
1937 | |||
1946 | cputime_to_timeval(utime, &r->ru_utime); | 1938 | cputime_to_timeval(utime, &r->ru_utime); |
1947 | cputime_to_timeval(stime, &r->ru_stime); | 1939 | cputime_to_timeval(stime, &r->ru_stime); |
1948 | } | 1940 | } |
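The rewritten k_getrusage() replaces the tasklist_lock/smp_rmb() dance with rcu_read_lock() plus lock_task_sighand(), which fails cleanly once the target task has already released its sighand. A hedged kernel-style sketch of that pattern for any reader of another task's signal_struct counters (illustrative helper, not part of the patch):

#include <linux/sched.h>
#include <linux/rcupdate.h>

/* Illustrative only: read one accumulated child-fault counter safely. */
static unsigned long read_cmin_flt(struct task_struct *p)
{
	unsigned long flags, val = 0;

	rcu_read_lock();
	if (lock_task_sighand(p, &flags)) {
		val = p->signal->cmin_flt;	/* siglock held, counters stable */
		unlock_task_sighand(p, &flags);
	}
	rcu_read_unlock();
	return val;
}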
@@ -2057,6 +2049,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
2057 | return -EFAULT; | 2049 | return -EFAULT; |
2058 | return 0; | 2050 | return 0; |
2059 | } | 2051 | } |
2052 | case PR_GET_ENDIAN: | ||
2053 | error = GET_ENDIAN(current, arg2); | ||
2054 | break; | ||
2055 | case PR_SET_ENDIAN: | ||
2056 | error = SET_ENDIAN(current, arg2); | ||
2057 | break; | ||
2058 | |||
2060 | default: | 2059 | default: |
2061 | error = -EINVAL; | 2060 | error = -EINVAL; |
2062 | break; | 2061 | break; |
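PR_GET_ENDIAN and PR_SET_ENDIAN simply forward to the architecture's GET_ENDIAN/SET_ENDIAN hooks and return -EINVAL everywhere else. A hedged user-space sketch, assuming headers new enough to define PR_GET_ENDIAN and an architecture whose GET_ENDIAN writes the current mode through the pointer passed as arg2:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	unsigned int mode = 0;

	if (prctl(PR_GET_ENDIAN, (unsigned long)&mode, 0, 0, 0) == 0)
		printf("current endian mode: %u\n", mode);
	else
		perror("PR_GET_ENDIAN (arch may not implement it)");
	return 0;
}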
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f1..6991bece67e8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); | |||
87 | cond_syscall(sys_inotify_add_watch); | 87 | cond_syscall(sys_inotify_add_watch); |
88 | cond_syscall(sys_inotify_rm_watch); | 88 | cond_syscall(sys_inotify_rm_watch); |
89 | cond_syscall(sys_migrate_pages); | 89 | cond_syscall(sys_migrate_pages); |
90 | cond_syscall(sys_move_pages); | ||
90 | cond_syscall(sys_chown16); | 91 | cond_syscall(sys_chown16); |
91 | cond_syscall(sys_fchown16); | 92 | cond_syscall(sys_fchown16); |
92 | cond_syscall(sys_getegid16); | 93 | cond_syscall(sys_getegid16); |
@@ -132,3 +133,4 @@ cond_syscall(sys_mincore); | |||
132 | cond_syscall(sys_madvise); | 133 | cond_syscall(sys_madvise); |
133 | cond_syscall(sys_mremap); | 134 | cond_syscall(sys_mremap); |
134 | cond_syscall(sys_remap_file_pages); | 135 | cond_syscall(sys_remap_file_pages); |
136 | cond_syscall(compat_sys_move_pages); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e82726faeeff..99a58f279077 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -18,7 +18,6 @@ | |||
18 | * Removed it and replaced it with older style, 03/23/00, Bill Wendling | 18 | * Removed it and replaced it with older style, 03/23/00, Bill Wendling |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/config.h> | ||
22 | #include <linux/module.h> | 21 | #include <linux/module.h> |
23 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
24 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
@@ -59,6 +58,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, | |||
59 | extern int C_A_D; | 58 | extern int C_A_D; |
60 | extern int sysctl_overcommit_memory; | 59 | extern int sysctl_overcommit_memory; |
61 | extern int sysctl_overcommit_ratio; | 60 | extern int sysctl_overcommit_ratio; |
61 | extern int sysctl_panic_on_oom; | ||
62 | extern int max_threads; | 62 | extern int max_threads; |
63 | extern int sysrq_enabled; | 63 | extern int sysrq_enabled; |
64 | extern int core_uses_pid; | 64 | extern int core_uses_pid; |
@@ -72,6 +72,7 @@ extern int printk_ratelimit_burst; | |||
72 | extern int pid_max_min, pid_max_max; | 72 | extern int pid_max_min, pid_max_max; |
73 | extern int sysctl_drop_caches; | 73 | extern int sysctl_drop_caches; |
74 | extern int percpu_pagelist_fraction; | 74 | extern int percpu_pagelist_fraction; |
75 | extern int compat_log; | ||
75 | 76 | ||
76 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 77 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
77 | int unknown_nmi_panic; | 78 | int unknown_nmi_panic; |
@@ -131,6 +132,10 @@ extern int acct_parm[]; | |||
131 | extern int no_unaligned_warning; | 132 | extern int no_unaligned_warning; |
132 | #endif | 133 | #endif |
133 | 134 | ||
135 | #ifdef CONFIG_RT_MUTEXES | ||
136 | extern int max_lock_depth; | ||
137 | #endif | ||
138 | |||
134 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, | 139 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, |
135 | ctl_table *, void **); | 140 | ctl_table *, void **); |
136 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, | 141 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, |
@@ -142,7 +147,6 @@ static struct ctl_table_header root_table_header = | |||
142 | 147 | ||
143 | static ctl_table kern_table[]; | 148 | static ctl_table kern_table[]; |
144 | static ctl_table vm_table[]; | 149 | static ctl_table vm_table[]; |
145 | static ctl_table proc_table[]; | ||
146 | static ctl_table fs_table[]; | 150 | static ctl_table fs_table[]; |
147 | static ctl_table debug_table[]; | 151 | static ctl_table debug_table[]; |
148 | static ctl_table dev_table[]; | 152 | static ctl_table dev_table[]; |
@@ -150,7 +154,7 @@ extern ctl_table random_table[]; | |||
150 | #ifdef CONFIG_UNIX98_PTYS | 154 | #ifdef CONFIG_UNIX98_PTYS |
151 | extern ctl_table pty_table[]; | 155 | extern ctl_table pty_table[]; |
152 | #endif | 156 | #endif |
153 | #ifdef CONFIG_INOTIFY | 157 | #ifdef CONFIG_INOTIFY_USER |
154 | extern ctl_table inotify_table[]; | 158 | extern ctl_table inotify_table[]; |
155 | #endif | 159 | #endif |
156 | 160 | ||
@@ -202,12 +206,6 @@ static ctl_table root_table[] = { | |||
202 | }, | 206 | }, |
203 | #endif | 207 | #endif |
204 | { | 208 | { |
205 | .ctl_name = CTL_PROC, | ||
206 | .procname = "proc", | ||
207 | .mode = 0555, | ||
208 | .child = proc_table, | ||
209 | }, | ||
210 | { | ||
211 | .ctl_name = CTL_FS, | 209 | .ctl_name = CTL_FS, |
212 | .procname = "fs", | 210 | .procname = "fs", |
213 | .mode = 0555, | 211 | .mode = 0555, |
@@ -398,7 +396,7 @@ static ctl_table kern_table[] = { | |||
398 | .strategy = &sysctl_string, | 396 | .strategy = &sysctl_string, |
399 | }, | 397 | }, |
400 | #endif | 398 | #endif |
401 | #ifdef CONFIG_HOTPLUG | 399 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
402 | { | 400 | { |
403 | .ctl_name = KERN_HOTPLUG, | 401 | .ctl_name = KERN_HOTPLUG, |
404 | .procname = "hotplug", | 402 | .procname = "hotplug", |
@@ -683,6 +681,27 @@ static ctl_table kern_table[] = { | |||
683 | .proc_handler = &proc_dointvec, | 681 | .proc_handler = &proc_dointvec, |
684 | }, | 682 | }, |
685 | #endif | 683 | #endif |
684 | #ifdef CONFIG_COMPAT | ||
685 | { | ||
686 | .ctl_name = KERN_COMPAT_LOG, | ||
687 | .procname = "compat-log", | ||
688 | .data = &compat_log, | ||
689 | .maxlen = sizeof (int), | ||
690 | .mode = 0644, | ||
691 | .proc_handler = &proc_dointvec, | ||
692 | }, | ||
693 | #endif | ||
694 | #ifdef CONFIG_RT_MUTEXES | ||
695 | { | ||
696 | .ctl_name = KERN_MAX_LOCK_DEPTH, | ||
697 | .procname = "max_lock_depth", | ||
698 | .data = &max_lock_depth, | ||
699 | .maxlen = sizeof(int), | ||
700 | .mode = 0644, | ||
701 | .proc_handler = &proc_dointvec, | ||
702 | }, | ||
703 | #endif | ||
704 | |||
686 | { .ctl_name = 0 } | 705 | { .ctl_name = 0 } |
687 | }; | 706 | }; |
688 | 707 | ||
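Both additions are plain integer sysctls rooted in kern_table, so they surface as /proc/sys/kernel/compat-log (with CONFIG_COMPAT) and /proc/sys/kernel/max_lock_depth (with CONFIG_RT_MUTEXES), served by the usual proc_dointvec handler. A small hedged sketch that just reads one of them back:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/max_lock_depth", "r");
	int depth = 0;

	if (!f) {
		perror("/proc/sys/kernel/max_lock_depth");
		return 1;
	}
	if (fscanf(f, "%d", &depth) == 1)
		printf("max_lock_depth = %d\n", depth);
	fclose(f);
	return 0;
}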
@@ -702,6 +721,14 @@ static ctl_table vm_table[] = { | |||
702 | .proc_handler = &proc_dointvec, | 721 | .proc_handler = &proc_dointvec, |
703 | }, | 722 | }, |
704 | { | 723 | { |
724 | .ctl_name = VM_PANIC_ON_OOM, | ||
725 | .procname = "panic_on_oom", | ||
726 | .data = &sysctl_panic_on_oom, | ||
727 | .maxlen = sizeof(sysctl_panic_on_oom), | ||
728 | .mode = 0644, | ||
729 | .proc_handler = &proc_dointvec, | ||
730 | }, | ||
731 | { | ||
705 | .ctl_name = VM_OVERCOMMIT_RATIO, | 732 | .ctl_name = VM_OVERCOMMIT_RATIO, |
706 | .procname = "overcommit_ratio", | 733 | .procname = "overcommit_ratio", |
707 | .data = &sysctl_overcommit_ratio, | 734 | .data = &sysctl_overcommit_ratio, |
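The new vm.panic_on_oom entry follows the same proc_dointvec pattern, so writing 1 to /proc/sys/vm/panic_on_oom (for example: echo 1 > /proc/sys/vm/panic_on_oom, as root) should make the kernel panic on out-of-memory instead of invoking the OOM killer, with 0 restoring the default; the precise semantics live in the mm side of this merge, so treat this as a summary rather than a specification.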
@@ -905,23 +932,22 @@ static ctl_table vm_table[] = { | |||
905 | .strategy = &sysctl_intvec, | 932 | .strategy = &sysctl_intvec, |
906 | .extra1 = &zero, | 933 | .extra1 = &zero, |
907 | }, | 934 | }, |
935 | #endif | ||
936 | #ifdef CONFIG_X86_32 | ||
908 | { | 937 | { |
909 | .ctl_name = VM_ZONE_RECLAIM_INTERVAL, | 938 | .ctl_name = VM_VDSO_ENABLED, |
910 | .procname = "zone_reclaim_interval", | 939 | .procname = "vdso_enabled", |
911 | .data = &zone_reclaim_interval, | 940 | .data = &vdso_enabled, |
912 | .maxlen = sizeof(zone_reclaim_interval), | 941 | .maxlen = sizeof(vdso_enabled), |
913 | .mode = 0644, | 942 | .mode = 0644, |
914 | .proc_handler = &proc_dointvec_jiffies, | 943 | .proc_handler = &proc_dointvec, |
915 | .strategy = &sysctl_jiffies, | 944 | .strategy = &sysctl_intvec, |
945 | .extra1 = &zero, | ||
916 | }, | 946 | }, |
917 | #endif | 947 | #endif |
918 | { .ctl_name = 0 } | 948 | { .ctl_name = 0 } |
919 | }; | 949 | }; |
920 | 950 | ||
921 | static ctl_table proc_table[] = { | ||
922 | { .ctl_name = 0 } | ||
923 | }; | ||
924 | |||
925 | static ctl_table fs_table[] = { | 951 | static ctl_table fs_table[] = { |
926 | { | 952 | { |
927 | .ctl_name = FS_NRINODE, | 953 | .ctl_name = FS_NRINODE, |
@@ -1028,7 +1054,7 @@ static ctl_table fs_table[] = { | |||
1028 | .mode = 0644, | 1054 | .mode = 0644, |
1029 | .proc_handler = &proc_doulongvec_minmax, | 1055 | .proc_handler = &proc_doulongvec_minmax, |
1030 | }, | 1056 | }, |
1031 | #ifdef CONFIG_INOTIFY | 1057 | #ifdef CONFIG_INOTIFY_USER |
1032 | { | 1058 | { |
1033 | .ctl_name = FS_INOTIFY, | 1059 | .ctl_name = FS_INOTIFY, |
1034 | .procname = "inotify", | 1060 | .procname = "inotify", |
diff --git a/kernel/time.c b/kernel/time.c index b00ddc71cedb..5bd489747643 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
523 | 523 | ||
524 | 524 | ||
525 | #else | 525 | #else |
526 | #ifndef CONFIG_GENERIC_TIME | ||
526 | /* | 527 | /* |
527 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval | 528 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval |
528 | * and therefore only yields usec accuracy | 529 | * and therefore only yields usec accuracy |
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv) | |||
537 | } | 538 | } |
538 | EXPORT_SYMBOL_GPL(getnstimeofday); | 539 | EXPORT_SYMBOL_GPL(getnstimeofday); |
539 | #endif | 540 | #endif |
541 | #endif | ||
540 | 542 | ||
541 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 543 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
542 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 544 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 000000000000..e1dfd8e86cce --- /dev/null +++ b/kernel/time/Makefile | |||
@@ -0,0 +1 @@ | |||
obj-y += clocksource.o jiffies.o | |||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 000000000000..74eca5939bd9 --- /dev/null +++ b/kernel/time/clocksource.c | |||
@@ -0,0 +1,349 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/clocksource.c | ||
3 | * | ||
4 | * This file contains the functions which manage clocksource drivers. | ||
5 | * | ||
6 | * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | * | ||
22 | * TODO WishList: | ||
23 | * o Allow clocksource drivers to be unregistered | ||
24 | * o get rid of clocksource_jiffies extern | ||
25 | */ | ||
26 | |||
27 | #include <linux/clocksource.h> | ||
28 | #include <linux/sysdev.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <linux/module.h> | ||
31 | |||
32 | /* XXX - Would like a better way for initializing curr_clocksource */ | ||
33 | extern struct clocksource clocksource_jiffies; | ||
34 | |||
35 | /*[Clocksource internal variables]--------- | ||
36 | * curr_clocksource: | ||
37 | * currently selected clocksource. Initialized to clocksource_jiffies. | ||
38 | * next_clocksource: | ||
39 | * pending next selected clocksource. | ||
40 | * clocksource_list: | ||
41 | * linked list with the registered clocksources | ||
42 | * clocksource_lock: | ||
43 | * protects manipulations to curr_clocksource and next_clocksource | ||
44 | * and the clocksource_list | ||
45 | * override_name: | ||
46 | * Name of the user-specified clocksource. | ||
47 | */ | ||
48 | static struct clocksource *curr_clocksource = &clocksource_jiffies; | ||
49 | static struct clocksource *next_clocksource; | ||
50 | static LIST_HEAD(clocksource_list); | ||
51 | static DEFINE_SPINLOCK(clocksource_lock); | ||
52 | static char override_name[32]; | ||
53 | static int finished_booting; | ||
54 | |||
55 | /* clocksource_done_booting - Called near the end of bootup | ||
56 | * | ||
57 | * Hack to avoid lots of clocksource churn at boot time | ||
58 | */ | ||
59 | static int __init clocksource_done_booting(void) | ||
60 | { | ||
61 | finished_booting = 1; | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | late_initcall(clocksource_done_booting); | ||
66 | |||
67 | /** | ||
68 | * clocksource_get_next - Returns the selected clocksource | ||
69 | * | ||
70 | */ | ||
71 | struct clocksource *clocksource_get_next(void) | ||
72 | { | ||
73 | unsigned long flags; | ||
74 | |||
75 | spin_lock_irqsave(&clocksource_lock, flags); | ||
76 | if (next_clocksource && finished_booting) { | ||
77 | curr_clocksource = next_clocksource; | ||
78 | next_clocksource = NULL; | ||
79 | } | ||
80 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
81 | |||
82 | return curr_clocksource; | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * select_clocksource - Finds the best registered clocksource. | ||
87 | * | ||
88 | * Private function. Must hold clocksource_lock when called. | ||
89 | * | ||
90 | * Looks through the list of registered clocksources, returning | ||
91 | * the one with the highest rating value. If there is a clocksource | ||
92 | * name that matches the override string, it returns that clocksource. | ||
93 | */ | ||
94 | static struct clocksource *select_clocksource(void) | ||
95 | { | ||
96 | struct clocksource *best = NULL; | ||
97 | struct list_head *tmp; | ||
98 | |||
99 | list_for_each(tmp, &clocksource_list) { | ||
100 | struct clocksource *src; | ||
101 | |||
102 | src = list_entry(tmp, struct clocksource, list); | ||
103 | if (!best) | ||
104 | best = src; | ||
105 | |||
106 | /* check for override: */ | ||
107 | if (strlen(src->name) == strlen(override_name) && | ||
108 | !strcmp(src->name, override_name)) { | ||
109 | best = src; | ||
110 | break; | ||
111 | } | ||
112 | /* pick the highest rating: */ | ||
113 | if (src->rating > best->rating) | ||
114 | best = src; | ||
115 | } | ||
116 | |||
117 | return best; | ||
118 | } | ||
119 | |||
120 | /** | ||
121 | * is_registered_source - Checks if clocksource is registered | ||
122 | * @c: pointer to a clocksource | ||
123 | * | ||
124 | * Private helper function. Must hold clocksource_lock when called. | ||
125 | * | ||
126 | * Returns one if the clocksource is already registered, zero otherwise. | ||
127 | */ | ||
128 | static int is_registered_source(struct clocksource *c) | ||
129 | { | ||
130 | int len = strlen(c->name); | ||
131 | struct list_head *tmp; | ||
132 | |||
133 | list_for_each(tmp, &clocksource_list) { | ||
134 | struct clocksource *src; | ||
135 | |||
136 | src = list_entry(tmp, struct clocksource, list); | ||
137 | if (strlen(src->name) == len && !strcmp(src->name, c->name)) | ||
138 | return 1; | ||
139 | } | ||
140 | |||
141 | return 0; | ||
142 | } | ||
143 | |||
144 | /** | ||
145 | * clocksource_register - Used to install new clocksources | ||
146 | * @t: clocksource to be registered | ||
147 | * | ||
148 | * Returns -EBUSY if registration fails, zero otherwise. | ||
149 | */ | ||
150 | int clocksource_register(struct clocksource *c) | ||
151 | { | ||
152 | int ret = 0; | ||
153 | unsigned long flags; | ||
154 | |||
155 | spin_lock_irqsave(&clocksource_lock, flags); | ||
156 | /* check if clocksource is already registered */ | ||
157 | if (is_registered_source(c)) { | ||
158 | printk("clocksource_register: Cannot register %s. " | ||
159 | "Already registered!\n", c->name); | ||
160 | ret = -EBUSY; | ||
161 | } else { | ||
162 | /* register it */ | ||
163 | list_add(&c->list, &clocksource_list); | ||
164 | /* scan the registered clocksources, and pick the best one */ | ||
165 | next_clocksource = select_clocksource(); | ||
166 | } | ||
167 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
168 | return ret; | ||
169 | } | ||
170 | EXPORT_SYMBOL(clocksource_register); | ||
171 | |||
172 | /** | ||
173 | * clocksource_reselect - Rescan list for next clocksource | ||
174 | * | ||
175 | * A quick helper function to be used if a clocksource changes its | ||
176 | * rating. Forces the clocksource list to be re-scanned for the best | ||
177 | * clocksource. | ||
178 | */ | ||
179 | void clocksource_reselect(void) | ||
180 | { | ||
181 | unsigned long flags; | ||
182 | |||
183 | spin_lock_irqsave(&clocksource_lock, flags); | ||
184 | next_clocksource = select_clocksource(); | ||
185 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
186 | } | ||
187 | EXPORT_SYMBOL(clocksource_reselect); | ||
188 | |||
189 | /** | ||
190 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | ||
191 | * @dev: unused | ||
192 | * @buf: char buffer to be filled with clocksource list | ||
193 | * | ||
194 | * Provides sysfs interface for listing current clocksource. | ||
195 | */ | ||
196 | static ssize_t | ||
197 | sysfs_show_current_clocksources(struct sys_device *dev, char *buf) | ||
198 | { | ||
199 | char *curr = buf; | ||
200 | |||
201 | spin_lock_irq(&clocksource_lock); | ||
202 | curr += sprintf(curr, "%s ", curr_clocksource->name); | ||
203 | spin_unlock_irq(&clocksource_lock); | ||
204 | |||
205 | curr += sprintf(curr, "\n"); | ||
206 | |||
207 | return curr - buf; | ||
208 | } | ||
209 | |||
210 | /** | ||
211 | * sysfs_override_clocksource - interface for manually overriding clocksource | ||
212 | * @dev: unused | ||
213 | * @buf: name of override clocksource | ||
214 | * @count: length of buffer | ||
215 | * | ||
216 | * Takes input from sysfs interface for manually overriding the default | ||
217 | * clocksource selection. | ||
218 | */ | ||
219 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, | ||
220 | const char *buf, size_t count) | ||
221 | { | ||
222 | size_t ret = count; | ||
223 | /* strings from sysfs write are not 0 terminated! */ | ||
224 | if (count >= sizeof(override_name)) | ||
225 | return -EINVAL; | ||
226 | |||
227 | /* strip off \n: */ | ||
228 | if (buf[count-1] == '\n') | ||
229 | count--; | ||
230 | if (count < 1) | ||
231 | return -EINVAL; | ||
232 | |||
233 | spin_lock_irq(&clocksource_lock); | ||
234 | |||
235 | /* copy the name given: */ | ||
236 | memcpy(override_name, buf, count); | ||
237 | override_name[count] = 0; | ||
238 | |||
239 | /* try to select it: */ | ||
240 | next_clocksource = select_clocksource(); | ||
241 | |||
242 | spin_unlock_irq(&clocksource_lock); | ||
243 | |||
244 | return ret; | ||
245 | } | ||
246 | |||
247 | /** | ||
248 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource | ||
249 | * @dev: unused | ||
250 | * @buf: char buffer to be filled with clocksource list | ||
251 | * | ||
252 | * Provides sysfs interface for listing registered clocksources | ||
253 | */ | ||
254 | static ssize_t | ||
255 | sysfs_show_available_clocksources(struct sys_device *dev, char *buf) | ||
256 | { | ||
257 | struct list_head *tmp; | ||
258 | char *curr = buf; | ||
259 | |||
260 | spin_lock_irq(&clocksource_lock); | ||
261 | list_for_each(tmp, &clocksource_list) { | ||
262 | struct clocksource *src; | ||
263 | |||
264 | src = list_entry(tmp, struct clocksource, list); | ||
265 | curr += sprintf(curr, "%s ", src->name); | ||
266 | } | ||
267 | spin_unlock_irq(&clocksource_lock); | ||
268 | |||
269 | curr += sprintf(curr, "\n"); | ||
270 | |||
271 | return curr - buf; | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * Sysfs setup bits: | ||
276 | */ | ||
277 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, | ||
278 | sysfs_override_clocksource); | ||
279 | |||
280 | static SYSDEV_ATTR(available_clocksource, 0600, | ||
281 | sysfs_show_available_clocksources, NULL); | ||
282 | |||
283 | static struct sysdev_class clocksource_sysclass = { | ||
284 | set_kset_name("clocksource"), | ||
285 | }; | ||
286 | |||
287 | static struct sys_device device_clocksource = { | ||
288 | .id = 0, | ||
289 | .cls = &clocksource_sysclass, | ||
290 | }; | ||
291 | |||
292 | static int __init init_clocksource_sysfs(void) | ||
293 | { | ||
294 | int error = sysdev_class_register(&clocksource_sysclass); | ||
295 | |||
296 | if (!error) | ||
297 | error = sysdev_register(&device_clocksource); | ||
298 | if (!error) | ||
299 | error = sysdev_create_file( | ||
300 | &device_clocksource, | ||
301 | &attr_current_clocksource); | ||
302 | if (!error) | ||
303 | error = sysdev_create_file( | ||
304 | &device_clocksource, | ||
305 | &attr_available_clocksource); | ||
306 | return error; | ||
307 | } | ||
308 | |||
309 | device_initcall(init_clocksource_sysfs); | ||
310 | |||
311 | /** | ||
312 | * boot_override_clocksource - boot clock override | ||
313 | * @str: override name | ||
314 | * | ||
315 | * Takes a clocksource= boot argument and uses it | ||
316 | * as the clocksource override name. | ||
317 | */ | ||
318 | static int __init boot_override_clocksource(char* str) | ||
319 | { | ||
320 | unsigned long flags; | ||
321 | spin_lock_irqsave(&clocksource_lock, flags); | ||
322 | if (str) | ||
323 | strlcpy(override_name, str, sizeof(override_name)); | ||
324 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
325 | return 1; | ||
326 | } | ||
327 | |||
328 | __setup("clocksource=", boot_override_clocksource); | ||
329 | |||
330 | /** | ||
331 | * boot_override_clock - Compatibility layer for deprecated boot option | ||
332 | * @str: override name | ||
333 | * | ||
334 | * DEPRECATED! Takes a clock= boot argument and uses it | ||
335 | * as the clocksource override name | ||
336 | */ | ||
337 | static int __init boot_override_clock(char* str) | ||
338 | { | ||
339 | if (!strcmp(str, "pmtmr")) { | ||
340 | printk("Warning: clock=pmtmr is deprecated. " | ||
341 | "Use clocksource=acpi_pm.\n"); | ||
342 | return boot_override_clocksource("acpi_pm"); | ||
343 | } | ||
344 | printk("Warning! clock= boot option is deprecated. " | ||
345 | "Use clocksource=xyz\n"); | ||
346 | return boot_override_clocksource(str); | ||
347 | } | ||
348 | |||
349 | __setup("clock=", boot_override_clock); | ||
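Taken together, the new file gives drivers a small API: clocksource_register() adds a source, select_clocksource() picks the highest-rated one (or whatever override_name asks for), and clocksource_reselect() re-runs that choice after a rating change. A hedged driver-side sketch of demoting a source that turned out to be unreliable (illustrative function, not from this patch):

#include <linux/clocksource.h>

/* Illustrative: the hardware counter was found unstable, stop preferring it. */
static void my_clocksource_demote(struct clocksource *cs)
{
	cs->rating = 0;			/* lowest possible rating */
	clocksource_reselect();		/* let select_clocksource() pick anew */
}

The same override path is reachable from outside the kernel: clocksource=<name> on the command line fills override_name at boot, and the current_clocksource attribute registered under the "clocksource" sysdev class accepts a name at run time.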
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 000000000000..126bb30c4afe --- /dev/null +++ b/kernel/time/jiffies.c | |||
@@ -0,0 +1,73 @@ | |||
1 | /*********************************************************************** | ||
2 | * linux/kernel/time/jiffies.c | ||
3 | * | ||
4 | * This file contains the jiffies based clocksource. | ||
5 | * | ||
6 | * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | * | ||
22 | ************************************************************************/ | ||
23 | #include <linux/clocksource.h> | ||
24 | #include <linux/jiffies.h> | ||
25 | #include <linux/init.h> | ||
26 | |||
27 | /* The Jiffies based clocksource is the lowest common | ||
28 | * denominator clock source which should function on | ||
29 | * all systems. It has the same coarse resolution as | ||
30 | * the timer interrupt frequency HZ and it suffers | ||
31 | * inaccuracies caused by missed or lost timer | ||
32 | * interrupts and the inability for the timer | ||
33 | * interrupt hardware to accurately tick at the | ||
34 | * requested HZ value. It is also not recommended | ||
35 | * for "tick-less" systems. | ||
36 | */ | ||
37 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) | ||
38 | |||
39 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier | ||
40 | * conversion, the .shift value could be zero. However | ||
41 | * this would make NTP adjustments impossible as they are | ||
42 | * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to | ||
43 | * shift both the numerator and denominator the same | ||
44 | * amount, and give ntp adjustments in units of 1/2^8 | ||
45 | * | ||
46 | * The value 8 is somewhat carefully chosen, as anything | ||
47 | * larger can result in overflows. NSEC_PER_JIFFY grows as | ||
48 | * HZ shrinks, so values greater than 8 overflow 32 bits when | ||
49 | * HZ=100. | ||
50 | */ | ||
51 | #define JIFFIES_SHIFT 8 | ||
52 | |||
53 | static cycle_t jiffies_read(void) | ||
54 | { | ||
55 | return (cycle_t) jiffies; | ||
56 | } | ||
57 | |||
58 | struct clocksource clocksource_jiffies = { | ||
59 | .name = "jiffies", | ||
60 | .rating = 0, /* lowest rating*/ | ||
61 | .read = jiffies_read, | ||
62 | .mask = 0xffffffff, /*32bits*/ | ||
63 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | ||
64 | .shift = JIFFIES_SHIFT, | ||
65 | .is_continuous = 0, /* tick based, not free running */ | ||
66 | }; | ||
67 | |||
68 | static int __init init_jiffies_clocksource(void) | ||
69 | { | ||
70 | return clocksource_register(&clocksource_jiffies); | ||
71 | } | ||
72 | |||
73 | module_init(init_jiffies_clocksource); | ||
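To put rough numbers on the JIFFIES_SHIFT comment: with HZ=100 a jiffy is about 10,000,000 ns, so the .mult value NSEC_PER_JIFFY << 8 comes to roughly 2,560,000,000, which still fits in 32 bits (2^32 is about 4,294,967,296); a shift of 9 would give about 5,120,000,000 and overflow, which is why 8 is described as carefully chosen. The figures are approximate because ACTHZ is a rounded, scaled version of HZ.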
diff --git a/kernel/timer.c b/kernel/timer.c index 9e49deed468c..5a8960253063 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | |||
146 | void fastcall init_timer(struct timer_list *timer) | 146 | void fastcall init_timer(struct timer_list *timer) |
147 | { | 147 | { |
148 | timer->entry.next = NULL; | 148 | timer->entry.next = NULL; |
149 | timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); | 149 | timer->base = __raw_get_cpu_var(tvec_bases); |
150 | } | 150 | } |
151 | EXPORT_SYMBOL(init_timer); | 151 | EXPORT_SYMBOL(init_timer); |
152 | 152 | ||
@@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync); | |||
383 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) | 383 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) |
384 | { | 384 | { |
385 | /* cascade all the timers from tv up one level */ | 385 | /* cascade all the timers from tv up one level */ |
386 | struct list_head *head, *curr; | 386 | struct timer_list *timer, *tmp; |
387 | struct list_head tv_list; | ||
388 | |||
389 | list_replace_init(tv->vec + index, &tv_list); | ||
387 | 390 | ||
388 | head = tv->vec + index; | ||
389 | curr = head->next; | ||
390 | /* | 391 | /* |
391 | * We are removing _all_ timers from the list, so we don't have to | 392 | * We are removing _all_ timers from the list, so we |
392 | * detach them individually, just clear the list afterwards. | 393 | * don't have to detach them individually. |
393 | */ | 394 | */ |
394 | while (curr != head) { | 395 | list_for_each_entry_safe(timer, tmp, &tv_list, entry) { |
395 | struct timer_list *tmp; | 396 | BUG_ON(timer->base != base); |
396 | 397 | internal_add_timer(base, timer); | |
397 | tmp = list_entry(curr, struct timer_list, entry); | ||
398 | BUG_ON(tmp->base != base); | ||
399 | curr = curr->next; | ||
400 | internal_add_timer(base, tmp); | ||
401 | } | 398 | } |
402 | INIT_LIST_HEAD(head); | ||
403 | 399 | ||
404 | return index; | 400 | return index; |
405 | } | 401 | } |
@@ -419,10 +415,10 @@ static inline void __run_timers(tvec_base_t *base) | |||
419 | 415 | ||
420 | spin_lock_irq(&base->lock); | 416 | spin_lock_irq(&base->lock); |
421 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 417 | while (time_after_eq(jiffies, base->timer_jiffies)) { |
422 | struct list_head work_list = LIST_HEAD_INIT(work_list); | 418 | struct list_head work_list; |
423 | struct list_head *head = &work_list; | 419 | struct list_head *head = &work_list; |
424 | int index = base->timer_jiffies & TVR_MASK; | 420 | int index = base->timer_jiffies & TVR_MASK; |
425 | 421 | ||
426 | /* | 422 | /* |
427 | * Cascade timers: | 423 | * Cascade timers: |
428 | */ | 424 | */ |
@@ -431,8 +427,8 @@ static inline void __run_timers(tvec_base_t *base) | |||
431 | (!cascade(base, &base->tv3, INDEX(1))) && | 427 | (!cascade(base, &base->tv3, INDEX(1))) && |
432 | !cascade(base, &base->tv4, INDEX(2))) | 428 | !cascade(base, &base->tv4, INDEX(2))) |
433 | cascade(base, &base->tv5, INDEX(3)); | 429 | cascade(base, &base->tv5, INDEX(3)); |
434 | ++base->timer_jiffies; | 430 | ++base->timer_jiffies; |
435 | list_splice_init(base->tv1.vec + index, &work_list); | 431 | list_replace_init(base->tv1.vec + index, &work_list); |
436 | while (!list_empty(head)) { | 432 | while (!list_empty(head)) { |
437 | void (*fn)(unsigned long); | 433 | void (*fn)(unsigned long); |
438 | unsigned long data; | 434 | unsigned long data; |
@@ -601,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ | |||
601 | long time_precision = 1; /* clock precision (us) */ | 597 | long time_precision = 1; /* clock precision (us) */ |
602 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ | 598 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ |
603 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ | 599 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ |
604 | static long time_phase; /* phase offset (scaled us) */ | ||
605 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; | 600 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; |
606 | /* frequency offset (scaled ppm)*/ | 601 | /* frequency offset (scaled ppm)*/ |
607 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ | 602 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ |
@@ -751,27 +746,14 @@ static long adjtime_adjustment(void) | |||
751 | } | 746 | } |
752 | 747 | ||
753 | /* in the NTP reference this is called "hardclock()" */ | 748 | /* in the NTP reference this is called "hardclock()" */ |
754 | static void update_wall_time_one_tick(void) | 749 | static void update_ntp_one_tick(void) |
755 | { | 750 | { |
756 | long time_adjust_step, delta_nsec; | 751 | long time_adjust_step; |
757 | 752 | ||
758 | time_adjust_step = adjtime_adjustment(); | 753 | time_adjust_step = adjtime_adjustment(); |
759 | if (time_adjust_step) | 754 | if (time_adjust_step) |
760 | /* Reduce by this step the amount of time left */ | 755 | /* Reduce by this step the amount of time left */ |
761 | time_adjust -= time_adjust_step; | 756 | time_adjust -= time_adjust_step; |
762 | delta_nsec = tick_nsec + time_adjust_step * 1000; | ||
763 | /* | ||
764 | * Advance the phase, once it gets to one microsecond, then | ||
765 | * advance the tick more. | ||
766 | */ | ||
767 | time_phase += time_adj; | ||
768 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { | ||
769 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); | ||
770 | time_phase -= ltemp << (SHIFT_SCALE - 10); | ||
771 | delta_nsec += ltemp; | ||
772 | } | ||
773 | xtime.tv_nsec += delta_nsec; | ||
774 | time_interpolator_update(delta_nsec); | ||
775 | 757 | ||
776 | /* Changes by adjtime() do not take effect till next tick. */ | 758 | /* Changes by adjtime() do not take effect till next tick. */ |
777 | if (time_next_adjust != 0) { | 759 | if (time_next_adjust != 0) { |
@@ -784,36 +766,378 @@ static void update_wall_time_one_tick(void) | |||
784 | * Return how long ticks are at the moment, that is, how much time | 766 | * Return how long ticks are at the moment, that is, how much time |
785 | * update_wall_time_one_tick will add to xtime next time we call it | 767 | * update_wall_time_one_tick will add to xtime next time we call it |
786 | * (assuming no calls to do_adjtimex in the meantime). | 768 | * (assuming no calls to do_adjtimex in the meantime). |
787 | * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 | 769 | * The return value is in fixed-point nanoseconds shifted by the |
788 | * bits to the right of the binary point. | 770 | * specified number of bits to the right of the binary point. |
789 | * This function has no side-effects. | 771 | * This function has no side-effects. |
790 | */ | 772 | */ |
791 | u64 current_tick_length(void) | 773 | u64 current_tick_length(void) |
792 | { | 774 | { |
793 | long delta_nsec; | 775 | long delta_nsec; |
776 | u64 ret; | ||
794 | 777 | ||
778 | /* calculate the finest interval NTP will allow. | ||
779 | * ie: nanosecond value shifted by (SHIFT_SCALE - 10) | ||
780 | */ | ||
795 | delta_nsec = tick_nsec + adjtime_adjustment() * 1000; | 781 | delta_nsec = tick_nsec + adjtime_adjustment() * 1000; |
796 | return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; | 782 | ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; |
783 | ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); | ||
784 | |||
785 | return ret; | ||
797 | } | 786 | } |
798 | 787 | ||
799 | /* | 788 | /* XXX - all of this timekeeping code should be later moved to time.c */ |
800 | * Using a loop looks inefficient, but "ticks" is | 789 | #include <linux/clocksource.h> |
801 | * usually just one (we shouldn't be losing ticks, | 790 | static struct clocksource *clock; /* pointer to current clocksource */ |
802 | * we're doing this this way mainly for interrupt | 791 | |
803 | * latency reasons, not because we think we'll | 792 | #ifdef CONFIG_GENERIC_TIME |
804 | * have lots of lost timer ticks | 793 | /** |
794 | * __get_nsec_offset - Returns nanoseconds since last call to update_wall_time | ||
795 | * | ||
796 | * private function, must hold xtime_lock lock when being | ||
797 | * called. Returns the number of nanoseconds since the | ||
798 | * last call to update_wall_time() (adjusted by NTP scaling) | ||
799 | */ | ||
800 | static inline s64 __get_nsec_offset(void) | ||
801 | { | ||
802 | cycle_t cycle_now, cycle_delta; | ||
803 | s64 ns_offset; | ||
804 | |||
805 | /* read clocksource: */ | ||
806 | cycle_now = clocksource_read(clock); | ||
807 | |||
808 | /* calculate the delta since the last update_wall_time: */ | ||
809 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | ||
810 | |||
811 | /* convert to nanoseconds: */ | ||
812 | ns_offset = cyc2ns(clock, cycle_delta); | ||
813 | |||
814 | return ns_offset; | ||
815 | } | ||
816 | |||
817 | /** | ||
818 | * __get_realtime_clock_ts - Returns the time of day in a timespec | ||
819 | * @ts: pointer to the timespec to be set | ||
820 | * | ||
821 | * Returns the time of day in a timespec. Used by | ||
822 | * do_gettimeofday() and get_realtime_clock_ts(). | ||
823 | */ | ||
824 | static inline void __get_realtime_clock_ts(struct timespec *ts) | ||
825 | { | ||
826 | unsigned long seq; | ||
827 | s64 nsecs; | ||
828 | |||
829 | do { | ||
830 | seq = read_seqbegin(&xtime_lock); | ||
831 | |||
832 | *ts = xtime; | ||
833 | nsecs = __get_nsec_offset(); | ||
834 | |||
835 | } while (read_seqretry(&xtime_lock, seq)); | ||
836 | |||
837 | timespec_add_ns(ts, nsecs); | ||
838 | } | ||
839 | |||
840 | /** | ||
841 | * getnstimeofday - Returns the time of day in a timespec | ||
842 | * @ts: pointer to the timespec to be set | ||
843 | * | ||
844 | * Returns the time of day in a timespec. | ||
845 | */ | ||
846 | void getnstimeofday(struct timespec *ts) | ||
847 | { | ||
848 | __get_realtime_clock_ts(ts); | ||
849 | } | ||
850 | |||
851 | EXPORT_SYMBOL(getnstimeofday); | ||
852 | |||
853 | /** | ||
854 | * do_gettimeofday - Returns the time of day in a timeval | ||
855 | * @tv: pointer to the timeval to be set | ||
856 | * | ||
857 | * NOTE: Users should be converted to using get_realtime_clock_ts() | ||
858 | */ | ||
859 | void do_gettimeofday(struct timeval *tv) | ||
860 | { | ||
861 | struct timespec now; | ||
862 | |||
863 | __get_realtime_clock_ts(&now); | ||
864 | tv->tv_sec = now.tv_sec; | ||
865 | tv->tv_usec = now.tv_nsec/1000; | ||
866 | } | ||
867 | |||
868 | EXPORT_SYMBOL(do_gettimeofday); | ||
869 | /** | ||
870 | * do_settimeofday - Sets the time of day | ||
871 | * @tv: pointer to the timespec variable containing the new time | ||
872 | * | ||
873 | * Sets the time of day to the new time and update NTP and notify hrtimers | ||
874 | */ | ||
875 | int do_settimeofday(struct timespec *tv) | ||
876 | { | ||
877 | unsigned long flags; | ||
878 | time_t wtm_sec, sec = tv->tv_sec; | ||
879 | long wtm_nsec, nsec = tv->tv_nsec; | ||
880 | |||
881 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | ||
882 | return -EINVAL; | ||
883 | |||
884 | write_seqlock_irqsave(&xtime_lock, flags); | ||
885 | |||
886 | nsec -= __get_nsec_offset(); | ||
887 | |||
888 | wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); | ||
889 | wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); | ||
890 | |||
891 | set_normalized_timespec(&xtime, sec, nsec); | ||
892 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | ||
893 | |||
894 | ntp_clear(); | ||
895 | |||
896 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
897 | |||
898 | /* signal hrtimers about time change */ | ||
899 | clock_was_set(); | ||
900 | |||
901 | return 0; | ||
902 | } | ||
903 | |||
904 | EXPORT_SYMBOL(do_settimeofday); | ||
905 | |||
906 | /** | ||
907 | * change_clocksource - Swaps clocksources if a new one is available | ||
908 | * | ||
909 | * Accumulates current time interval and initializes new clocksource | ||
910 | */ | ||
911 | static int change_clocksource(void) | ||
912 | { | ||
913 | struct clocksource *new; | ||
914 | cycle_t now; | ||
915 | u64 nsec; | ||
916 | new = clocksource_get_next(); | ||
917 | if (clock != new) { | ||
918 | now = clocksource_read(new); | ||
919 | nsec = __get_nsec_offset(); | ||
920 | timespec_add_ns(&xtime, nsec); | ||
921 | |||
922 | clock = new; | ||
923 | clock->cycle_last = now; | ||
924 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | ||
925 | clock->name); | ||
926 | return 1; | ||
927 | } else if (clock->update_callback) { | ||
928 | return clock->update_callback(); | ||
929 | } | ||
930 | return 0; | ||
931 | } | ||
932 | #else | ||
933 | #define change_clocksource() (0) | ||
934 | #endif | ||
935 | |||
936 | /** | ||
937 | * timekeeping_is_continuous - check to see if timekeeping is free running | ||
805 | */ | 938 | */ |
806 | static void update_wall_time(unsigned long ticks) | 939 | int timekeeping_is_continuous(void) |
807 | { | 940 | { |
941 | unsigned long seq; | ||
942 | int ret; | ||
943 | |||
808 | do { | 944 | do { |
809 | ticks--; | 945 | seq = read_seqbegin(&xtime_lock); |
810 | update_wall_time_one_tick(); | 946 | |
811 | if (xtime.tv_nsec >= 1000000000) { | 947 | ret = clock->is_continuous; |
812 | xtime.tv_nsec -= 1000000000; | 948 | |
949 | } while (read_seqretry(&xtime_lock, seq)); | ||
950 | |||
951 | return ret; | ||
952 | } | ||
953 | |||
954 | /* | ||
955 | * timekeeping_init - Initializes the clocksource and common timekeeping values | ||
956 | */ | ||
957 | void __init timekeeping_init(void) | ||
958 | { | ||
959 | unsigned long flags; | ||
960 | |||
961 | write_seqlock_irqsave(&xtime_lock, flags); | ||
962 | clock = clocksource_get_next(); | ||
963 | clocksource_calculate_interval(clock, tick_nsec); | ||
964 | clock->cycle_last = clocksource_read(clock); | ||
965 | ntp_clear(); | ||
966 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
967 | } | ||
968 | |||
969 | |||
970 | /* | ||
971 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | ||
972 | * @dev: unused | ||
973 | * | ||
974 | * This is for the generic clocksource timekeeping. | ||
975 | * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are | ||
976 | * still managed by arch specific suspend/resume code. | ||
977 | */ | ||
978 | static int timekeeping_resume(struct sys_device *dev) | ||
979 | { | ||
980 | unsigned long flags; | ||
981 | |||
982 | write_seqlock_irqsave(&xtime_lock, flags); | ||
983 | /* restart the last cycle value */ | ||
984 | clock->cycle_last = clocksource_read(clock); | ||
985 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
986 | return 0; | ||
987 | } | ||
988 | |||
989 | /* sysfs resume/suspend bits for timekeeping */ | ||
990 | static struct sysdev_class timekeeping_sysclass = { | ||
991 | .resume = timekeeping_resume, | ||
992 | set_kset_name("timekeeping"), | ||
993 | }; | ||
994 | |||
995 | static struct sys_device device_timer = { | ||
996 | .id = 0, | ||
997 | .cls = &timekeeping_sysclass, | ||
998 | }; | ||
999 | |||
1000 | static int __init timekeeping_init_device(void) | ||
1001 | { | ||
1002 | int error = sysdev_class_register(&timekeeping_sysclass); | ||
1003 | if (!error) | ||
1004 | error = sysdev_register(&device_timer); | ||
1005 | return error; | ||
1006 | } | ||
1007 | |||
1008 | device_initcall(timekeeping_init_device); | ||
1009 | |||
1010 | /* | ||
1011 | * If the error is already larger, we look ahead another tick, | ||
1012 | * to compensate for late or lost adjustments. | ||
1013 | */ | ||
1014 | static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) | ||
1015 | { | ||
1016 | int adj; | ||
1017 | |||
1018 | /* | ||
1019 | * As soon as the machine is synchronized to the external time | ||
1020 | * source this should be the common case. | ||
1021 | */ | ||
1022 | error >>= 2; | ||
1023 | if (likely(sign > 0 ? error <= *interval : error >= *interval)) | ||
1024 | return sign; | ||
1025 | |||
1026 | /* | ||
1027 | * An extra look ahead dampens the effect of the current error, | ||
1028 | * which can grow quite large with continuously late updates, as | ||
1029 | * it would dominate the adjustment value and can lead to | ||
1030 | * oscillation. | ||
1031 | */ | ||
1032 | error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
1033 | error -= clock->xtime_interval >> 1; | ||
1034 | |||
1035 | adj = 0; | ||
1036 | while (1) { | ||
1037 | error >>= 1; | ||
1038 | if (sign > 0 ? error <= *interval : error >= *interval) | ||
1039 | break; | ||
1040 | adj++; | ||
1041 | } | ||
1042 | |||
1043 | /* | ||
1044 | * Add the current adjustments to the error and take the offset | ||
1045 | * into account, the latter can cause the error to be hardly | ||
1046 | * reduced at the next tick. Check the error again if there's | ||
1047 | * room for another adjustment, thus further reducing the error | ||
1048 | * which otherwise had to be corrected at the next update. | ||
1049 | */ | ||
1050 | error = (error << 1) - *interval + *offset; | ||
1051 | if (sign > 0 ? error > *interval : error < *interval) | ||
1052 | adj++; | ||
1053 | |||
1054 | *interval <<= adj; | ||
1055 | *offset <<= adj; | ||
1056 | return sign << adj; | ||
1057 | } | ||
1058 | |||
1059 | /* | ||
1060 | * Adjust the multiplier to reduce the error value, | ||
1061 | * this is optimized for the most common adjustments of -1,0,1, | ||
1062 | * for other values we can do a bit more work. | ||
1063 | */ | ||
1064 | static void clocksource_adjust(struct clocksource *clock, s64 offset) | ||
1065 | { | ||
1066 | s64 error, interval = clock->cycle_interval; | ||
1067 | int adj; | ||
1068 | |||
1069 | error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); | ||
1070 | if (error > interval) { | ||
1071 | adj = clocksource_bigadjust(1, error, &interval, &offset); | ||
1072 | } else if (error < -interval) { | ||
1073 | interval = -interval; | ||
1074 | offset = -offset; | ||
1075 | adj = clocksource_bigadjust(-1, error, &interval, &offset); | ||
1076 | } else | ||
1077 | return; | ||
1078 | |||
1079 | clock->mult += adj; | ||
1080 | clock->xtime_interval += interval; | ||
1081 | clock->xtime_nsec -= offset; | ||
1082 | clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); | ||
1083 | } | ||
1084 | |||
1085 | /* | ||
1086 | * update_wall_time - Uses the current clocksource to increment the wall time | ||
1087 | * | ||
1088 | * Called from the timer interrupt, must hold a write on xtime_lock. | ||
1089 | */ | ||
1090 | static void update_wall_time(void) | ||
1091 | { | ||
1092 | cycle_t offset; | ||
1093 | |||
1094 | clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; | ||
1095 | |||
1096 | #ifdef CONFIG_GENERIC_TIME | ||
1097 | offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; | ||
1098 | #else | ||
1099 | offset = clock->cycle_interval; | ||
1100 | #endif | ||
1101 | |||
1102 | /* normally this loop will run just once, however in the | ||
1103 | * case of lost or late ticks, it will accumulate correctly. | ||
1104 | */ | ||
1105 | while (offset >= clock->cycle_interval) { | ||
1106 | /* accumulate one interval */ | ||
1107 | clock->xtime_nsec += clock->xtime_interval; | ||
1108 | clock->cycle_last += clock->cycle_interval; | ||
1109 | offset -= clock->cycle_interval; | ||
1110 | |||
1111 | if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { | ||
1112 | clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; | ||
813 | xtime.tv_sec++; | 1113 | xtime.tv_sec++; |
814 | second_overflow(); | 1114 | second_overflow(); |
815 | } | 1115 | } |
816 | } while (ticks); | 1116 | |
1117 | /* interpolator bits */ | ||
1118 | time_interpolator_update(clock->xtime_interval | ||
1119 | >> clock->shift); | ||
1120 | /* increment the NTP state machine */ | ||
1121 | update_ntp_one_tick(); | ||
1122 | |||
1123 | /* accumulate error between NTP and clock interval */ | ||
1124 | clock->error += current_tick_length(); | ||
1125 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); | ||
1126 | } | ||
1127 | |||
1128 | /* correct the clock when NTP error is too big */ | ||
1129 | clocksource_adjust(clock, offset); | ||
1130 | |||
1131 | /* store full nanoseconds into xtime */ | ||
1132 | xtime.tv_nsec = clock->xtime_nsec >> clock->shift; | ||
1133 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; | ||
1134 | |||
1135 | /* check to see if there is a new clocksource to use */ | ||
1136 | if (change_clocksource()) { | ||
1137 | clock->error = 0; | ||
1138 | clock->xtime_nsec = 0; | ||
1139 | clocksource_calculate_interval(clock, tick_nsec); | ||
1140 | } | ||
817 | } | 1141 | } |
818 | 1142 | ||
819 | /* | 1143 | /* |
@@ -919,10 +1243,8 @@ static inline void update_times(void) | |||
919 | unsigned long ticks; | 1243 | unsigned long ticks; |
920 | 1244 | ||
921 | ticks = jiffies - wall_jiffies; | 1245 | ticks = jiffies - wall_jiffies; |
922 | if (ticks) { | 1246 | wall_jiffies += ticks; |
923 | wall_jiffies += ticks; | 1247 | update_wall_time(); |
924 | update_wall_time(ticks); | ||
925 | } | ||
926 | calc_load(ticks); | 1248 | calc_load(ticks); |
927 | } | 1249 | } |
928 | 1250 | ||
@@ -1330,7 +1652,7 @@ static void __devinit migrate_timers(int cpu) | |||
1330 | } | 1652 | } |
1331 | #endif /* CONFIG_HOTPLUG_CPU */ | 1653 | #endif /* CONFIG_HOTPLUG_CPU */ |
1332 | 1654 | ||
1333 | static int timer_cpu_notify(struct notifier_block *self, | 1655 | static int __devinit timer_cpu_notify(struct notifier_block *self, |
1334 | unsigned long action, void *hcpu) | 1656 | unsigned long action, void *hcpu) |
1335 | { | 1657 | { |
1336 | long cpu = (long)hcpu; | 1658 | long cpu = (long)hcpu; |
@@ -1350,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self, | |||
1350 | return NOTIFY_OK; | 1672 | return NOTIFY_OK; |
1351 | } | 1673 | } |
1352 | 1674 | ||
1353 | static struct notifier_block timers_nb = { | 1675 | static struct notifier_block __devinitdata timers_nb = { |
1354 | .notifier_call = timer_cpu_notify, | 1676 | .notifier_call = timer_cpu_notify, |
1355 | }; | 1677 | }; |
1356 | 1678 | ||
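For the GENERIC_TIME path added above, the bookkeeping is plain fixed-point arithmetic: update_wall_time() keeps nanoseconds left-shifted by clock->shift in clock->xtime_nsec, accumulates clock->xtime_interval once per cycle_interval cycles, and tracks in clock->error the difference from what NTP says a tick should be (current_tick_length(), in TICK_LENGTH_SHIFT fixed point), which clocksource_adjust() then corrects by nudging clock->mult. A hedged worked example of the cycle-to-nanosecond conversion __get_nsec_offset() relies on: for an assumed 1 MHz clocksource with shift 10, mult would be about (10^9 << 10) / 10^6 = 1,024,000, so a delta of 1,000 cycles converts to (1,000 * 1,024,000) >> 10 = 1,000,000 ns, i.e. one millisecond, which is essentially what cyc2ns() in <linux/clocksource.h> computes from cycle_delta, mult and shift.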
diff --git a/kernel/unwind.c b/kernel/unwind.c new file mode 100644 index 000000000000..f69c804c8e62 --- /dev/null +++ b/kernel/unwind.c | |||
@@ -0,0 +1,918 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2002-2006 Novell, Inc. | ||
3 | * Jan Beulich <jbeulich@novell.com> | ||
4 | * This code is released under version 2 of the GNU GPL. | ||
5 | * | ||
6 | * A simple API for unwinding kernel stacks. This is used for | ||
7 | * debugging and error reporting purposes. The kernel doesn't need | ||
8 | * full-blown stack unwinding with all the bells and whistles, so there | ||
9 | * is not much point in implementing the full Dwarf2 unwind API. | ||
10 | */ | ||
11 | |||
12 | #include <linux/unwind.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/delay.h> | ||
15 | #include <linux/stop_machine.h> | ||
16 | #include <asm/sections.h> | ||
17 | #include <asm/uaccess.h> | ||
18 | #include <asm/unaligned.h> | ||
19 | |||
20 | extern char __start_unwind[], __end_unwind[]; | ||
21 | |||
22 | #define MAX_STACK_DEPTH 8 | ||
23 | |||
24 | #define EXTRA_INFO(f) { \ | ||
25 | BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \ | ||
26 | % FIELD_SIZEOF(struct unwind_frame_info, f)) \ | ||
27 | + offsetof(struct unwind_frame_info, f) \ | ||
28 | / FIELD_SIZEOF(struct unwind_frame_info, f), \ | ||
29 | FIELD_SIZEOF(struct unwind_frame_info, f) \ | ||
30 | } | ||
31 | #define PTREGS_INFO(f) EXTRA_INFO(regs.f) | ||
32 | |||
33 | static const struct { | ||
34 | unsigned offs:BITS_PER_LONG / 2; | ||
35 | unsigned width:BITS_PER_LONG / 2; | ||
36 | } reg_info[] = { | ||
37 | UNW_REGISTER_INFO | ||
38 | }; | ||
39 | |||
40 | #undef PTREGS_INFO | ||
41 | #undef EXTRA_INFO | ||
42 | |||
43 | #ifndef REG_INVALID | ||
44 | #define REG_INVALID(r) (reg_info[r].width == 0) | ||
45 | #endif | ||
46 | |||
47 | #define DW_CFA_nop 0x00 | ||
48 | #define DW_CFA_set_loc 0x01 | ||
49 | #define DW_CFA_advance_loc1 0x02 | ||
50 | #define DW_CFA_advance_loc2 0x03 | ||
51 | #define DW_CFA_advance_loc4 0x04 | ||
52 | #define DW_CFA_offset_extended 0x05 | ||
53 | #define DW_CFA_restore_extended 0x06 | ||
54 | #define DW_CFA_undefined 0x07 | ||
55 | #define DW_CFA_same_value 0x08 | ||
56 | #define DW_CFA_register 0x09 | ||
57 | #define DW_CFA_remember_state 0x0a | ||
58 | #define DW_CFA_restore_state 0x0b | ||
59 | #define DW_CFA_def_cfa 0x0c | ||
60 | #define DW_CFA_def_cfa_register 0x0d | ||
61 | #define DW_CFA_def_cfa_offset 0x0e | ||
62 | #define DW_CFA_def_cfa_expression 0x0f | ||
63 | #define DW_CFA_expression 0x10 | ||
64 | #define DW_CFA_offset_extended_sf 0x11 | ||
65 | #define DW_CFA_def_cfa_sf 0x12 | ||
66 | #define DW_CFA_def_cfa_offset_sf 0x13 | ||
67 | #define DW_CFA_val_offset 0x14 | ||
68 | #define DW_CFA_val_offset_sf 0x15 | ||
69 | #define DW_CFA_val_expression 0x16 | ||
70 | #define DW_CFA_lo_user 0x1c | ||
71 | #define DW_CFA_GNU_window_save 0x2d | ||
72 | #define DW_CFA_GNU_args_size 0x2e | ||
73 | #define DW_CFA_GNU_negative_offset_extended 0x2f | ||
74 | #define DW_CFA_hi_user 0x3f | ||
75 | |||
76 | #define DW_EH_PE_FORM 0x07 | ||
77 | #define DW_EH_PE_native 0x00 | ||
78 | #define DW_EH_PE_leb128 0x01 | ||
79 | #define DW_EH_PE_data2 0x02 | ||
80 | #define DW_EH_PE_data4 0x03 | ||
81 | #define DW_EH_PE_data8 0x04 | ||
82 | #define DW_EH_PE_signed 0x08 | ||
83 | #define DW_EH_PE_ADJUST 0x70 | ||
84 | #define DW_EH_PE_abs 0x00 | ||
85 | #define DW_EH_PE_pcrel 0x10 | ||
86 | #define DW_EH_PE_textrel 0x20 | ||
87 | #define DW_EH_PE_datarel 0x30 | ||
88 | #define DW_EH_PE_funcrel 0x40 | ||
89 | #define DW_EH_PE_aligned 0x50 | ||
90 | #define DW_EH_PE_indirect 0x80 | ||
91 | #define DW_EH_PE_omit 0xff | ||
92 | |||
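/*
 * Summary of the encoding above (annotation, not in the original file):
 * a DW_EH_PE byte combines one value from each group.  The low three bits
 * select the data width, bit 3 signedness, the DW_EH_PE_ADJUST bits (4-6)
 * the base the value is relative to, and bit 7 indirection.  For example
 * DW_EH_PE_pcrel | DW_EH_PE_signed | DW_EH_PE_data4 (0x1b) is a signed
 * 32-bit value relative to where it is stored, while 0xff (DW_EH_PE_omit)
 * marks a field that is absent altogether.
 */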
93 | typedef unsigned long uleb128_t; | ||
94 | typedef signed long sleb128_t; | ||
95 | |||
96 | static struct unwind_table { | ||
97 | struct { | ||
98 | unsigned long pc; | ||
99 | unsigned long range; | ||
100 | } core, init; | ||
101 | const void *address; | ||
102 | unsigned long size; | ||
103 | struct unwind_table *link; | ||
104 | const char *name; | ||
105 | } root_table, *last_table; | ||
106 | |||
107 | struct unwind_item { | ||
108 | enum item_location { | ||
109 | Nowhere, | ||
110 | Memory, | ||
111 | Register, | ||
112 | Value | ||
113 | } where; | ||
114 | uleb128_t value; | ||
115 | }; | ||
116 | |||
117 | struct unwind_state { | ||
118 | uleb128_t loc, org; | ||
119 | const u8 *cieStart, *cieEnd; | ||
120 | uleb128_t codeAlign; | ||
121 | sleb128_t dataAlign; | ||
122 | struct cfa { | ||
123 | uleb128_t reg, offs; | ||
124 | } cfa; | ||
125 | struct unwind_item regs[ARRAY_SIZE(reg_info)]; | ||
126 | unsigned stackDepth:8; | ||
127 | unsigned version:8; | ||
128 | const u8 *label; | ||
129 | const u8 *stack[MAX_STACK_DEPTH]; | ||
130 | }; | ||
131 | |||
132 | static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; | ||
133 | |||
134 | static struct unwind_table *find_table(unsigned long pc) | ||
135 | { | ||
136 | struct unwind_table *table; | ||
137 | |||
138 | for (table = &root_table; table; table = table->link) | ||
139 | if ((pc >= table->core.pc | ||
140 | && pc < table->core.pc + table->core.range) | ||
141 | || (pc >= table->init.pc | ||
142 | && pc < table->init.pc + table->init.range)) | ||
143 | break; | ||
144 | |||
145 | return table; | ||
146 | } | ||
147 | |||
148 | static void init_unwind_table(struct unwind_table *table, | ||
149 | const char *name, | ||
150 | const void *core_start, | ||
151 | unsigned long core_size, | ||
152 | const void *init_start, | ||
153 | unsigned long init_size, | ||
154 | const void *table_start, | ||
155 | unsigned long table_size) | ||
156 | { | ||
157 | table->core.pc = (unsigned long)core_start; | ||
158 | table->core.range = core_size; | ||
159 | table->init.pc = (unsigned long)init_start; | ||
160 | table->init.range = init_size; | ||
161 | table->address = table_start; | ||
162 | table->size = table_size; | ||
163 | table->link = NULL; | ||
164 | table->name = name; | ||
165 | } | ||
166 | |||
167 | void __init unwind_init(void) | ||
168 | { | ||
169 | init_unwind_table(&root_table, "kernel", | ||
170 | _text, _end - _text, | ||
171 | NULL, 0, | ||
172 | __start_unwind, __end_unwind - __start_unwind); | ||
173 | } | ||
174 | |||
175 | #ifdef CONFIG_MODULES | ||
176 | |||
177 | /* Must be called with module_mutex held. */ | ||
178 | void *unwind_add_table(struct module *module, | ||
179 | const void *table_start, | ||
180 | unsigned long table_size) | ||
181 | { | ||
182 | struct unwind_table *table; | ||
183 | |||
184 | if (table_size <= 0) | ||
185 | return NULL; | ||
186 | |||
187 | table = kmalloc(sizeof(*table), GFP_KERNEL); | ||
188 | if (!table) | ||
189 | return NULL; | ||
190 | |||
191 | init_unwind_table(table, module->name, | ||
192 | module->module_core, module->core_size, | ||
193 | module->module_init, module->init_size, | ||
194 | table_start, table_size); | ||
195 | |||
196 | if (last_table) | ||
197 | last_table->link = table; | ||
198 | else | ||
199 | root_table.link = table; | ||
200 | last_table = table; | ||
201 | |||
202 | return table; | ||
203 | } | ||
204 | |||
205 | struct unlink_table_info | ||
206 | { | ||
207 | struct unwind_table *table; | ||
208 | int init_only; | ||
209 | }; | ||
210 | |||
211 | static int unlink_table(void *arg) | ||
212 | { | ||
213 | struct unlink_table_info *info = arg; | ||
214 | struct unwind_table *table = info->table, *prev; | ||
215 | |||
216 | for (prev = &root_table; prev->link && prev->link != table; prev = prev->link) | ||
217 | ; | ||
218 | |||
219 | if (prev->link) { | ||
220 | if (info->init_only) { | ||
221 | table->init.pc = 0; | ||
222 | table->init.range = 0; | ||
223 | info->table = NULL; | ||
224 | } else { | ||
225 | prev->link = table->link; | ||
226 | if (!prev->link) | ||
227 | last_table = prev; | ||
228 | } | ||
229 | } else | ||
230 | info->table = NULL; | ||
231 | |||
232 | return 0; | ||
233 | } | ||
234 | |||
235 | /* Must be called with module_mutex held. */ | ||
236 | void unwind_remove_table(void *handle, int init_only) | ||
237 | { | ||
238 | struct unwind_table *table = handle; | ||
239 | struct unlink_table_info info; | ||
240 | |||
241 | if (!table || table == &root_table) | ||
242 | return; | ||
243 | |||
244 | if (init_only && table == last_table) { | ||
245 | table->init.pc = 0; | ||
246 | table->init.range = 0; | ||
247 | return; | ||
248 | } | ||
249 | |||
250 | info.table = table; | ||
251 | info.init_only = init_only; | ||
252 | stop_machine_run(unlink_table, &info, NR_CPUS); | ||
253 | |||
254 | if (info.table) | ||
255 | kfree(table); | ||
256 | } | ||
257 | |||
258 | #endif /* CONFIG_MODULES */ | ||
259 | |||
260 | static uleb128_t get_uleb128(const u8 **pcur, const u8 *end) | ||
261 | { | ||
262 | const u8 *cur = *pcur; | ||
263 | uleb128_t value; | ||
264 | unsigned shift; | ||
265 | |||
266 | for (shift = 0, value = 0; cur < end; shift += 7) { | ||
267 | if (shift + 7 > 8 * sizeof(value) | ||
268 | && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { | ||
269 | cur = end + 1; | ||
270 | break; | ||
271 | } | ||
272 | value |= (uleb128_t)(*cur & 0x7f) << shift; | ||
273 | if (!(*cur++ & 0x80)) | ||
274 | break; | ||
275 | } | ||
276 | *pcur = cur; | ||
277 | |||
278 | return value; | ||
279 | } | ||
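/*
 * Worked example (byte values taken from the DWARF spec): the sequence
 * 0xe5 0x8e 0x26 decodes as 0x65 | 0x0e << 7 | 0x26 << 14 == 624485;
 * decoding stops at the first byte with bit 7 clear.
 */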
280 | |||
281 | static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) | ||
282 | { | ||
283 | const u8 *cur = *pcur; | ||
284 | sleb128_t value; | ||
285 | unsigned shift; | ||
286 | |||
287 | for (shift = 0, value = 0; cur < end; shift += 7) { | ||
288 | if (shift + 7 > 8 * sizeof(value) | ||
289 | && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { | ||
290 | cur = end + 1; | ||
291 | break; | ||
292 | } | ||
293 | value |= (sleb128_t)(*cur & 0x7f) << shift; | ||
294 | if (!(*cur & 0x80)) { | ||
295 | value |= -(*cur++ & 0x40) << shift; | ||
296 | break; | ||
297 | } | ||
298 | } | ||
299 | *pcur = cur; | ||
300 | |||
301 | return value; | ||
302 | } | ||
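/*
 * Worked example: 0x9b 0xf1 0x59 accumulates to 0x1b | 0x71 << 7 |
 * 0x59 << 14 == 1472667; bit 6 of the last byte is set, so the remaining
 * high bits are filled with ones, giving -624485.
 */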
303 | |||
304 | static unsigned long read_pointer(const u8 **pLoc, | ||
305 | const void *end, | ||
306 | signed ptrType) | ||
307 | { | ||
308 | unsigned long value = 0; | ||
309 | union { | ||
310 | const u8 *p8; | ||
311 | const u16 *p16u; | ||
312 | const s16 *p16s; | ||
313 | const u32 *p32u; | ||
314 | const s32 *p32s; | ||
315 | const unsigned long *pul; | ||
316 | } ptr; | ||
317 | |||
318 | if (ptrType < 0 || ptrType == DW_EH_PE_omit) | ||
319 | return 0; | ||
320 | ptr.p8 = *pLoc; | ||
321 | switch(ptrType & DW_EH_PE_FORM) { | ||
322 | case DW_EH_PE_data2: | ||
323 | if (end < (const void *)(ptr.p16u + 1)) | ||
324 | return 0; | ||
325 | if(ptrType & DW_EH_PE_signed) | ||
326 | value = get_unaligned(ptr.p16s++); | ||
327 | else | ||
328 | value = get_unaligned(ptr.p16u++); | ||
329 | break; | ||
330 | case DW_EH_PE_data4: | ||
331 | #ifdef CONFIG_64BIT | ||
332 | if (end < (const void *)(ptr.p32u + 1)) | ||
333 | return 0; | ||
334 | if(ptrType & DW_EH_PE_signed) | ||
335 | value = get_unaligned(ptr.p32s++); | ||
336 | else | ||
337 | value = get_unaligned(ptr.p32u++); | ||
338 | break; | ||
339 | case DW_EH_PE_data8: | ||
340 | BUILD_BUG_ON(sizeof(u64) != sizeof(value)); | ||
341 | #else | ||
342 | BUILD_BUG_ON(sizeof(u32) != sizeof(value)); | ||
343 | #endif | ||
344 | case DW_EH_PE_native: | ||
345 | if (end < (const void *)(ptr.pul + 1)) | ||
346 | return 0; | ||
347 | value = get_unaligned(ptr.pul++); | ||
348 | break; | ||
349 | case DW_EH_PE_leb128: | ||
350 | BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value)); | ||
351 | value = ptrType & DW_EH_PE_signed | ||
352 | ? get_sleb128(&ptr.p8, end) | ||
353 | : get_uleb128(&ptr.p8, end); | ||
354 | if ((const void *)ptr.p8 > end) | ||
355 | return 0; | ||
356 | break; | ||
357 | default: | ||
358 | return 0; | ||
359 | } | ||
360 | switch(ptrType & DW_EH_PE_ADJUST) { | ||
361 | case DW_EH_PE_abs: | ||
362 | break; | ||
363 | case DW_EH_PE_pcrel: | ||
364 | value += (unsigned long)*pLoc; | ||
365 | break; | ||
366 | default: | ||
367 | return 0; | ||
368 | } | ||
369 | if ((ptrType & DW_EH_PE_indirect) | ||
370 | && __get_user(value, (unsigned long *)value)) | ||
371 | return 0; | ||
372 | *pLoc = ptr.p8; | ||
373 | |||
374 | return value; | ||
375 | } | ||
376 | |||
377 | static signed fde_pointer_type(const u32 *cie) | ||
378 | { | ||
379 | const u8 *ptr = (const u8 *)(cie + 2); | ||
380 | unsigned version = *ptr; | ||
381 | |||
382 | if (version != 1) | ||
383 | return -1; /* unsupported */ | ||
384 | if (*++ptr) { | ||
385 | const char *aug; | ||
386 | const u8 *end = (const u8 *)(cie + 1) + *cie; | ||
387 | uleb128_t len; | ||
388 | |||
389 | /* check if augmentation size is first (and thus present) */ | ||
390 | if (*ptr != 'z') | ||
391 | return -1; | ||
392 | /* check if augmentation string is nul-terminated */ | ||
393 | if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL) | ||
394 | return -1; | ||
395 | ++ptr; /* skip terminator */ | ||
396 | get_uleb128(&ptr, end); /* skip code alignment */ | ||
397 | get_sleb128(&ptr, end); /* skip data alignment */ | ||
398 | /* skip return address column */ | ||
399 | version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end); | ||
400 | len = get_uleb128(&ptr, end); /* augmentation length */ | ||
401 | if (ptr + len < ptr || ptr + len > end) | ||
402 | return -1; | ||
403 | end = ptr + len; | ||
404 | while (*++aug) { | ||
405 | if (ptr >= end) | ||
406 | return -1; | ||
407 | switch(*aug) { | ||
408 | case 'L': | ||
409 | ++ptr; | ||
410 | break; | ||
411 | case 'P': { | ||
412 | signed ptrType = *ptr++; | ||
413 | |||
414 | if (!read_pointer(&ptr, end, ptrType) || ptr > end) | ||
415 | return -1; | ||
416 | } | ||
417 | break; | ||
418 | case 'R': | ||
419 | return *ptr; | ||
420 | default: | ||
421 | return -1; | ||
422 | } | ||
423 | } | ||
424 | } | ||
425 | return DW_EH_PE_native|DW_EH_PE_abs; | ||
426 | } | ||
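/*
 * For reference, the CIE layout this function walks (as typically emitted
 * by gcc for .eh_frame, version 1 with augmentation "zR"):
 *
 *	u32	length
 *	u32	CIE id (0)
 *	u8	version (1)
 *	char	augmentation[] ("zR\0")
 *	uleb128	code alignment factor
 *	sleb128	data alignment factor
 *	u8	return address register (uleb128 for version > 1)
 *	uleb128	augmentation data length
 *	u8	FDE pointer encoding (the 'R' datum returned above)
 */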
427 | |||
428 | static int advance_loc(unsigned long delta, struct unwind_state *state) | ||
429 | { | ||
430 | state->loc += delta * state->codeAlign; | ||
431 | |||
432 | return delta > 0; | ||
433 | } | ||
434 | |||
435 | static void set_rule(uleb128_t reg, | ||
436 | enum item_location where, | ||
437 | uleb128_t value, | ||
438 | struct unwind_state *state) | ||
439 | { | ||
440 | if (reg < ARRAY_SIZE(state->regs)) { | ||
441 | state->regs[reg].where = where; | ||
442 | state->regs[reg].value = value; | ||
443 | } | ||
444 | } | ||
445 | |||
446 | static int processCFI(const u8 *start, | ||
447 | const u8 *end, | ||
448 | unsigned long targetLoc, | ||
449 | signed ptrType, | ||
450 | struct unwind_state *state) | ||
451 | { | ||
452 | union { | ||
453 | const u8 *p8; | ||
454 | const u16 *p16; | ||
455 | const u32 *p32; | ||
456 | } ptr; | ||
457 | int result = 1; | ||
458 | |||
459 | if (start != state->cieStart) { | ||
460 | state->loc = state->org; | ||
461 | result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state); | ||
462 | if (targetLoc == 0 && state->label == NULL) | ||
463 | return result; | ||
464 | } | ||
465 | for (ptr.p8 = start; result && ptr.p8 < end; ) { | ||
466 | switch(*ptr.p8 >> 6) { | ||
467 | uleb128_t value; | ||
468 | |||
469 | case 0: | ||
470 | switch(*ptr.p8++) { | ||
471 | case DW_CFA_nop: | ||
472 | break; | ||
473 | case DW_CFA_set_loc: | ||
474 | if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) | ||
475 | result = 0; | ||
476 | break; | ||
477 | case DW_CFA_advance_loc1: | ||
478 | result = ptr.p8 < end && advance_loc(*ptr.p8++, state); | ||
479 | break; | ||
480 | case DW_CFA_advance_loc2: | ||
481 | result = ptr.p8 + 2 <= end | ||
482 | && advance_loc(*ptr.p16++, state); | ||
483 | break; | ||
484 | case DW_CFA_advance_loc4: | ||
485 | result = ptr.p8 + 4 <= end | ||
486 | && advance_loc(*ptr.p32++, state); | ||
487 | break; | ||
488 | case DW_CFA_offset_extended: | ||
489 | value = get_uleb128(&ptr.p8, end); | ||
490 | set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); | ||
491 | break; | ||
492 | case DW_CFA_val_offset: | ||
493 | value = get_uleb128(&ptr.p8, end); | ||
494 | set_rule(value, Value, get_uleb128(&ptr.p8, end), state); | ||
495 | break; | ||
496 | case DW_CFA_offset_extended_sf: | ||
497 | value = get_uleb128(&ptr.p8, end); | ||
498 | set_rule(value, Memory, get_sleb128(&ptr.p8, end), state); | ||
499 | break; | ||
500 | case DW_CFA_val_offset_sf: | ||
501 | value = get_uleb128(&ptr.p8, end); | ||
502 | set_rule(value, Value, get_sleb128(&ptr.p8, end), state); | ||
503 | break; | ||
504 | case DW_CFA_restore_extended: | ||
505 | case DW_CFA_undefined: | ||
506 | case DW_CFA_same_value: | ||
507 | set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state); | ||
508 | break; | ||
509 | case DW_CFA_register: | ||
510 | value = get_uleb128(&ptr.p8, end); | ||
511 | set_rule(value, | ||
512 | Register, | ||
513 | get_uleb128(&ptr.p8, end), state); | ||
514 | break; | ||
515 | case DW_CFA_remember_state: | ||
516 | if (ptr.p8 == state->label) { | ||
517 | state->label = NULL; | ||
518 | return 1; | ||
519 | } | ||
520 | if (state->stackDepth >= MAX_STACK_DEPTH) | ||
521 | return 0; | ||
522 | state->stack[state->stackDepth++] = ptr.p8; | ||
523 | break; | ||
524 | case DW_CFA_restore_state: | ||
525 | if (state->stackDepth) { | ||
526 | const uleb128_t loc = state->loc; | ||
527 | const u8 *label = state->label; | ||
528 | |||
529 | state->label = state->stack[state->stackDepth - 1]; | ||
530 | memcpy(&state->cfa, &badCFA, sizeof(state->cfa)); | ||
531 | memset(state->regs, 0, sizeof(state->regs)); | ||
532 | state->stackDepth = 0; | ||
533 | result = processCFI(start, end, 0, ptrType, state); | ||
534 | state->loc = loc; | ||
535 | state->label = label; | ||
536 | } else | ||
537 | return 0; | ||
538 | break; | ||
539 | case DW_CFA_def_cfa: | ||
540 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
541 | /*nobreak*/ | ||
542 | case DW_CFA_def_cfa_offset: | ||
543 | state->cfa.offs = get_uleb128(&ptr.p8, end); | ||
544 | break; | ||
545 | case DW_CFA_def_cfa_sf: | ||
546 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
547 | /*nobreak*/ | ||
548 | case DW_CFA_def_cfa_offset_sf: | ||
549 | state->cfa.offs = get_sleb128(&ptr.p8, end) | ||
550 | * state->dataAlign; | ||
551 | break; | ||
552 | case DW_CFA_def_cfa_register: | ||
553 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
554 | break; | ||
555 | /*todo case DW_CFA_def_cfa_expression: */ | ||
556 | /*todo case DW_CFA_expression: */ | ||
557 | /*todo case DW_CFA_val_expression: */ | ||
558 | case DW_CFA_GNU_args_size: | ||
559 | get_uleb128(&ptr.p8, end); | ||
560 | break; | ||
561 | case DW_CFA_GNU_negative_offset_extended: | ||
562 | value = get_uleb128(&ptr.p8, end); | ||
563 | set_rule(value, | ||
564 | Memory, | ||
565 | (uleb128_t)0 - get_uleb128(&ptr.p8, end), state); | ||
566 | break; | ||
567 | case DW_CFA_GNU_window_save: | ||
568 | default: | ||
569 | result = 0; | ||
570 | break; | ||
571 | } | ||
572 | break; | ||
573 | case 1: | ||
574 | result = advance_loc(*ptr.p8++ & 0x3f, state); | ||
575 | break; | ||
576 | case 2: | ||
577 | value = *ptr.p8++ & 0x3f; | ||
578 | set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); | ||
579 | break; | ||
580 | case 3: | ||
581 | set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); | ||
582 | break; | ||
583 | } | ||
584 | if (ptr.p8 > end) | ||
585 | result = 0; | ||
586 | if (result && targetLoc != 0 && targetLoc < state->loc) | ||
587 | return 1; | ||
588 | } | ||
589 | |||
590 | return result | ||
591 | && ptr.p8 == end | ||
592 | && (targetLoc == 0 | ||
593 | || (/*todo While in theory this should apply, gcc in practice omits | ||
594 | everything past the function prolog, and hence the location | ||
595 | never reaches the end of the function. | ||
596 | targetLoc < state->loc &&*/ state->label == NULL)); | ||
597 | } | ||
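/*
 * Worked example of the compact CFI opcode forms handled by the switch on
 * (*ptr.p8 >> 6) above (byte values are illustrative; assume a dataAlign
 * of -4):
 *
 *	0x41		DW_CFA_advance_loc: loc += 1 * codeAlign
 *	0x85 0x02	DW_CFA_offset: reg 5 saved at CFA + 2 * dataAlign
 *	0xc5		DW_CFA_restore: this unwinder simply forgets the
 *			rule for reg 5 (sets it to Nowhere)
 */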
598 | |||
599 | /* Unwind to the previous frame. Returns 0 if successful, negative | ||
600 | * number in case of an error. */ | ||
601 | int unwind(struct unwind_frame_info *frame) | ||
602 | { | ||
603 | #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) | ||
604 | const u32 *fde = NULL, *cie = NULL; | ||
605 | const u8 *ptr = NULL, *end = NULL; | ||
606 | unsigned long startLoc = 0, endLoc = 0, cfa; | ||
607 | unsigned i; | ||
608 | signed ptrType = -1; | ||
609 | uleb128_t retAddrReg = 0; | ||
610 | struct unwind_table *table; | ||
611 | struct unwind_state state; | ||
612 | |||
613 | if (UNW_PC(frame) == 0) | ||
614 | return -EINVAL; | ||
615 | if ((table = find_table(UNW_PC(frame))) != NULL | ||
616 | && !(table->size & (sizeof(*fde) - 1))) { | ||
617 | unsigned long tableSize = table->size; | ||
618 | |||
619 | for (fde = table->address; | ||
620 | tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; | ||
621 | tableSize -= sizeof(*fde) + *fde, | ||
622 | fde += 1 + *fde / sizeof(*fde)) { | ||
623 | if (!*fde || (*fde & (sizeof(*fde) - 1))) | ||
624 | break; | ||
625 | if (!fde[1]) | ||
626 | continue; /* this is a CIE */ | ||
627 | if ((fde[1] & (sizeof(*fde) - 1)) | ||
628 | || fde[1] > (unsigned long)(fde + 1) | ||
629 | - (unsigned long)table->address) | ||
630 | continue; /* this is not a valid FDE */ | ||
631 | cie = fde + 1 - fde[1] / sizeof(*fde); | ||
632 | if (*cie <= sizeof(*cie) + 4 | ||
633 | || *cie >= fde[1] - sizeof(*fde) | ||
634 | || (*cie & (sizeof(*cie) - 1)) | ||
635 | || cie[1] | ||
636 | || (ptrType = fde_pointer_type(cie)) < 0) { | ||
637 | cie = NULL; /* this is not a (valid) CIE */ | ||
638 | continue; | ||
639 | } | ||
640 | ptr = (const u8 *)(fde + 2); | ||
641 | startLoc = read_pointer(&ptr, | ||
642 | (const u8 *)(fde + 1) + *fde, | ||
643 | ptrType); | ||
644 | endLoc = startLoc | ||
645 | + read_pointer(&ptr, | ||
646 | (const u8 *)(fde + 1) + *fde, | ||
647 | ptrType & DW_EH_PE_indirect | ||
648 | ? ptrType | ||
649 | : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); | ||
650 | if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc) | ||
651 | break; | ||
652 | cie = NULL; | ||
653 | } | ||
654 | } | ||
655 | if (cie != NULL) { | ||
656 | memset(&state, 0, sizeof(state)); | ||
657 | state.cieEnd = ptr; /* keep here temporarily */ | ||
658 | ptr = (const u8 *)(cie + 2); | ||
659 | end = (const u8 *)(cie + 1) + *cie; | ||
660 | if ((state.version = *ptr) != 1) | ||
661 | cie = NULL; /* unsupported version */ | ||
662 | else if (*++ptr) { | ||
663 | /* check if augmentation size is first (and thus present) */ | ||
664 | if (*ptr == 'z') { | ||
665 | /* check for ignorable (or already handled) | ||
666 | * nul-terminated augmentation string */ | ||
667 | while (++ptr < end && *ptr) | ||
668 | if (strchr("LPR", *ptr) == NULL) | ||
669 | break; | ||
670 | } | ||
671 | if (ptr >= end || *ptr) | ||
672 | cie = NULL; | ||
673 | } | ||
674 | ++ptr; | ||
675 | } | ||
676 | if (cie != NULL) { | ||
677 | /* get code alignment factor */ | ||
678 | state.codeAlign = get_uleb128(&ptr, end); | ||
679 | /* get data alignment factor */ | ||
680 | state.dataAlign = get_sleb128(&ptr, end); | ||
681 | if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) | ||
682 | cie = NULL; | ||
683 | else { | ||
684 | retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); | ||
685 | /* skip augmentation */ | ||
686 | if (((const char *)(cie + 2))[1] == 'z') | ||
687 | ptr += get_uleb128(&ptr, end); | ||
688 | if (ptr > end | ||
689 | || retAddrReg >= ARRAY_SIZE(reg_info) | ||
690 | || REG_INVALID(retAddrReg) | ||
691 | || reg_info[retAddrReg].width != sizeof(unsigned long)) | ||
692 | cie = NULL; | ||
693 | } | ||
694 | } | ||
695 | if (cie != NULL) { | ||
696 | state.cieStart = ptr; | ||
697 | ptr = state.cieEnd; | ||
698 | state.cieEnd = end; | ||
699 | end = (const u8 *)(fde + 1) + *fde; | ||
700 | /* skip augmentation */ | ||
701 | if (((const char *)(cie + 2))[1] == 'z') { | ||
702 | uleb128_t augSize = get_uleb128(&ptr, end); | ||
703 | |||
704 | if ((ptr += augSize) > end) | ||
705 | fde = NULL; | ||
706 | } | ||
707 | } | ||
708 | if (cie == NULL || fde == NULL) { | ||
709 | #ifdef CONFIG_FRAME_POINTER | ||
710 | unsigned long top, bottom; | ||
711 | #endif | ||
712 | |||
713 | #ifdef CONFIG_FRAME_POINTER | ||
714 | top = STACK_TOP(frame->task); | ||
715 | bottom = STACK_BOTTOM(frame->task); | ||
716 | # if FRAME_RETADDR_OFFSET < 0 | ||
717 | if (UNW_SP(frame) < top | ||
718 | && UNW_FP(frame) <= UNW_SP(frame) | ||
719 | && bottom < UNW_FP(frame) | ||
720 | # else | ||
721 | if (UNW_SP(frame) > top | ||
722 | && UNW_FP(frame) >= UNW_SP(frame) | ||
723 | && bottom > UNW_FP(frame) | ||
724 | # endif | ||
725 | && !((UNW_SP(frame) | UNW_FP(frame)) | ||
726 | & (sizeof(unsigned long) - 1))) { | ||
727 | unsigned long link; | ||
728 | |||
729 | if (!__get_user(link, | ||
730 | (unsigned long *)(UNW_FP(frame) | ||
731 | + FRAME_LINK_OFFSET)) | ||
732 | # if FRAME_RETADDR_OFFSET < 0 | ||
733 | && link > bottom && link < UNW_FP(frame) | ||
734 | # else | ||
735 | && link > UNW_FP(frame) && link < bottom | ||
736 | # endif | ||
737 | && !(link & (sizeof(link) - 1)) | ||
738 | && !__get_user(UNW_PC(frame), | ||
739 | (unsigned long *)(UNW_FP(frame) | ||
740 | + FRAME_RETADDR_OFFSET))) { | ||
741 | UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET | ||
742 | # if FRAME_RETADDR_OFFSET < 0 | ||
743 | - | ||
744 | # else | ||
745 | + | ||
746 | # endif | ||
747 | sizeof(UNW_PC(frame)); | ||
748 | UNW_FP(frame) = link; | ||
749 | return 0; | ||
750 | } | ||
751 | } | ||
752 | #endif | ||
753 | return -ENXIO; | ||
754 | } | ||
755 | state.org = startLoc; | ||
756 | memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); | ||
757 | /* process instructions */ | ||
758 | if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state) | ||
759 | || state.loc > endLoc | ||
760 | || state.regs[retAddrReg].where == Nowhere | ||
761 | || state.cfa.reg >= ARRAY_SIZE(reg_info) | ||
762 | || reg_info[state.cfa.reg].width != sizeof(unsigned long) | ||
763 | || state.cfa.offs % sizeof(unsigned long)) | ||
764 | return -EIO; | ||
765 | /* update frame */ | ||
766 | cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; | ||
767 | startLoc = min((unsigned long)UNW_SP(frame), cfa); | ||
768 | endLoc = max((unsigned long)UNW_SP(frame), cfa); | ||
769 | if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) { | ||
770 | startLoc = min(STACK_LIMIT(cfa), cfa); | ||
771 | endLoc = max(STACK_LIMIT(cfa), cfa); | ||
772 | } | ||
773 | #ifndef CONFIG_64BIT | ||
774 | # define CASES CASE(8); CASE(16); CASE(32) | ||
775 | #else | ||
776 | # define CASES CASE(8); CASE(16); CASE(32); CASE(64) | ||
777 | #endif | ||
778 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | ||
779 | if (REG_INVALID(i)) { | ||
780 | if (state.regs[i].where == Nowhere) | ||
781 | continue; | ||
782 | return -EIO; | ||
783 | } | ||
784 | switch(state.regs[i].where) { | ||
785 | default: | ||
786 | break; | ||
787 | case Register: | ||
788 | if (state.regs[i].value >= ARRAY_SIZE(reg_info) | ||
789 | || REG_INVALID(state.regs[i].value) | ||
790 | || reg_info[i].width > reg_info[state.regs[i].value].width) | ||
791 | return -EIO; | ||
792 | switch(reg_info[state.regs[i].value].width) { | ||
793 | #define CASE(n) \ | ||
794 | case sizeof(u##n): \ | ||
795 | state.regs[i].value = FRAME_REG(state.regs[i].value, \ | ||
796 | const u##n); \ | ||
797 | break | ||
798 | CASES; | ||
799 | #undef CASE | ||
800 | default: | ||
801 | return -EIO; | ||
802 | } | ||
803 | break; | ||
804 | } | ||
805 | } | ||
806 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | ||
807 | if (REG_INVALID(i)) | ||
808 | continue; | ||
809 | switch(state.regs[i].where) { | ||
810 | case Nowhere: | ||
811 | if (reg_info[i].width != sizeof(UNW_SP(frame)) | ||
812 | || &FRAME_REG(i, __typeof__(UNW_SP(frame))) | ||
813 | != &UNW_SP(frame)) | ||
814 | continue; | ||
815 | UNW_SP(frame) = cfa; | ||
816 | break; | ||
817 | case Register: | ||
818 | switch(reg_info[i].width) { | ||
819 | #define CASE(n) case sizeof(u##n): \ | ||
820 | FRAME_REG(i, u##n) = state.regs[i].value; \ | ||
821 | break | ||
822 | CASES; | ||
823 | #undef CASE | ||
824 | default: | ||
825 | return -EIO; | ||
826 | } | ||
827 | break; | ||
828 | case Value: | ||
829 | if (reg_info[i].width != sizeof(unsigned long)) | ||
830 | return -EIO; | ||
831 | FRAME_REG(i, unsigned long) = cfa + state.regs[i].value | ||
832 | * state.dataAlign; | ||
833 | break; | ||
834 | case Memory: { | ||
835 | unsigned long addr = cfa + state.regs[i].value | ||
836 | * state.dataAlign; | ||
837 | |||
838 | if ((state.regs[i].value * state.dataAlign) | ||
839 | % sizeof(unsigned long) | ||
840 | || addr < startLoc | ||
841 | || addr + sizeof(unsigned long) < addr | ||
842 | || addr + sizeof(unsigned long) > endLoc) | ||
843 | return -EIO; | ||
844 | switch(reg_info[i].width) { | ||
845 | #define CASE(n) case sizeof(u##n): \ | ||
846 | __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ | ||
847 | break | ||
848 | CASES; | ||
849 | #undef CASE | ||
850 | default: | ||
851 | return -EIO; | ||
852 | } | ||
853 | } | ||
854 | break; | ||
855 | } | ||
856 | } | ||
857 | |||
858 | return 0; | ||
859 | #undef CASES | ||
860 | #undef FRAME_REG | ||
861 | } | ||
862 | EXPORT_SYMBOL(unwind); | ||
863 | |||
864 | int unwind_init_frame_info(struct unwind_frame_info *info, | ||
865 | struct task_struct *tsk, | ||
866 | /*const*/ struct pt_regs *regs) | ||
867 | { | ||
868 | info->task = tsk; | ||
869 | arch_unw_init_frame_info(info, regs); | ||
870 | |||
871 | return 0; | ||
872 | } | ||
873 | EXPORT_SYMBOL(unwind_init_frame_info); | ||
874 | |||
875 | /* | ||
876 | * Prepare to unwind a blocked task. | ||
877 | */ | ||
878 | int unwind_init_blocked(struct unwind_frame_info *info, | ||
879 | struct task_struct *tsk) | ||
880 | { | ||
881 | info->task = tsk; | ||
882 | arch_unw_init_blocked(info); | ||
883 | |||
884 | return 0; | ||
885 | } | ||
886 | EXPORT_SYMBOL(unwind_init_blocked); | ||
887 | |||
888 | /* | ||
889 | * Prepare to unwind the currently running thread. | ||
890 | */ | ||
891 | int unwind_init_running(struct unwind_frame_info *info, | ||
892 | asmlinkage int (*callback)(struct unwind_frame_info *, | ||
893 | void *arg), | ||
894 | void *arg) | ||
895 | { | ||
896 | info->task = current; | ||
897 | |||
898 | return arch_unwind_init_running(info, callback, arg); | ||
899 | } | ||
900 | EXPORT_SYMBOL(unwind_init_running); | ||
901 | |||
902 | /* | ||
903 | * Unwind until the return pointer is in user-land (or until an error | ||
904 | * occurs). Returns 0 if successful, negative number in case of | ||
905 | * error. | ||
906 | */ | ||
907 | int unwind_to_user(struct unwind_frame_info *info) | ||
908 | { | ||
909 | while (!arch_unw_user_mode(info)) { | ||
910 | int err = unwind(info); | ||
911 | |||
912 | if (err < 0) | ||
913 | return err; | ||
914 | } | ||
915 | |||
916 | return 0; | ||
917 | } | ||
918 | EXPORT_SYMBOL(unwind_to_user); | ||
diff --git a/kernel/user.c b/kernel/user.c index 2116642f42c6..6408c0424291 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid) | |||
140 | atomic_set(&new->processes, 0); | 140 | atomic_set(&new->processes, 0); |
141 | atomic_set(&new->files, 0); | 141 | atomic_set(&new->files, 0); |
142 | atomic_set(&new->sigpending, 0); | 142 | atomic_set(&new->sigpending, 0); |
143 | #ifdef CONFIG_INOTIFY | 143 | #ifdef CONFIG_INOTIFY_USER |
144 | atomic_set(&new->inotify_watches, 0); | 144 | atomic_set(&new->inotify_watches, 0); |
145 | atomic_set(&new->inotify_devs, 0); | 145 | atomic_set(&new->inotify_devs, 0); |
146 | #endif | 146 | #endif |
@@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid) | |||
148 | new->mq_bytes = 0; | 148 | new->mq_bytes = 0; |
149 | new->locked_shm = 0; | 149 | new->locked_shm = 0; |
150 | 150 | ||
151 | if (alloc_uid_keyring(new) < 0) { | 151 | if (alloc_uid_keyring(new, current) < 0) { |
152 | kmem_cache_free(uid_cachep, new); | 152 | kmem_cache_free(uid_cachep, new); |
153 | return NULL; | 153 | return NULL; |
154 | } | 154 | } |
diff --git a/kernel/wait.c b/kernel/wait.c index 791681cfea98..5985d866531f 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -3,7 +3,6 @@ | |||
3 | * | 3 | * |
4 | * (C) 2004 William Irwin, Oracle | 4 | * (C) 2004 William Irwin, Oracle |
5 | */ | 5 | */ |
6 | #include <linux/config.h> | ||
7 | #include <linux/init.h> | 6 | #include <linux/init.h> |
8 | #include <linux/module.h> | 7 | #include <linux/module.h> |
9 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 880fb415a8f6..59f0b42bd89e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu, | |||
428 | return ret; | 428 | return ret; |
429 | } | 429 | } |
430 | 430 | ||
431 | int schedule_on_each_cpu(void (*func) (void *info), void *info) | 431 | /** |
432 | * schedule_on_each_cpu - call a function on each online CPU from keventd | ||
433 | * @func: the function to call | ||
434 | * @info: a pointer to pass to func() | ||
435 | * | ||
436 | * Returns zero on success. | ||
437 | * Returns -ve errno on failure. | ||
438 | * | ||
439 | * Appears to be racy against CPU hotplug. | ||
440 | * | ||
441 | * schedule_on_each_cpu() is very slow. | ||
442 | */ | ||
443 | int schedule_on_each_cpu(void (*func)(void *info), void *info) | ||
432 | { | 444 | { |
433 | int cpu; | 445 | int cpu; |
434 | struct work_struct *work; | 446 | struct work_struct *works; |
435 | 447 | ||
436 | work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); | 448 | works = alloc_percpu(struct work_struct); |
437 | 449 | if (!works) | |
438 | if (!work) | ||
439 | return -ENOMEM; | 450 | return -ENOMEM; |
451 | |||
440 | for_each_online_cpu(cpu) { | 452 | for_each_online_cpu(cpu) { |
441 | INIT_WORK(work + cpu, func, info); | 453 | INIT_WORK(per_cpu_ptr(works, cpu), func, info); |
442 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), | 454 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), |
443 | work + cpu); | 455 | per_cpu_ptr(works, cpu)); |
444 | } | 456 | } |
445 | flush_workqueue(keventd_wq); | 457 | flush_workqueue(keventd_wq); |
446 | kfree(work); | 458 | free_percpu(works); |
447 | return 0; | 459 | return 0; |
448 | } | 460 | } |
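/*
 * Hedged usage sketch for the interface documented above (the callback name
 * is hypothetical): run a function once on every online CPU via keventd and
 * wait for all of the work to finish.
 *
 *	static void drain_local_caches(void *unused)
 *	{
 *		... per-CPU work ...
 *	}
 *
 *	if (schedule_on_each_cpu(drain_local_caches, NULL))
 *		printk(KERN_WARNING "could not schedule per-cpu work\n");
 */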
449 | 461 | ||
@@ -531,11 +543,11 @@ int current_is_keventd(void) | |||
531 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | 543 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) |
532 | { | 544 | { |
533 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 545 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
534 | LIST_HEAD(list); | 546 | struct list_head list; |
535 | struct work_struct *work; | 547 | struct work_struct *work; |
536 | 548 | ||
537 | spin_lock_irq(&cwq->lock); | 549 | spin_lock_irq(&cwq->lock); |
538 | list_splice_init(&cwq->worklist, &list); | 550 | list_replace_init(&cwq->worklist, &list); |
539 | 551 | ||
540 | while (!list_empty(&list)) { | 552 | while (!list_empty(&list)) { |
541 | printk("Taking work for %s\n", wq->name); | 553 | printk("Taking work for %s\n", wq->name); |
@@ -547,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | |||
547 | } | 559 | } |
548 | 560 | ||
549 | /* We're holding the cpucontrol mutex here */ | 561 | /* We're holding the cpucontrol mutex here */ |
550 | static int workqueue_cpu_callback(struct notifier_block *nfb, | 562 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, |
551 | unsigned long action, | 563 | unsigned long action, |
552 | void *hcpu) | 564 | void *hcpu) |
553 | { | 565 | { |
@@ -578,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb, | |||
578 | 590 | ||
579 | case CPU_UP_CANCELED: | 591 | case CPU_UP_CANCELED: |
580 | list_for_each_entry(wq, &workqueues, list) { | 592 | list_for_each_entry(wq, &workqueues, list) { |
593 | if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) | ||
594 | continue; | ||
581 | /* Unbind so it can run. */ | 595 | /* Unbind so it can run. */ |
582 | kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, | 596 | kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, |
583 | any_online_cpu(cpu_online_map)); | 597 | any_online_cpu(cpu_online_map)); |