Diffstat (limited to 'kernel')
47 files changed, 4506 insertions, 1949 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
new file mode 100644
index 000000000000..f2ab70073bd4
--- /dev/null
+++ b/kernel/.gitignore
@@ -0,0 +1,5 @@
+#
+# Generated files
+#
+config_data.h
+config_data.gz
diff --git a/kernel/Makefile b/kernel/Makefile
index 4f5a1453093a..355126606d1b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,8 +7,10 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
-	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o
+	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+	    hrtimer.o
 
+obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
@@ -29,7 +31,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 
diff --git a/kernel/acct.c b/kernel/acct.c
index 6312d6bd43e3..065d8b4e51ef 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -47,6 +47,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/acct.h>
+#include <linux/capability.h>
 #include <linux/file.h>
 #include <linux/tty.h>
 #include <linux/security.h>
@@ -427,6 +428,7 @@ static void do_acct_process(long exitcode, struct file *file)
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
+	unsigned long jiffies;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -467,12 +469,12 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(
-				current->signal->utime +
-				current->group_leader->utime));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(
-				current->signal->stime +
-				current->group_leader->stime));
+	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime,
+						 current->signal->utime));
+	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
+	jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime,
+						 current->signal->stime));
+	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
@@ -580,7 +582,8 @@ void acct_process(long exitcode)
 void acct_update_integrals(struct task_struct *tsk)
 {
 	if (likely(tsk->mm)) {
-		long delta = tsk->stime - tsk->acct_stimexpd;
+		long delta =
+			cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd;
 
 		if (delta == 0)
 			return;
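The acct.c hunks above stop treating task utime/stime as raw jiffies: cputime values are first converted with cputime_to_jiffies() and only then scaled to the fixed AHZ units used in struct acct. A minimal user-space sketch of that two-step scaling follows; the HZ and AHZ values are picked only for illustration, the cputime_to_jiffies() stand-in assumes a configuration where cputime already equals jiffies, and the kernel's encode_comp_t() packing is omitted.

#include <stdio.h>

#define HZ  1000        /* assumed kernel tick rate, illustration only */
#define AHZ 100         /* fixed accounting unit rate used by struct acct */

/* Stand-in: on many configurations cputime is stored as jiffies,
 * so the conversion is the identity. */
static unsigned long cputime_to_jiffies(unsigned long cputime)
{
        return cputime;
}

/* Scale HZ-rate ticks down to AHZ-rate accounting units. */
static unsigned long jiffies_to_AHZ(unsigned long j)
{
        return j / (HZ / AHZ);
}

int main(void)
{
        unsigned long utime = 2500;     /* e.g. 2.5 s of user time in ticks */
        unsigned long jiffies = cputime_to_jiffies(utime);

        printf("ac_utime in AHZ units: %lu\n", jiffies_to_AHZ(jiffies));
        return 0;
}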
diff --git a/kernel/audit.c b/kernel/audit.c
index 32fa03ad1984..d13ab7d2d899 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -267,7 +267,7 @@ static int audit_set_failure(int state, uid_t loginuid)
 	return old;
 }
 
-int kauditd_thread(void *dummy)
+static int kauditd_thread(void *dummy)
 {
 	struct sk_buff *skb;
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 8986a37a67ea..bfa3c92e16f2 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
  * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
  */
 
+#include <linux/capability.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/security.h>
diff --git a/kernel/compat.c b/kernel/compat.c
index 102296e21ea8..256e5d9f0647 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -514,6 +514,24 @@ static int put_compat_itimerspec(struct compat_itimerspec __user *dst,
 	return 0;
 }
 
+long compat_sys_timer_create(clockid_t which_clock,
+			struct compat_sigevent __user *timer_event_spec,
+			timer_t __user *created_timer_id)
+{
+	struct sigevent __user *event = NULL;
+
+	if (timer_event_spec) {
+		struct sigevent kevent;
+
+		event = compat_alloc_user_space(sizeof(*event));
+		if (get_compat_sigevent(&kevent, timer_event_spec) ||
+		    copy_to_user(event, &kevent, sizeof(*event)))
+			return -EFAULT;
+	}
+
+	return sys_timer_create(which_clock, event, created_timer_id);
+}
+
 long compat_sys_timer_settime(timer_t timer_id, int flags,
 			  struct compat_itimerspec __user *new,
 			  struct compat_itimerspec __user *old)
@@ -649,8 +667,6 @@ int get_compat_sigevent(struct sigevent *event,
 		? -EFAULT : 0;
 }
 
-/* timer_create is architecture specific because it needs sigevent conversion */
-
 long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask,
 		       unsigned long bitmap_size)
 {
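The new compat_sys_timer_create() above follows the usual compat-syscall pattern: pull the 32-bit sigevent into a kernel copy with get_compat_sigevent(), place an equivalent native struct sigevent in user-accessible memory via compat_alloc_user_space(), and forward to the native sys_timer_create(). A rough user-space analogue of that widen-then-forward idea is sketched below; the struct and function names here are invented for illustration and are not kernel APIs.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative stand-ins: a narrow "compat" layout and the native
 * layout that the real entry point expects. */
struct compat_event { int32_t signo; int32_t value; };
struct native_event { int signo; long value; };

/* Plays the role of the native syscall: it only understands the
 * native layout. */
static int create_timer(const struct native_event *ev)
{
        printf("signo=%d value=%ld\n", ev->signo, ev->value);
        return 0;
}

/* Plays the role of the compat wrapper: widen each field into the
 * native layout, then call the native entry point. In the kernel the
 * converted struct is pushed back to user space first, because the
 * native syscall takes a __user pointer. */
static int compat_create_timer(const struct compat_event *cev)
{
        struct native_event ev;

        memset(&ev, 0, sizeof(ev));
        ev.signo = cev->signo;
        ev.value = cev->value;
        return create_timer(&ev);
}

int main(void)
{
        struct compat_event cev = { .signo = 10, .value = 42 };

        return compat_create_timer(&cev);
}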
diff --git a/kernel/configs.c b/kernel/configs.c
index 986f7af31e0a..009e1ebdcb88 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -3,7 +3,7 @@
  * Echo the kernel .config file used to build the kernel
  *
  * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com>
- * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org>
+ * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
  * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
  * Copyright (C) 2002 Hewlett-Packard Company
  *
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7430640f9816..2a75e44e1a41 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/namei.h> | 39 | #include <linux/namei.h> |
| 40 | #include <linux/pagemap.h> | 40 | #include <linux/pagemap.h> |
| 41 | #include <linux/proc_fs.h> | 41 | #include <linux/proc_fs.h> |
| 42 | #include <linux/rcupdate.h> | ||
| 42 | #include <linux/sched.h> | 43 | #include <linux/sched.h> |
| 43 | #include <linux/seq_file.h> | 44 | #include <linux/seq_file.h> |
| 44 | #include <linux/slab.h> | 45 | #include <linux/slab.h> |
| @@ -54,7 +55,23 @@ | |||
| 54 | #include <asm/atomic.h> | 55 | #include <asm/atomic.h> |
| 55 | #include <asm/semaphore.h> | 56 | #include <asm/semaphore.h> |
| 56 | 57 | ||
| 57 | #define CPUSET_SUPER_MAGIC 0x27e0eb | 58 | #define CPUSET_SUPER_MAGIC 0x27e0eb |
| 59 | |||
| 60 | /* | ||
| 61 | * Tracks how many cpusets are currently defined in system. | ||
| 62 | * When there is only one cpuset (the root cpuset) we can | ||
| 63 | * short circuit some hooks. | ||
| 64 | */ | ||
| 65 | int number_of_cpusets __read_mostly; | ||
| 66 | |||
| 67 | /* See "Frequency meter" comments, below. */ | ||
| 68 | |||
| 69 | struct fmeter { | ||
| 70 | int cnt; /* unprocessed events count */ | ||
| 71 | int val; /* most recent output value */ | ||
| 72 | time_t time; /* clock (secs) when val computed */ | ||
| 73 | spinlock_t lock; /* guards read or write of above */ | ||
| 74 | }; | ||
| 58 | 75 | ||
| 59 | struct cpuset { | 76 | struct cpuset { |
| 60 | unsigned long flags; /* "unsigned long" so bitops work */ | 77 | unsigned long flags; /* "unsigned long" so bitops work */ |
| @@ -80,13 +97,16 @@ struct cpuset { | |||
| 80 | * Copy of global cpuset_mems_generation as of the most | 97 | * Copy of global cpuset_mems_generation as of the most |
| 81 | * recent time this cpuset changed its mems_allowed. | 98 | * recent time this cpuset changed its mems_allowed. |
| 82 | */ | 99 | */ |
| 83 | int mems_generation; | 100 | int mems_generation; |
| 101 | |||
| 102 | struct fmeter fmeter; /* memory_pressure filter */ | ||
| 84 | }; | 103 | }; |
| 85 | 104 | ||
| 86 | /* bits in struct cpuset flags field */ | 105 | /* bits in struct cpuset flags field */ |
| 87 | typedef enum { | 106 | typedef enum { |
| 88 | CS_CPU_EXCLUSIVE, | 107 | CS_CPU_EXCLUSIVE, |
| 89 | CS_MEM_EXCLUSIVE, | 108 | CS_MEM_EXCLUSIVE, |
| 109 | CS_MEMORY_MIGRATE, | ||
| 90 | CS_REMOVED, | 110 | CS_REMOVED, |
| 91 | CS_NOTIFY_ON_RELEASE | 111 | CS_NOTIFY_ON_RELEASE |
| 92 | } cpuset_flagbits_t; | 112 | } cpuset_flagbits_t; |
| @@ -112,6 +132,11 @@ static inline int notify_on_release(const struct cpuset *cs) | |||
| 112 | return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 132 | return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
| 113 | } | 133 | } |
| 114 | 134 | ||
| 135 | static inline int is_memory_migrate(const struct cpuset *cs) | ||
| 136 | { | ||
| 137 | return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); | ||
| 138 | } | ||
| 139 | |||
| 115 | /* | 140 | /* |
| 116 | * Increment this atomic integer everytime any cpuset changes its | 141 | * Increment this atomic integer everytime any cpuset changes its |
| 117 | * mems_allowed value. Users of cpusets can track this generation | 142 | * mems_allowed value. Users of cpusets can track this generation |
| @@ -137,13 +162,10 @@ static struct cpuset top_cpuset = { | |||
| 137 | .count = ATOMIC_INIT(0), | 162 | .count = ATOMIC_INIT(0), |
| 138 | .sibling = LIST_HEAD_INIT(top_cpuset.sibling), | 163 | .sibling = LIST_HEAD_INIT(top_cpuset.sibling), |
| 139 | .children = LIST_HEAD_INIT(top_cpuset.children), | 164 | .children = LIST_HEAD_INIT(top_cpuset.children), |
| 140 | .parent = NULL, | ||
| 141 | .dentry = NULL, | ||
| 142 | .mems_generation = 0, | ||
| 143 | }; | 165 | }; |
| 144 | 166 | ||
| 145 | static struct vfsmount *cpuset_mount; | 167 | static struct vfsmount *cpuset_mount; |
| 146 | static struct super_block *cpuset_sb = NULL; | 168 | static struct super_block *cpuset_sb; |
| 147 | 169 | ||
| 148 | /* | 170 | /* |
| 149 | * We have two global cpuset semaphores below. They can nest. | 171 | * We have two global cpuset semaphores below. They can nest. |
| @@ -227,6 +249,11 @@ static struct super_block *cpuset_sb = NULL; | |||
| 227 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock | 249 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock |
| 228 | * (task->alloc_lock) already in the task_struct routinely used for | 250 | * (task->alloc_lock) already in the task_struct routinely used for |
| 229 | * such matters. | 251 | * such matters. |
| 252 | * | ||
| 253 | * P.S. One more locking exception. RCU is used to guard the | ||
| 254 | * update of a tasks cpuset pointer by attach_task() and the | ||
| 255 | * access of task->cpuset->mems_generation via that pointer in | ||
| 256 | * the routine cpuset_update_task_memory_state(). | ||
| 230 | */ | 257 | */ |
| 231 | 258 | ||
| 232 | static DECLARE_MUTEX(manage_sem); | 259 | static DECLARE_MUTEX(manage_sem); |
| @@ -304,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) | |||
| 304 | spin_lock(&dcache_lock); | 331 | spin_lock(&dcache_lock); |
| 305 | node = dentry->d_subdirs.next; | 332 | node = dentry->d_subdirs.next; |
| 306 | while (node != &dentry->d_subdirs) { | 333 | while (node != &dentry->d_subdirs) { |
| 307 | struct dentry *d = list_entry(node, struct dentry, d_child); | 334 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); |
| 308 | list_del_init(node); | 335 | list_del_init(node); |
| 309 | if (d->d_inode) { | 336 | if (d->d_inode) { |
| 310 | d = dget_locked(d); | 337 | d = dget_locked(d); |
| @@ -316,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) | |||
| 316 | } | 343 | } |
| 317 | node = dentry->d_subdirs.next; | 344 | node = dentry->d_subdirs.next; |
| 318 | } | 345 | } |
| 319 | list_del_init(&dentry->d_child); | 346 | list_del_init(&dentry->d_u.d_child); |
| 320 | spin_unlock(&dcache_lock); | 347 | spin_unlock(&dcache_lock); |
| 321 | remove_dir(dentry); | 348 | remove_dir(dentry); |
| 322 | } | 349 | } |
| @@ -570,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
| 570 | BUG_ON(!nodes_intersects(*pmask, node_online_map)); | 597 | BUG_ON(!nodes_intersects(*pmask, node_online_map)); |
| 571 | } | 598 | } |
| 572 | 599 | ||
| 573 | /* | 600 | /** |
| 574 | * Refresh current tasks mems_allowed and mems_generation from current | 601 | * cpuset_update_task_memory_state - update task memory placement |
| 575 | * tasks cpuset. | ||
| 576 | * | 602 | * |
| 577 | * Call without callback_sem or task_lock() held. May be called with | 603 | * If the current tasks cpusets mems_allowed changed behind our |
| 578 | * or without manage_sem held. Will acquire task_lock() and might | 604 | * backs, update current->mems_allowed, mems_generation and task NUMA |
| 579 | * acquire callback_sem during call. | 605 | * mempolicy to the new value. |
| 606 | * | ||
| 607 | * Task mempolicy is updated by rebinding it relative to the | ||
| 608 | * current->cpuset if a task has its memory placement changed. | ||
| 609 | * Do not call this routine if in_interrupt(). | ||
| 580 | * | 610 | * |
| 581 | * The task_lock() is required to dereference current->cpuset safely. | 611 | * Call without callback_sem or task_lock() held. May be called |
| 582 | * Without it, we could pick up the pointer value of current->cpuset | 612 | * with or without manage_sem held. Doesn't need task_lock to guard |
| 583 | * in one instruction, and then attach_task could give us a different | 613 | * against another task changing a non-NULL cpuset pointer to NULL, |
| 584 | * cpuset, and then the cpuset we had could be removed and freed, | 614 | * as that is only done by a task on itself, and if the current task |
| 585 | * and then on our next instruction, we could dereference a no longer | 615 | * is here, it is not simultaneously in the exit code NULL'ing its |
| 586 | * valid cpuset pointer to get its mems_generation field. | 616 | * cpuset pointer. This routine also might acquire callback_sem and |
| 617 | * current->mm->mmap_sem during call. | ||
| 618 | * | ||
| 619 | * Reading current->cpuset->mems_generation doesn't need task_lock | ||
| 620 | * to guard the current->cpuset derefence, because it is guarded | ||
| 621 | * from concurrent freeing of current->cpuset by attach_task(), | ||
| 622 | * using RCU. | ||
| 623 | * | ||
| 624 | * The rcu_dereference() is technically probably not needed, | ||
| 625 | * as I don't actually mind if I see a new cpuset pointer but | ||
| 626 | * an old value of mems_generation. However this really only | ||
| 627 | * matters on alpha systems using cpusets heavily. If I dropped | ||
| 628 | * that rcu_dereference(), it would save them a memory barrier. | ||
| 629 | * For all other arch's, rcu_dereference is a no-op anyway, and for | ||
| 630 | * alpha systems not using cpusets, another planned optimization, | ||
| 631 | * avoiding the rcu critical section for tasks in the root cpuset | ||
| 632 | * which is statically allocated, so can't vanish, will make this | ||
| 633 | * irrelevant. Better to use RCU as intended, than to engage in | ||
| 634 | * some cute trick to save a memory barrier that is impossible to | ||
| 635 | * test, for alpha systems using cpusets heavily, which might not | ||
| 636 | * even exist. | ||
| 587 | * | 637 | * |
| 588 | * This routine is needed to update the per-task mems_allowed data, | 638 | * This routine is needed to update the per-task mems_allowed data, |
| 589 | * within the tasks context, when it is trying to allocate memory | 639 | * within the tasks context, when it is trying to allocate memory |
| @@ -591,27 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
| 591 | * task has been modifying its cpuset. | 641 | * task has been modifying its cpuset. |
| 592 | */ | 642 | */ |
| 593 | 643 | ||
| 594 | static void refresh_mems(void) | 644 | void cpuset_update_task_memory_state() |
| 595 | { | 645 | { |
| 596 | int my_cpusets_mem_gen; | 646 | int my_cpusets_mem_gen; |
| 647 | struct task_struct *tsk = current; | ||
| 648 | struct cpuset *cs; | ||
| 597 | 649 | ||
| 598 | task_lock(current); | 650 | if (tsk->cpuset == &top_cpuset) { |
| 599 | my_cpusets_mem_gen = current->cpuset->mems_generation; | 651 | /* Don't need rcu for top_cpuset. It's never freed. */ |
| 600 | task_unlock(current); | 652 | my_cpusets_mem_gen = top_cpuset.mems_generation; |
| 601 | 653 | } else { | |
| 602 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { | 654 | rcu_read_lock(); |
| 603 | struct cpuset *cs; | 655 | cs = rcu_dereference(tsk->cpuset); |
| 604 | nodemask_t oldmem = current->mems_allowed; | 656 | my_cpusets_mem_gen = cs->mems_generation; |
| 657 | rcu_read_unlock(); | ||
| 658 | } | ||
| 605 | 659 | ||
| 660 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { | ||
| 606 | down(&callback_sem); | 661 | down(&callback_sem); |
| 607 | task_lock(current); | 662 | task_lock(tsk); |
| 608 | cs = current->cpuset; | 663 | cs = tsk->cpuset; /* Maybe changed when task not locked */ |
| 609 | guarantee_online_mems(cs, ¤t->mems_allowed); | 664 | guarantee_online_mems(cs, &tsk->mems_allowed); |
| 610 | current->cpuset_mems_generation = cs->mems_generation; | 665 | tsk->cpuset_mems_generation = cs->mems_generation; |
| 611 | task_unlock(current); | 666 | task_unlock(tsk); |
| 612 | up(&callback_sem); | 667 | up(&callback_sem); |
| 613 | if (!nodes_equal(oldmem, current->mems_allowed)) | 668 | mpol_rebind_task(tsk, &tsk->mems_allowed); |
| 614 | numa_policy_rebind(&oldmem, ¤t->mems_allowed); | ||
| 615 | } | 669 | } |
| 616 | } | 670 | } |
| 617 | 671 | ||
| @@ -766,36 +820,150 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 766 | } | 820 | } |
| 767 | 821 | ||
| 768 | /* | 822 | /* |
| 823 | * Handle user request to change the 'mems' memory placement | ||
| 824 | * of a cpuset. Needs to validate the request, update the | ||
| 825 | * cpusets mems_allowed and mems_generation, and for each | ||
| 826 | * task in the cpuset, rebind any vma mempolicies and if | ||
| 827 | * the cpuset is marked 'memory_migrate', migrate the tasks | ||
| 828 | * pages to the new memory. | ||
| 829 | * | ||
| 769 | * Call with manage_sem held. May take callback_sem during call. | 830 | * Call with manage_sem held. May take callback_sem during call. |
| 831 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | ||
| 832 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | ||
| 833 | * their mempolicies to the cpusets new mems_allowed. | ||
| 770 | */ | 834 | */ |
| 771 | 835 | ||
| 772 | static int update_nodemask(struct cpuset *cs, char *buf) | 836 | static int update_nodemask(struct cpuset *cs, char *buf) |
| 773 | { | 837 | { |
| 774 | struct cpuset trialcs; | 838 | struct cpuset trialcs; |
| 839 | nodemask_t oldmem; | ||
| 840 | struct task_struct *g, *p; | ||
| 841 | struct mm_struct **mmarray; | ||
| 842 | int i, n, ntasks; | ||
| 843 | int migrate; | ||
| 844 | int fudge; | ||
| 775 | int retval; | 845 | int retval; |
| 776 | 846 | ||
| 777 | trialcs = *cs; | 847 | trialcs = *cs; |
| 778 | retval = nodelist_parse(buf, trialcs.mems_allowed); | 848 | retval = nodelist_parse(buf, trialcs.mems_allowed); |
| 779 | if (retval < 0) | 849 | if (retval < 0) |
| 780 | return retval; | 850 | goto done; |
| 781 | nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); | 851 | nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); |
| 782 | if (nodes_empty(trialcs.mems_allowed)) | 852 | oldmem = cs->mems_allowed; |
| 783 | return -ENOSPC; | 853 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { |
| 854 | retval = 0; /* Too easy - nothing to do */ | ||
| 855 | goto done; | ||
| 856 | } | ||
| 857 | if (nodes_empty(trialcs.mems_allowed)) { | ||
| 858 | retval = -ENOSPC; | ||
| 859 | goto done; | ||
| 860 | } | ||
| 784 | retval = validate_change(cs, &trialcs); | 861 | retval = validate_change(cs, &trialcs); |
| 785 | if (retval == 0) { | 862 | if (retval < 0) |
| 786 | down(&callback_sem); | 863 | goto done; |
| 787 | cs->mems_allowed = trialcs.mems_allowed; | 864 | |
| 788 | atomic_inc(&cpuset_mems_generation); | 865 | down(&callback_sem); |
| 789 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 866 | cs->mems_allowed = trialcs.mems_allowed; |
| 790 | up(&callback_sem); | 867 | atomic_inc(&cpuset_mems_generation); |
| 868 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | ||
| 869 | up(&callback_sem); | ||
| 870 | |||
| 871 | set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ | ||
| 872 | |||
| 873 | fudge = 10; /* spare mmarray[] slots */ | ||
| 874 | fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ | ||
| 875 | retval = -ENOMEM; | ||
| 876 | |||
| 877 | /* | ||
| 878 | * Allocate mmarray[] to hold mm reference for each task | ||
| 879 | * in cpuset cs. Can't kmalloc GFP_KERNEL while holding | ||
| 880 | * tasklist_lock. We could use GFP_ATOMIC, but with a | ||
| 881 | * few more lines of code, we can retry until we get a big | ||
| 882 | * enough mmarray[] w/o using GFP_ATOMIC. | ||
| 883 | */ | ||
| 884 | while (1) { | ||
| 885 | ntasks = atomic_read(&cs->count); /* guess */ | ||
| 886 | ntasks += fudge; | ||
| 887 | mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); | ||
| 888 | if (!mmarray) | ||
| 889 | goto done; | ||
| 890 | write_lock_irq(&tasklist_lock); /* block fork */ | ||
| 891 | if (atomic_read(&cs->count) <= ntasks) | ||
| 892 | break; /* got enough */ | ||
| 893 | write_unlock_irq(&tasklist_lock); /* try again */ | ||
| 894 | kfree(mmarray); | ||
| 791 | } | 895 | } |
| 896 | |||
| 897 | n = 0; | ||
| 898 | |||
| 899 | /* Load up mmarray[] with mm reference for each task in cpuset. */ | ||
| 900 | do_each_thread(g, p) { | ||
| 901 | struct mm_struct *mm; | ||
| 902 | |||
| 903 | if (n >= ntasks) { | ||
| 904 | printk(KERN_WARNING | ||
| 905 | "Cpuset mempolicy rebind incomplete.\n"); | ||
| 906 | continue; | ||
| 907 | } | ||
| 908 | if (p->cpuset != cs) | ||
| 909 | continue; | ||
| 910 | mm = get_task_mm(p); | ||
| 911 | if (!mm) | ||
| 912 | continue; | ||
| 913 | mmarray[n++] = mm; | ||
| 914 | } while_each_thread(g, p); | ||
| 915 | write_unlock_irq(&tasklist_lock); | ||
| 916 | |||
| 917 | /* | ||
| 918 | * Now that we've dropped the tasklist spinlock, we can | ||
| 919 | * rebind the vma mempolicies of each mm in mmarray[] to their | ||
| 920 | * new cpuset, and release that mm. The mpol_rebind_mm() | ||
| 921 | * call takes mmap_sem, which we couldn't take while holding | ||
| 922 | * tasklist_lock. Forks can happen again now - the mpol_copy() | ||
| 923 | * cpuset_being_rebound check will catch such forks, and rebind | ||
| 924 | * their vma mempolicies too. Because we still hold the global | ||
| 925 | * cpuset manage_sem, we know that no other rebind effort will | ||
| 926 | * be contending for the global variable cpuset_being_rebound. | ||
| 927 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | ||
| 928 | * is idempotent. Also migrate pages in each mm to new nodes. | ||
| 929 | */ | ||
| 930 | migrate = is_memory_migrate(cs); | ||
| 931 | for (i = 0; i < n; i++) { | ||
| 932 | struct mm_struct *mm = mmarray[i]; | ||
| 933 | |||
| 934 | mpol_rebind_mm(mm, &cs->mems_allowed); | ||
| 935 | if (migrate) { | ||
| 936 | do_migrate_pages(mm, &oldmem, &cs->mems_allowed, | ||
| 937 | MPOL_MF_MOVE_ALL); | ||
| 938 | } | ||
| 939 | mmput(mm); | ||
| 940 | } | ||
| 941 | |||
| 942 | /* We're done rebinding vma's to this cpusets new mems_allowed. */ | ||
| 943 | kfree(mmarray); | ||
| 944 | set_cpuset_being_rebound(NULL); | ||
| 945 | retval = 0; | ||
| 946 | done: | ||
| 792 | return retval; | 947 | return retval; |
| 793 | } | 948 | } |
| 794 | 949 | ||
| 795 | /* | 950 | /* |
| 951 | * Call with manage_sem held. | ||
| 952 | */ | ||
| 953 | |||
| 954 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | ||
| 955 | { | ||
| 956 | if (simple_strtoul(buf, NULL, 10) != 0) | ||
| 957 | cpuset_memory_pressure_enabled = 1; | ||
| 958 | else | ||
| 959 | cpuset_memory_pressure_enabled = 0; | ||
| 960 | return 0; | ||
| 961 | } | ||
| 962 | |||
| 963 | /* | ||
| 796 | * update_flag - read a 0 or a 1 in a file and update associated flag | 964 | * update_flag - read a 0 or a 1 in a file and update associated flag |
| 797 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 965 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
| 798 | * CS_NOTIFY_ON_RELEASE) | 966 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) |
| 799 | * cs: the cpuset to update | 967 | * cs: the cpuset to update |
| 800 | * buf: the buffer where we read the 0 or 1 | 968 | * buf: the buffer where we read the 0 or 1 |
| 801 | * | 969 | * |
| @@ -834,6 +1002,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
| 834 | } | 1002 | } |
| 835 | 1003 | ||
| 836 | /* | 1004 | /* |
| 1005 | * Frequency meter - How fast is some event occuring? | ||
| 1006 | * | ||
| 1007 | * These routines manage a digitally filtered, constant time based, | ||
| 1008 | * event frequency meter. There are four routines: | ||
| 1009 | * fmeter_init() - initialize a frequency meter. | ||
| 1010 | * fmeter_markevent() - called each time the event happens. | ||
| 1011 | * fmeter_getrate() - returns the recent rate of such events. | ||
| 1012 | * fmeter_update() - internal routine used to update fmeter. | ||
| 1013 | * | ||
| 1014 | * A common data structure is passed to each of these routines, | ||
| 1015 | * which is used to keep track of the state required to manage the | ||
| 1016 | * frequency meter and its digital filter. | ||
| 1017 | * | ||
| 1018 | * The filter works on the number of events marked per unit time. | ||
| 1019 | * The filter is single-pole low-pass recursive (IIR). The time unit | ||
| 1020 | * is 1 second. Arithmetic is done using 32-bit integers scaled to | ||
| 1021 | * simulate 3 decimal digits of precision (multiplied by 1000). | ||
| 1022 | * | ||
| 1023 | * With an FM_COEF of 933, and a time base of 1 second, the filter | ||
| 1024 | * has a half-life of 10 seconds, meaning that if the events quit | ||
| 1025 | * happening, then the rate returned from the fmeter_getrate() | ||
| 1026 | * will be cut in half each 10 seconds, until it converges to zero. | ||
| 1027 | * | ||
| 1028 | * It is not worth doing a real infinitely recursive filter. If more | ||
| 1029 | * than FM_MAXTICKS ticks have elapsed since the last filter event, | ||
| 1030 | * just compute FM_MAXTICKS ticks worth, by which point the level | ||
| 1031 | * will be stable. | ||
| 1032 | * | ||
| 1033 | * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid | ||
| 1034 | * arithmetic overflow in the fmeter_update() routine. | ||
| 1035 | * | ||
| 1036 | * Given the simple 32 bit integer arithmetic used, this meter works | ||
| 1037 | * best for reporting rates between one per millisecond (msec) and | ||
| 1038 | * one per 32 (approx) seconds. At constant rates faster than one | ||
| 1039 | * per msec it maxes out at values just under 1,000,000. At constant | ||
| 1040 | * rates between one per msec, and one per second it will stabilize | ||
| 1041 | * to a value N*1000, where N is the rate of events per second. | ||
| 1042 | * At constant rates between one per second and one per 32 seconds, | ||
| 1043 | * it will be choppy, moving up on the seconds that have an event, | ||
| 1044 | * and then decaying until the next event. At rates slower than | ||
| 1045 | * about one in 32 seconds, it decays all the way back to zero between | ||
| 1046 | * each event. | ||
| 1047 | */ | ||
| 1048 | |||
| 1049 | #define FM_COEF 933 /* coefficient for half-life of 10 secs */ | ||
| 1050 | #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ | ||
| 1051 | #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ | ||
| 1052 | #define FM_SCALE 1000 /* faux fixed point scale */ | ||
| 1053 | |||
| 1054 | /* Initialize a frequency meter */ | ||
| 1055 | static void fmeter_init(struct fmeter *fmp) | ||
| 1056 | { | ||
| 1057 | fmp->cnt = 0; | ||
| 1058 | fmp->val = 0; | ||
| 1059 | fmp->time = 0; | ||
| 1060 | spin_lock_init(&fmp->lock); | ||
| 1061 | } | ||
| 1062 | |||
| 1063 | /* Internal meter update - process cnt events and update value */ | ||
| 1064 | static void fmeter_update(struct fmeter *fmp) | ||
| 1065 | { | ||
| 1066 | time_t now = get_seconds(); | ||
| 1067 | time_t ticks = now - fmp->time; | ||
| 1068 | |||
| 1069 | if (ticks == 0) | ||
| 1070 | return; | ||
| 1071 | |||
| 1072 | ticks = min(FM_MAXTICKS, ticks); | ||
| 1073 | while (ticks-- > 0) | ||
| 1074 | fmp->val = (FM_COEF * fmp->val) / FM_SCALE; | ||
| 1075 | fmp->time = now; | ||
| 1076 | |||
| 1077 | fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; | ||
| 1078 | fmp->cnt = 0; | ||
| 1079 | } | ||
| 1080 | |||
| 1081 | /* Process any previous ticks, then bump cnt by one (times scale). */ | ||
| 1082 | static void fmeter_markevent(struct fmeter *fmp) | ||
| 1083 | { | ||
| 1084 | spin_lock(&fmp->lock); | ||
| 1085 | fmeter_update(fmp); | ||
| 1086 | fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); | ||
| 1087 | spin_unlock(&fmp->lock); | ||
| 1088 | } | ||
| 1089 | |||
| 1090 | /* Process any previous ticks, then return current value. */ | ||
| 1091 | static int fmeter_getrate(struct fmeter *fmp) | ||
| 1092 | { | ||
| 1093 | int val; | ||
| 1094 | |||
| 1095 | spin_lock(&fmp->lock); | ||
| 1096 | fmeter_update(fmp); | ||
| 1097 | val = fmp->val; | ||
| 1098 | spin_unlock(&fmp->lock); | ||
| 1099 | return val; | ||
| 1100 | } | ||
| 1101 | |||
| 1102 | /* | ||
| 837 | * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly | 1103 | * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly |
| 838 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | 1104 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be |
| 839 | * notified on release. | 1105 | * notified on release. |
| @@ -848,6 +1114,8 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 848 | struct task_struct *tsk; | 1114 | struct task_struct *tsk; |
| 849 | struct cpuset *oldcs; | 1115 | struct cpuset *oldcs; |
| 850 | cpumask_t cpus; | 1116 | cpumask_t cpus; |
| 1117 | nodemask_t from, to; | ||
| 1118 | struct mm_struct *mm; | ||
| 851 | 1119 | ||
| 852 | if (sscanf(pidbuf, "%d", &pid) != 1) | 1120 | if (sscanf(pidbuf, "%d", &pid) != 1) |
| 853 | return -EIO; | 1121 | return -EIO; |
| @@ -887,14 +1155,27 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 887 | return -ESRCH; | 1155 | return -ESRCH; |
| 888 | } | 1156 | } |
| 889 | atomic_inc(&cs->count); | 1157 | atomic_inc(&cs->count); |
| 890 | tsk->cpuset = cs; | 1158 | rcu_assign_pointer(tsk->cpuset, cs); |
| 891 | task_unlock(tsk); | 1159 | task_unlock(tsk); |
| 892 | 1160 | ||
| 893 | guarantee_online_cpus(cs, &cpus); | 1161 | guarantee_online_cpus(cs, &cpus); |
| 894 | set_cpus_allowed(tsk, cpus); | 1162 | set_cpus_allowed(tsk, cpus); |
| 895 | 1163 | ||
| 1164 | from = oldcs->mems_allowed; | ||
| 1165 | to = cs->mems_allowed; | ||
| 1166 | |||
| 896 | up(&callback_sem); | 1167 | up(&callback_sem); |
| 1168 | |||
| 1169 | mm = get_task_mm(tsk); | ||
| 1170 | if (mm) { | ||
| 1171 | mpol_rebind_mm(mm, &to); | ||
| 1172 | mmput(mm); | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | if (is_memory_migrate(cs)) | ||
| 1176 | do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); | ||
| 897 | put_task_struct(tsk); | 1177 | put_task_struct(tsk); |
| 1178 | synchronize_rcu(); | ||
| 898 | if (atomic_dec_and_test(&oldcs->count)) | 1179 | if (atomic_dec_and_test(&oldcs->count)) |
| 899 | check_for_release(oldcs, ppathbuf); | 1180 | check_for_release(oldcs, ppathbuf); |
| 900 | return 0; | 1181 | return 0; |
| @@ -905,11 +1186,14 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
| 905 | typedef enum { | 1186 | typedef enum { |
| 906 | FILE_ROOT, | 1187 | FILE_ROOT, |
| 907 | FILE_DIR, | 1188 | FILE_DIR, |
| 1189 | FILE_MEMORY_MIGRATE, | ||
| 908 | FILE_CPULIST, | 1190 | FILE_CPULIST, |
| 909 | FILE_MEMLIST, | 1191 | FILE_MEMLIST, |
| 910 | FILE_CPU_EXCLUSIVE, | 1192 | FILE_CPU_EXCLUSIVE, |
| 911 | FILE_MEM_EXCLUSIVE, | 1193 | FILE_MEM_EXCLUSIVE, |
| 912 | FILE_NOTIFY_ON_RELEASE, | 1194 | FILE_NOTIFY_ON_RELEASE, |
| 1195 | FILE_MEMORY_PRESSURE_ENABLED, | ||
| 1196 | FILE_MEMORY_PRESSURE, | ||
| 913 | FILE_TASKLIST, | 1197 | FILE_TASKLIST, |
| 914 | } cpuset_filetype_t; | 1198 | } cpuset_filetype_t; |
| 915 | 1199 | ||
| @@ -960,6 +1244,15 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
| 960 | case FILE_NOTIFY_ON_RELEASE: | 1244 | case FILE_NOTIFY_ON_RELEASE: |
| 961 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); | 1245 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); |
| 962 | break; | 1246 | break; |
| 1247 | case FILE_MEMORY_MIGRATE: | ||
| 1248 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); | ||
| 1249 | break; | ||
| 1250 | case FILE_MEMORY_PRESSURE_ENABLED: | ||
| 1251 | retval = update_memory_pressure_enabled(cs, buffer); | ||
| 1252 | break; | ||
| 1253 | case FILE_MEMORY_PRESSURE: | ||
| 1254 | retval = -EACCES; | ||
| 1255 | break; | ||
| 963 | case FILE_TASKLIST: | 1256 | case FILE_TASKLIST: |
| 964 | retval = attach_task(cs, buffer, &pathbuf); | 1257 | retval = attach_task(cs, buffer, &pathbuf); |
| 965 | break; | 1258 | break; |
| @@ -1060,6 +1353,15 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
| 1060 | case FILE_NOTIFY_ON_RELEASE: | 1353 | case FILE_NOTIFY_ON_RELEASE: |
| 1061 | *s++ = notify_on_release(cs) ? '1' : '0'; | 1354 | *s++ = notify_on_release(cs) ? '1' : '0'; |
| 1062 | break; | 1355 | break; |
| 1356 | case FILE_MEMORY_MIGRATE: | ||
| 1357 | *s++ = is_memory_migrate(cs) ? '1' : '0'; | ||
| 1358 | break; | ||
| 1359 | case FILE_MEMORY_PRESSURE_ENABLED: | ||
| 1360 | *s++ = cpuset_memory_pressure_enabled ? '1' : '0'; | ||
| 1361 | break; | ||
| 1362 | case FILE_MEMORY_PRESSURE: | ||
| 1363 | s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); | ||
| 1364 | break; | ||
| 1063 | default: | 1365 | default: |
| 1064 | retval = -EINVAL; | 1366 | retval = -EINVAL; |
| 1065 | goto out; | 1367 | goto out; |
| @@ -1178,7 +1480,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) | |||
| 1178 | 1480 | ||
| 1179 | /* | 1481 | /* |
| 1180 | * cpuset_create_dir - create a directory for an object. | 1482 | * cpuset_create_dir - create a directory for an object. |
| 1181 | * cs: the cpuset we create the directory for. | 1483 | * cs: the cpuset we create the directory for. |
| 1182 | * It must have a valid ->parent field | 1484 | * It must have a valid ->parent field |
| 1183 | * And we are going to fill its ->dentry field. | 1485 | * And we are going to fill its ->dentry field. |
| 1184 | * name: The name to give to the cpuset directory. Will be copied. | 1486 | * name: The name to give to the cpuset directory. Will be copied. |
| @@ -1211,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) | |||
| 1211 | struct dentry *dentry; | 1513 | struct dentry *dentry; |
| 1212 | int error; | 1514 | int error; |
| 1213 | 1515 | ||
| 1214 | down(&dir->d_inode->i_sem); | 1516 | mutex_lock(&dir->d_inode->i_mutex); |
| 1215 | dentry = cpuset_get_dentry(dir, cft->name); | 1517 | dentry = cpuset_get_dentry(dir, cft->name); |
| 1216 | if (!IS_ERR(dentry)) { | 1518 | if (!IS_ERR(dentry)) { |
| 1217 | error = cpuset_create_file(dentry, 0644 | S_IFREG); | 1519 | error = cpuset_create_file(dentry, 0644 | S_IFREG); |
| @@ -1220,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) | |||
| 1220 | dput(dentry); | 1522 | dput(dentry); |
| 1221 | } else | 1523 | } else |
| 1222 | error = PTR_ERR(dentry); | 1524 | error = PTR_ERR(dentry); |
| 1223 | up(&dir->d_inode->i_sem); | 1525 | mutex_unlock(&dir->d_inode->i_mutex); |
| 1224 | return error; | 1526 | return error; |
| 1225 | } | 1527 | } |
| 1226 | 1528 | ||
| @@ -1408,6 +1710,21 @@ static struct cftype cft_notify_on_release = { | |||
| 1408 | .private = FILE_NOTIFY_ON_RELEASE, | 1710 | .private = FILE_NOTIFY_ON_RELEASE, |
| 1409 | }; | 1711 | }; |
| 1410 | 1712 | ||
| 1713 | static struct cftype cft_memory_migrate = { | ||
| 1714 | .name = "memory_migrate", | ||
| 1715 | .private = FILE_MEMORY_MIGRATE, | ||
| 1716 | }; | ||
| 1717 | |||
| 1718 | static struct cftype cft_memory_pressure_enabled = { | ||
| 1719 | .name = "memory_pressure_enabled", | ||
| 1720 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
| 1721 | }; | ||
| 1722 | |||
| 1723 | static struct cftype cft_memory_pressure = { | ||
| 1724 | .name = "memory_pressure", | ||
| 1725 | .private = FILE_MEMORY_PRESSURE, | ||
| 1726 | }; | ||
| 1727 | |||
| 1411 | static int cpuset_populate_dir(struct dentry *cs_dentry) | 1728 | static int cpuset_populate_dir(struct dentry *cs_dentry) |
| 1412 | { | 1729 | { |
| 1413 | int err; | 1730 | int err; |
| @@ -1422,6 +1739,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) | |||
| 1422 | return err; | 1739 | return err; |
| 1423 | if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) | 1740 | if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) |
| 1424 | return err; | 1741 | return err; |
| 1742 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) | ||
| 1743 | return err; | ||
| 1744 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) | ||
| 1745 | return err; | ||
| 1425 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) | 1746 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) |
| 1426 | return err; | 1747 | return err; |
| 1427 | return 0; | 1748 | return 0; |
| @@ -1446,7 +1767,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1446 | return -ENOMEM; | 1767 | return -ENOMEM; |
| 1447 | 1768 | ||
| 1448 | down(&manage_sem); | 1769 | down(&manage_sem); |
| 1449 | refresh_mems(); | 1770 | cpuset_update_task_memory_state(); |
| 1450 | cs->flags = 0; | 1771 | cs->flags = 0; |
| 1451 | if (notify_on_release(parent)) | 1772 | if (notify_on_release(parent)) |
| 1452 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1773 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
| @@ -1457,11 +1778,13 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1457 | INIT_LIST_HEAD(&cs->children); | 1778 | INIT_LIST_HEAD(&cs->children); |
| 1458 | atomic_inc(&cpuset_mems_generation); | 1779 | atomic_inc(&cpuset_mems_generation); |
| 1459 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 1780 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
| 1781 | fmeter_init(&cs->fmeter); | ||
| 1460 | 1782 | ||
| 1461 | cs->parent = parent; | 1783 | cs->parent = parent; |
| 1462 | 1784 | ||
| 1463 | down(&callback_sem); | 1785 | down(&callback_sem); |
| 1464 | list_add(&cs->sibling, &cs->parent->children); | 1786 | list_add(&cs->sibling, &cs->parent->children); |
| 1787 | number_of_cpusets++; | ||
| 1465 | up(&callback_sem); | 1788 | up(&callback_sem); |
| 1466 | 1789 | ||
| 1467 | err = cpuset_create_dir(cs, name, mode); | 1790 | err = cpuset_create_dir(cs, name, mode); |
| @@ -1470,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
| 1470 | 1793 | ||
| 1471 | /* | 1794 | /* |
| 1472 | * Release manage_sem before cpuset_populate_dir() because it | 1795 | * Release manage_sem before cpuset_populate_dir() because it |
| 1473 | * will down() this new directory's i_sem and if we race with | 1796 | * will down() this new directory's i_mutex and if we race with |
| 1474 | * another mkdir, we might deadlock. | 1797 | * another mkdir, we might deadlock. |
| 1475 | */ | 1798 | */ |
| 1476 | up(&manage_sem); | 1799 | up(&manage_sem); |
| @@ -1489,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
| 1489 | { | 1812 | { |
| 1490 | struct cpuset *c_parent = dentry->d_parent->d_fsdata; | 1813 | struct cpuset *c_parent = dentry->d_parent->d_fsdata; |
| 1491 | 1814 | ||
| 1492 | /* the vfs holds inode->i_sem already */ | 1815 | /* the vfs holds inode->i_mutex already */ |
| 1493 | return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); | 1816 | return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); |
| 1494 | } | 1817 | } |
| 1495 | 1818 | ||
| @@ -1500,10 +1823,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1500 | struct cpuset *parent; | 1823 | struct cpuset *parent; |
| 1501 | char *pathbuf = NULL; | 1824 | char *pathbuf = NULL; |
| 1502 | 1825 | ||
| 1503 | /* the vfs holds both inode->i_sem already */ | 1826 | /* the vfs holds both inode->i_mutex already */ |
| 1504 | 1827 | ||
| 1505 | down(&manage_sem); | 1828 | down(&manage_sem); |
| 1506 | refresh_mems(); | 1829 | cpuset_update_task_memory_state(); |
| 1507 | if (atomic_read(&cs->count) > 0) { | 1830 | if (atomic_read(&cs->count) > 0) { |
| 1508 | up(&manage_sem); | 1831 | up(&manage_sem); |
| 1509 | return -EBUSY; | 1832 | return -EBUSY; |
| @@ -1524,6 +1847,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1524 | spin_unlock(&d->d_lock); | 1847 | spin_unlock(&d->d_lock); |
| 1525 | cpuset_d_remove_dir(d); | 1848 | cpuset_d_remove_dir(d); |
| 1526 | dput(d); | 1849 | dput(d); |
| 1850 | number_of_cpusets--; | ||
| 1527 | up(&callback_sem); | 1851 | up(&callback_sem); |
| 1528 | if (list_empty(&parent->children)) | 1852 | if (list_empty(&parent->children)) |
| 1529 | check_for_release(parent, &pathbuf); | 1853 | check_for_release(parent, &pathbuf); |
| @@ -1532,6 +1856,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 1532 | return 0; | 1856 | return 0; |
| 1533 | } | 1857 | } |
| 1534 | 1858 | ||
| 1859 | /* | ||
| 1860 | * cpuset_init_early - just enough so that the calls to | ||
| 1861 | * cpuset_update_task_memory_state() in early init code | ||
| 1862 | * are harmless. | ||
| 1863 | */ | ||
| 1864 | |||
| 1865 | int __init cpuset_init_early(void) | ||
| 1866 | { | ||
| 1867 | struct task_struct *tsk = current; | ||
| 1868 | |||
| 1869 | tsk->cpuset = &top_cpuset; | ||
| 1870 | tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); | ||
| 1871 | return 0; | ||
| 1872 | } | ||
| 1873 | |||
| 1535 | /** | 1874 | /** |
| 1536 | * cpuset_init - initialize cpusets at system boot | 1875 | * cpuset_init - initialize cpusets at system boot |
| 1537 | * | 1876 | * |
| @@ -1546,6 +1885,7 @@ int __init cpuset_init(void) | |||
| 1546 | top_cpuset.cpus_allowed = CPU_MASK_ALL; | 1885 | top_cpuset.cpus_allowed = CPU_MASK_ALL; |
| 1547 | top_cpuset.mems_allowed = NODE_MASK_ALL; | 1886 | top_cpuset.mems_allowed = NODE_MASK_ALL; |
| 1548 | 1887 | ||
| 1888 | fmeter_init(&top_cpuset.fmeter); | ||
| 1549 | atomic_inc(&cpuset_mems_generation); | 1889 | atomic_inc(&cpuset_mems_generation); |
| 1550 | top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); | 1890 | top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); |
| 1551 | 1891 | ||
| @@ -1566,7 +1906,11 @@ int __init cpuset_init(void) | |||
| 1566 | root->d_inode->i_nlink++; | 1906 | root->d_inode->i_nlink++; |
| 1567 | top_cpuset.dentry = root; | 1907 | top_cpuset.dentry = root; |
| 1568 | root->d_inode->i_op = &cpuset_dir_inode_operations; | 1908 | root->d_inode->i_op = &cpuset_dir_inode_operations; |
| 1909 | number_of_cpusets = 1; | ||
| 1569 | err = cpuset_populate_dir(root); | 1910 | err = cpuset_populate_dir(root); |
| 1911 | /* memory_pressure_enabled is in root cpuset only */ | ||
| 1912 | if (err == 0) | ||
| 1913 | err = cpuset_add_file(root, &cft_memory_pressure_enabled); | ||
| 1570 | out: | 1914 | out: |
| 1571 | return err; | 1915 | return err; |
| 1572 | } | 1916 | } |
| @@ -1632,15 +1976,13 @@ void cpuset_fork(struct task_struct *child) | |||
| 1632 | * | 1976 | * |
| 1633 | * We don't need to task_lock() this reference to tsk->cpuset, | 1977 | * We don't need to task_lock() this reference to tsk->cpuset, |
| 1634 | * because tsk is already marked PF_EXITING, so attach_task() won't | 1978 | * because tsk is already marked PF_EXITING, so attach_task() won't |
| 1635 | * mess with it. | 1979 | * mess with it, or task is a failed fork, never visible to attach_task. |
| 1636 | **/ | 1980 | **/ |
| 1637 | 1981 | ||
| 1638 | void cpuset_exit(struct task_struct *tsk) | 1982 | void cpuset_exit(struct task_struct *tsk) |
| 1639 | { | 1983 | { |
| 1640 | struct cpuset *cs; | 1984 | struct cpuset *cs; |
| 1641 | 1985 | ||
| 1642 | BUG_ON(!(tsk->flags & PF_EXITING)); | ||
| 1643 | |||
| 1644 | cs = tsk->cpuset; | 1986 | cs = tsk->cpuset; |
| 1645 | tsk->cpuset = NULL; | 1987 | tsk->cpuset = NULL; |
| 1646 | 1988 | ||
| @@ -1667,14 +2009,14 @@ void cpuset_exit(struct task_struct *tsk) | |||
| 1667 | * tasks cpuset. | 2009 | * tasks cpuset. |
| 1668 | **/ | 2010 | **/ |
| 1669 | 2011 | ||
| 1670 | cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | 2012 | cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) |
| 1671 | { | 2013 | { |
| 1672 | cpumask_t mask; | 2014 | cpumask_t mask; |
| 1673 | 2015 | ||
| 1674 | down(&callback_sem); | 2016 | down(&callback_sem); |
| 1675 | task_lock((struct task_struct *)tsk); | 2017 | task_lock(tsk); |
| 1676 | guarantee_online_cpus(tsk->cpuset, &mask); | 2018 | guarantee_online_cpus(tsk->cpuset, &mask); |
| 1677 | task_unlock((struct task_struct *)tsk); | 2019 | task_unlock(tsk); |
| 1678 | up(&callback_sem); | 2020 | up(&callback_sem); |
| 1679 | 2021 | ||
| 1680 | return mask; | 2022 | return mask; |
| @@ -1686,43 +2028,26 @@ void cpuset_init_current_mems_allowed(void) | |||
| 1686 | } | 2028 | } |
| 1687 | 2029 | ||
| 1688 | /** | 2030 | /** |
| 1689 | * cpuset_update_current_mems_allowed - update mems parameters to new values | 2031 | * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. |
| 1690 | * | 2032 | * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. |
| 1691 | * If the current tasks cpusets mems_allowed changed behind our backs, | ||
| 1692 | * update current->mems_allowed and mems_generation to the new value. | ||
| 1693 | * Do not call this routine if in_interrupt(). | ||
| 1694 | * | 2033 | * |
| 1695 | * Call without callback_sem or task_lock() held. May be called | 2034 | * Description: Returns the nodemask_t mems_allowed of the cpuset |
| 1696 | * with or without manage_sem held. Unless exiting, it will acquire | 2035 | * attached to the specified @tsk. Guaranteed to return some non-empty |
| 1697 | * task_lock(). Also might acquire callback_sem during call to | 2036 | * subset of node_online_map, even if this means going outside the |
| 1698 | * refresh_mems(). | 2037 | * tasks cpuset. |
| 1699 | */ | 2038 | **/ |
| 1700 | 2039 | ||
| 1701 | void cpuset_update_current_mems_allowed(void) | 2040 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) |
| 1702 | { | 2041 | { |
| 1703 | struct cpuset *cs; | 2042 | nodemask_t mask; |
| 1704 | int need_to_refresh = 0; | ||
| 1705 | 2043 | ||
| 1706 | task_lock(current); | 2044 | down(&callback_sem); |
| 1707 | cs = current->cpuset; | 2045 | task_lock(tsk); |
| 1708 | if (!cs) | 2046 | guarantee_online_mems(tsk->cpuset, &mask); |
| 1709 | goto done; | 2047 | task_unlock(tsk); |
| 1710 | if (current->cpuset_mems_generation != cs->mems_generation) | 2048 | up(&callback_sem); |
| 1711 | need_to_refresh = 1; | ||
| 1712 | done: | ||
| 1713 | task_unlock(current); | ||
| 1714 | if (need_to_refresh) | ||
| 1715 | refresh_mems(); | ||
| 1716 | } | ||
| 1717 | 2049 | ||
| 1718 | /** | 2050 | return mask; |
| 1719 | * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed | ||
| 1720 | * @nodes: pointer to a node bitmap that is and-ed with mems_allowed | ||
| 1721 | */ | ||
| 1722 | void cpuset_restrict_to_mems_allowed(unsigned long *nodes) | ||
| 1723 | { | ||
| 1724 | bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), | ||
| 1725 | MAX_NUMNODES); | ||
| 1726 | } | 2051 | } |
| 1727 | 2052 | ||
| 1728 | /** | 2053 | /** |
| @@ -1795,7 +2120,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 1795 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2120 | * GFP_USER - only nodes in current tasks mems allowed ok. |
| 1796 | **/ | 2121 | **/ |
| 1797 | 2122 | ||
| 1798 | int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | 2123 | int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) |
| 1799 | { | 2124 | { |
| 1800 | int node; /* node that zone z is on */ | 2125 | int node; /* node that zone z is on */ |
| 1801 | const struct cpuset *cs; /* current cpuset ancestors */ | 2126 | const struct cpuset *cs; /* current cpuset ancestors */ |
| @@ -1867,6 +2192,42 @@ done: | |||
| 1867 | } | 2192 | } |
| 1868 | 2193 | ||
| 1869 | /* | 2194 | /* |
| 2195 | * Collection of memory_pressure is suppressed unless | ||
| 2196 | * this flag is enabled by writing "1" to the special | ||
| 2197 | * cpuset file 'memory_pressure_enabled' in the root cpuset. | ||
| 2198 | */ | ||
| 2199 | |||
| 2200 | int cpuset_memory_pressure_enabled __read_mostly; | ||
| 2201 | |||
| 2202 | /** | ||
| 2203 | * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. | ||
| 2204 | * | ||
| 2205 | * Keep a running average of the rate of synchronous (direct) | ||
| 2206 | * page reclaim efforts initiated by tasks in each cpuset. | ||
| 2207 | * | ||
| 2208 | * This represents the rate at which some task in the cpuset | ||
| 2209 | * ran low on memory on all nodes it was allowed to use, and | ||
| 2210 | * had to enter the kernels page reclaim code in an effort to | ||
| 2211 | * create more free memory by tossing clean pages or swapping | ||
| 2212 | * or writing dirty pages. | ||
| 2213 | * | ||
| 2214 | * Display to user space in the per-cpuset read-only file | ||
| 2215 | * "memory_pressure". Value displayed is an integer | ||
| 2216 | * representing the recent rate of entry into the synchronous | ||
| 2217 | * (direct) page reclaim by any task attached to the cpuset. | ||
| 2218 | **/ | ||
| 2219 | |||
| 2220 | void __cpuset_memory_pressure_bump(void) | ||
| 2221 | { | ||
| 2222 | struct cpuset *cs; | ||
| 2223 | |||
| 2224 | task_lock(current); | ||
| 2225 | cs = current->cpuset; | ||
| 2226 | fmeter_markevent(&cs->fmeter); | ||
| 2227 | task_unlock(current); | ||
| 2228 | } | ||
| 2229 | |||
| 2230 | /* | ||
| 1870 | * proc_cpuset_show() | 2231 | * proc_cpuset_show() |
| 1871 | * - Print tasks cpuset path into seq_file. | 2232 | * - Print tasks cpuset path into seq_file. |
| 1872 | * - Used for /proc/<pid>/cpuset. | 2233 | * - Used for /proc/<pid>/cpuset. |
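The "frequency meter" introduced in the cpuset.c hunks above is a single-pole IIR filter over per-second event counts; with FM_COEF = 933 and FM_SCALE = 1000 the reported value halves roughly every 10 seconds once events stop, and stabilizes near 1000 times the per-second rate under a steady load. The stand-alone sketch below reuses the same arithmetic as fmeter_update()/fmeter_markevent(), but is driven by an explicit fake clock instead of get_seconds() so it can be compiled and run outside the kernel; the locking is dropped for brevity.

#include <stdio.h>

/* Constants taken from the patch above. */
#define FM_COEF     933      /* gives roughly a 10 s half-life with 1 s ticks */
#define FM_MAXTICKS 99       /* no point decaying further back than this */
#define FM_SCALE    1000     /* faux fixed point scale */

struct fmeter { int cnt; int val; long time; };

/* Same update as the kernel routine: decay val once per elapsed tick,
 * then fold the accumulated event count into it. */
static void fmeter_update(struct fmeter *fmp, long now)
{
        long ticks = now - fmp->time;

        if (ticks == 0)
                return;
        if (ticks > FM_MAXTICKS)
                ticks = FM_MAXTICKS;
        while (ticks-- > 0)
                fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
        fmp->time = now;

        fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
        fmp->cnt = 0;
}

int main(void)
{
        struct fmeter fm = { .cnt = 0, .val = 0, .time = 0 };
        long t;

        /* One "reclaim event" per second for 5 s, then silence: the value
         * climbs toward 1000, then halves about every 10 s afterwards. */
        for (t = 1; t <= 30; t++) {
                if (t <= 5)
                        fm.cnt += FM_SCALE;     /* as in fmeter_markevent() */
                fmeter_update(&fm, t);
                printf("t=%2ld  val=%d\n", t, fm.val);
        }
        return 0;
}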
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
deleted file mode 100644
index 334c37f5218a..000000000000
--- a/kernel/crash_dump.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * kernel/crash_dump.c - Memory preserving reboot related code.
- *
- * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
- * Copyright (C) IBM Corporation, 2004. All rights reserved
- */
-
-#include <linux/smp_lock.h>
-#include <linux/errno.h>
-#include <linux/proc_fs.h>
-#include <linux/bootmem.h>
-#include <linux/highmem.h>
-#include <linux/crash_dump.h>
-
-#include <asm/io.h>
-#include <asm/uaccess.h>
-
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- *	space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- *	otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-				size_t csize, unsigned long offset, int userbuf)
-{
-	void *page, *vaddr;
-
-	if (!csize)
-		return 0;
-
-	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!page)
-		return -ENOMEM;
-
-	vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
-	copy_page(page, vaddr);
-	kunmap_atomic(vaddr, KM_PTE0);
-
-	if (userbuf) {
-		if (copy_to_user(buf, (page + offset), csize)) {
-			kfree(page);
-			return -EFAULT;
-		}
-	} else {
-		memcpy(buf, (page + offset), csize);
-	}
-
-	kfree(page);
-	return csize;
-}
diff --git a/kernel/exit.c b/kernel/exit.c
index ee515683b92d..f8e609ff1893 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -10,6 +10,7 @@
 #include <linux/interrupt.h>
 #include <linux/smp_lock.h>
 #include <linux/module.h>
+#include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/personality.h>
 #include <linux/tty.h>
@@ -29,6 +30,7 @@
 #include <linux/syscalls.h>
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
+#include <linux/mutex.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -72,7 +74,6 @@ repeat:
 	__ptrace_unlink(p);
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
-	__exit_sighand(p);
 	/*
 	 * Note that the fastpath in sys_times depends on __exit_signal having
 	 * updated the counters before a task is removed from the tasklist of
@@ -258,7 +259,7 @@ static inline void reparent_to_init(void)
 
 void __set_special_pids(pid_t session, pid_t pgrp)
 {
-	struct task_struct *curr = current;
+	struct task_struct *curr = current->group_leader;
 
 	if (curr->signal->session != session) {
 		detach_pid(curr, PIDTYPE_SID);
@@ -842,7 +843,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	}
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
-		del_timer_sync(&tsk->signal->real_timer);
+		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
 		acct_process(code);
 	}
@@ -870,6 +871,10 @@ fastcall NORET_TYPE void do_exit(long code)
 	mpol_free(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
+	/*
+	 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
+	 */
+	mutex_debug_check_no_locks_held(tsk);
 
 	/* PF_DEAD causes final put_task_struct after we schedule. */
 	preempt_disable();
@@ -926,7 +931,6 @@ do_group_exit(int exit_code)
 		/* Another thread got here before we took the lock. */
 		exit_code = sig->group_exit_code;
 	else {
-		sig->flags = SIGNAL_GROUP_EXIT;
 		sig->group_exit_code = exit_code;
 		zap_other_threads(current);
 	}
@@ -1068,6 +1072,9 @@ static int wait_task_zombie(task_t *p, int noreap,
 	}
 
 	if (likely(p->real_parent == p->parent) && likely(p->signal)) {
+		struct signal_struct *psig;
+		struct signal_struct *sig;
+
| 1071 | /* | 1078 | /* |
| 1072 | * The resource counters for the group leader are in its | 1079 | * The resource counters for the group leader are in its |
| 1073 | * own task_struct. Those for dead threads in the group | 1080 | * own task_struct. Those for dead threads in the group |
| @@ -1084,24 +1091,26 @@ static int wait_task_zombie(task_t *p, int noreap, | |||
| 1084 | * here reaping other children at the same time. | 1091 | * here reaping other children at the same time. |
| 1085 | */ | 1092 | */ |
| 1086 | spin_lock_irq(&p->parent->sighand->siglock); | 1093 | spin_lock_irq(&p->parent->sighand->siglock); |
| 1087 | p->parent->signal->cutime = | 1094 | psig = p->parent->signal; |
| 1088 | cputime_add(p->parent->signal->cutime, | 1095 | sig = p->signal; |
| 1096 | psig->cutime = | ||
| 1097 | cputime_add(psig->cutime, | ||
| 1089 | cputime_add(p->utime, | 1098 | cputime_add(p->utime, |
| 1090 | cputime_add(p->signal->utime, | 1099 | cputime_add(sig->utime, |
| 1091 | p->signal->cutime))); | 1100 | sig->cutime))); |
| 1092 | p->parent->signal->cstime = | 1101 | psig->cstime = |
| 1093 | cputime_add(p->parent->signal->cstime, | 1102 | cputime_add(psig->cstime, |
| 1094 | cputime_add(p->stime, | 1103 | cputime_add(p->stime, |
| 1095 | cputime_add(p->signal->stime, | 1104 | cputime_add(sig->stime, |
| 1096 | p->signal->cstime))); | 1105 | sig->cstime))); |
| 1097 | p->parent->signal->cmin_flt += | 1106 | psig->cmin_flt += |
| 1098 | p->min_flt + p->signal->min_flt + p->signal->cmin_flt; | 1107 | p->min_flt + sig->min_flt + sig->cmin_flt; |
| 1099 | p->parent->signal->cmaj_flt += | 1108 | psig->cmaj_flt += |
| 1100 | p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt; | 1109 | p->maj_flt + sig->maj_flt + sig->cmaj_flt; |
| 1101 | p->parent->signal->cnvcsw += | 1110 | psig->cnvcsw += |
| 1102 | p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw; | 1111 | p->nvcsw + sig->nvcsw + sig->cnvcsw; |
| 1103 | p->parent->signal->cnivcsw += | 1112 | psig->cnivcsw += |
| 1104 | p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw; | 1113 | p->nivcsw + sig->nivcsw + sig->cnivcsw; |
| 1105 | spin_unlock_irq(&p->parent->sighand->siglock); | 1114 | spin_unlock_irq(&p->parent->sighand->siglock); |
| 1106 | } | 1115 | } |
| 1107 | 1116 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index fb8572a42297..4ae8cfc1c89c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/binfmts.h> | 28 | #include <linux/binfmts.h> |
| 29 | #include <linux/mman.h> | 29 | #include <linux/mman.h> |
| 30 | #include <linux/fs.h> | 30 | #include <linux/fs.h> |
| 31 | #include <linux/capability.h> | ||
| 31 | #include <linux/cpu.h> | 32 | #include <linux/cpu.h> |
| 32 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
| 33 | #include <linux/security.h> | 34 | #include <linux/security.h> |
| @@ -743,6 +744,14 @@ int unshare_files(void) | |||
| 743 | 744 | ||
| 744 | EXPORT_SYMBOL(unshare_files); | 745 | EXPORT_SYMBOL(unshare_files); |
| 745 | 746 | ||
| 747 | void sighand_free_cb(struct rcu_head *rhp) | ||
| 748 | { | ||
| 749 | struct sighand_struct *sp; | ||
| 750 | |||
| 751 | sp = container_of(rhp, struct sighand_struct, rcu); | ||
| 752 | kmem_cache_free(sighand_cachep, sp); | ||
| 753 | } | ||
| 754 | |||
| 746 | static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) | 755 | static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) |
| 747 | { | 756 | { |
| 748 | struct sighand_struct *sig; | 757 | struct sighand_struct *sig; |
| @@ -752,7 +761,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t | |||
| 752 | return 0; | 761 | return 0; |
| 753 | } | 762 | } |
| 754 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); | 763 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); |
| 755 | tsk->sighand = sig; | 764 | rcu_assign_pointer(tsk->sighand, sig); |
| 756 | if (!sig) | 765 | if (!sig) |
| 757 | return -ENOMEM; | 766 | return -ENOMEM; |
| 758 | spin_lock_init(&sig->siglock); | 767 | spin_lock_init(&sig->siglock); |
| @@ -793,19 +802,16 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
| 793 | init_sigpending(&sig->shared_pending); | 802 | init_sigpending(&sig->shared_pending); |
| 794 | INIT_LIST_HEAD(&sig->posix_timers); | 803 | INIT_LIST_HEAD(&sig->posix_timers); |
| 795 | 804 | ||
| 796 | sig->it_real_value = sig->it_real_incr = 0; | 805 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC); |
| 806 | sig->it_real_incr.tv64 = 0; | ||
| 797 | sig->real_timer.function = it_real_fn; | 807 | sig->real_timer.function = it_real_fn; |
| 798 | sig->real_timer.data = (unsigned long) tsk; | 808 | sig->real_timer.data = tsk; |
| 799 | init_timer(&sig->real_timer); | ||
| 800 | 809 | ||
| 801 | sig->it_virt_expires = cputime_zero; | 810 | sig->it_virt_expires = cputime_zero; |
| 802 | sig->it_virt_incr = cputime_zero; | 811 | sig->it_virt_incr = cputime_zero; |
| 803 | sig->it_prof_expires = cputime_zero; | 812 | sig->it_prof_expires = cputime_zero; |
| 804 | sig->it_prof_incr = cputime_zero; | 813 | sig->it_prof_incr = cputime_zero; |
| 805 | 814 | ||
| 806 | sig->tty = current->signal->tty; | ||
| 807 | sig->pgrp = process_group(current); | ||
| 808 | sig->session = current->signal->session; | ||
| 809 | sig->leader = 0; /* session leadership doesn't inherit */ | 815 | sig->leader = 0; /* session leadership doesn't inherit */ |
| 810 | sig->tty_old_pgrp = 0; | 816 | sig->tty_old_pgrp = 0; |
| 811 | 817 | ||
| @@ -964,15 +970,20 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 964 | p->io_context = NULL; | 970 | p->io_context = NULL; |
| 965 | p->io_wait = NULL; | 971 | p->io_wait = NULL; |
| 966 | p->audit_context = NULL; | 972 | p->audit_context = NULL; |
| 973 | cpuset_fork(p); | ||
| 967 | #ifdef CONFIG_NUMA | 974 | #ifdef CONFIG_NUMA |
| 968 | p->mempolicy = mpol_copy(p->mempolicy); | 975 | p->mempolicy = mpol_copy(p->mempolicy); |
| 969 | if (IS_ERR(p->mempolicy)) { | 976 | if (IS_ERR(p->mempolicy)) { |
| 970 | retval = PTR_ERR(p->mempolicy); | 977 | retval = PTR_ERR(p->mempolicy); |
| 971 | p->mempolicy = NULL; | 978 | p->mempolicy = NULL; |
| 972 | goto bad_fork_cleanup; | 979 | goto bad_fork_cleanup_cpuset; |
| 973 | } | 980 | } |
| 974 | #endif | 981 | #endif |
| 975 | 982 | ||
| 983 | #ifdef CONFIG_DEBUG_MUTEXES | ||
| 984 | p->blocked_on = NULL; /* not blocked yet */ | ||
| 985 | #endif | ||
| 986 | |||
| 976 | p->tgid = p->pid; | 987 | p->tgid = p->pid; |
| 977 | if (clone_flags & CLONE_THREAD) | 988 | if (clone_flags & CLONE_THREAD) |
| 978 | p->tgid = current->tgid; | 989 | p->tgid = current->tgid; |
| @@ -1127,25 +1138,19 @@ static task_t *copy_process(unsigned long clone_flags, | |||
| 1127 | attach_pid(p, PIDTYPE_PID, p->pid); | 1138 | attach_pid(p, PIDTYPE_PID, p->pid); |
| 1128 | attach_pid(p, PIDTYPE_TGID, p->tgid); | 1139 | attach_pid(p, PIDTYPE_TGID, p->tgid); |
| 1129 | if (thread_group_leader(p)) { | 1140 | if (thread_group_leader(p)) { |
| 1141 | p->signal->tty = current->signal->tty; | ||
| 1142 | p->signal->pgrp = process_group(current); | ||
| 1143 | p->signal->session = current->signal->session; | ||
| 1130 | attach_pid(p, PIDTYPE_PGID, process_group(p)); | 1144 | attach_pid(p, PIDTYPE_PGID, process_group(p)); |
| 1131 | attach_pid(p, PIDTYPE_SID, p->signal->session); | 1145 | attach_pid(p, PIDTYPE_SID, p->signal->session); |
| 1132 | if (p->pid) | 1146 | if (p->pid) |
| 1133 | __get_cpu_var(process_counts)++; | 1147 | __get_cpu_var(process_counts)++; |
| 1134 | } | 1148 | } |
| 1135 | 1149 | ||
| 1136 | if (!current->signal->tty && p->signal->tty) | ||
| 1137 | p->signal->tty = NULL; | ||
| 1138 | |||
| 1139 | nr_threads++; | 1150 | nr_threads++; |
| 1140 | total_forks++; | 1151 | total_forks++; |
| 1141 | write_unlock_irq(&tasklist_lock); | 1152 | write_unlock_irq(&tasklist_lock); |
| 1142 | proc_fork_connector(p); | 1153 | proc_fork_connector(p); |
| 1143 | cpuset_fork(p); | ||
| 1144 | retval = 0; | ||
| 1145 | |||
| 1146 | fork_out: | ||
| 1147 | if (retval) | ||
| 1148 | return ERR_PTR(retval); | ||
| 1149 | return p; | 1154 | return p; |
| 1150 | 1155 | ||
| 1151 | bad_fork_cleanup_namespace: | 1156 | bad_fork_cleanup_namespace: |
| @@ -1172,7 +1177,9 @@ bad_fork_cleanup_security: | |||
| 1172 | bad_fork_cleanup_policy: | 1177 | bad_fork_cleanup_policy: |
| 1173 | #ifdef CONFIG_NUMA | 1178 | #ifdef CONFIG_NUMA |
| 1174 | mpol_free(p->mempolicy); | 1179 | mpol_free(p->mempolicy); |
| 1180 | bad_fork_cleanup_cpuset: | ||
| 1175 | #endif | 1181 | #endif |
| 1182 | cpuset_exit(p); | ||
| 1176 | bad_fork_cleanup: | 1183 | bad_fork_cleanup: |
| 1177 | if (p->binfmt) | 1184 | if (p->binfmt) |
| 1178 | module_put(p->binfmt->module); | 1185 | module_put(p->binfmt->module); |
| @@ -1184,7 +1191,8 @@ bad_fork_cleanup_count: | |||
| 1184 | free_uid(p->user); | 1191 | free_uid(p->user); |
| 1185 | bad_fork_free: | 1192 | bad_fork_free: |
| 1186 | free_task(p); | 1193 | free_task(p); |
| 1187 | goto fork_out; | 1194 | fork_out: |
| 1195 | return ERR_PTR(retval); | ||
| 1188 | } | 1196 | } |
| 1189 | 1197 | ||
| 1190 | struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | 1198 | struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) |
| @@ -1290,6 +1298,10 @@ long do_fork(unsigned long clone_flags, | |||
| 1290 | return pid; | 1298 | return pid; |
| 1291 | } | 1299 | } |
| 1292 | 1300 | ||
| 1301 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN | ||
| 1302 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 | ||
| 1303 | #endif | ||
| 1304 | |||
| 1293 | void __init proc_caches_init(void) | 1305 | void __init proc_caches_init(void) |
| 1294 | { | 1306 | { |
| 1295 | sighand_cachep = kmem_cache_create("sighand_cache", | 1307 | sighand_cachep = kmem_cache_create("sighand_cache", |
| @@ -1308,6 +1320,6 @@ void __init proc_caches_init(void) | |||
| 1308 | sizeof(struct vm_area_struct), 0, | 1320 | sizeof(struct vm_area_struct), 0, |
| 1309 | SLAB_PANIC, NULL, NULL); | 1321 | SLAB_PANIC, NULL, NULL); |
| 1310 | mm_cachep = kmem_cache_create("mm_struct", | 1322 | mm_cachep = kmem_cache_create("mm_struct", |
| 1311 | sizeof(struct mm_struct), 0, | 1323 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, |
| 1312 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1324 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); |
| 1313 | } | 1325 | } |
diff --git a/kernel/futex.c b/kernel/futex.c index 5e71a6bf6f6b..5efa2f978032 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -356,6 +356,13 @@ retry: | |||
| 356 | if (bh1 != bh2) | 356 | if (bh1 != bh2) |
| 357 | spin_unlock(&bh2->lock); | 357 | spin_unlock(&bh2->lock); |
| 358 | 358 | ||
| 359 | #ifndef CONFIG_MMU | ||
| 360 | /* we don't get EFAULT from MMU faults if we don't have an MMU, | ||
| 361 | * but we might get them from range checking */ | ||
| 362 | ret = op_ret; | ||
| 363 | goto out; | ||
| 364 | #endif | ||
| 365 | |||
| 359 | if (unlikely(op_ret != -EFAULT)) { | 366 | if (unlikely(op_ret != -EFAULT)) { |
| 360 | ret = op_ret; | 367 | ret = op_ret; |
| 361 | goto out; | 368 | goto out; |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c new file mode 100644 index 000000000000..04ccab099e84 --- /dev/null +++ b/kernel/hrtimer.c | |||
| @@ -0,0 +1,825 @@ | |||
| 1 | /* | ||
| 2 | * linux/kernel/hrtimer.c | ||
| 3 | * | ||
| 4 | * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> | ||
| 5 | * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar | ||
| 6 | * | ||
| 7 | * High-resolution kernel timers | ||
| 8 | * | ||
| 9 | * In contrast to the low-resolution timeout API implemented in | ||
| 10 | * kernel/timer.c, hrtimers provide finer resolution and accuracy | ||
| 11 | * depending on system configuration and capabilities. | ||
| 12 | * | ||
| 13 | * These timers are currently used for: | ||
| 14 | * - itimers | ||
| 15 | * - POSIX timers | ||
| 16 | * - nanosleep | ||
| 17 | * - precise in-kernel timing | ||
| 18 | * | ||
| 19 | * Started by: Thomas Gleixner and Ingo Molnar | ||
| 20 | * | ||
| 21 | * Credits: | ||
| 22 | * based on kernel/timer.c | ||
| 23 | * | ||
| 24 | * For licencing details see kernel-base/COPYING | ||
| 25 | */ | ||
| 26 | |||
| 27 | #include <linux/cpu.h> | ||
| 28 | #include <linux/module.h> | ||
| 29 | #include <linux/percpu.h> | ||
| 30 | #include <linux/hrtimer.h> | ||
| 31 | #include <linux/notifier.h> | ||
| 32 | #include <linux/syscalls.h> | ||
| 33 | #include <linux/interrupt.h> | ||
| 34 | |||
| 35 | #include <asm/uaccess.h> | ||
| 36 | |||
| 37 | /** | ||
| 38 | * ktime_get - get the monotonic time in ktime_t format | ||
| 39 | * | ||
| 40 | * returns the time in ktime_t format | ||
| 41 | */ | ||
| 42 | static ktime_t ktime_get(void) | ||
| 43 | { | ||
| 44 | struct timespec now; | ||
| 45 | |||
| 46 | ktime_get_ts(&now); | ||
| 47 | |||
| 48 | return timespec_to_ktime(now); | ||
| 49 | } | ||
| 50 | |||
| 51 | /** | ||
| 52 | * ktime_get_real - get the real (wall-) time in ktime_t format | ||
| 53 | * | ||
| 54 | * returns the time in ktime_t format | ||
| 55 | */ | ||
| 56 | static ktime_t ktime_get_real(void) | ||
| 57 | { | ||
| 58 | struct timespec now; | ||
| 59 | |||
| 60 | getnstimeofday(&now); | ||
| 61 | |||
| 62 | return timespec_to_ktime(now); | ||
| 63 | } | ||
| 64 | |||
| 65 | EXPORT_SYMBOL_GPL(ktime_get_real); | ||
| 66 | |||
| 67 | /* | ||
| 68 | * The timer bases: | ||
| 69 | */ | ||
| 70 | |||
| 71 | #define MAX_HRTIMER_BASES 2 | ||
| 72 | |||
| 73 | static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = | ||
| 74 | { | ||
| 75 | { | ||
| 76 | .index = CLOCK_REALTIME, | ||
| 77 | .get_time = &ktime_get_real, | ||
| 78 | .resolution = KTIME_REALTIME_RES, | ||
| 79 | }, | ||
| 80 | { | ||
| 81 | .index = CLOCK_MONOTONIC, | ||
| 82 | .get_time = &ktime_get, | ||
| 83 | .resolution = KTIME_MONOTONIC_RES, | ||
| 84 | }, | ||
| 85 | }; | ||
| 86 | |||
| 87 | /** | ||
| 88 | * ktime_get_ts - get the monotonic clock in timespec format | ||
| 89 | * | ||
| 90 | * @ts: pointer to timespec variable | ||
| 91 | * | ||
| 92 | * The function calculates the monotonic clock from the realtime | ||
| 93 | * clock and the wall_to_monotonic offset and stores the result | ||
| 94 | * in normalized timespec format in the variable pointed to by ts. | ||
| 95 | */ | ||
| 96 | void ktime_get_ts(struct timespec *ts) | ||
| 97 | { | ||
| 98 | struct timespec tomono; | ||
| 99 | unsigned long seq; | ||
| 100 | |||
| 101 | do { | ||
| 102 | seq = read_seqbegin(&xtime_lock); | ||
| 103 | getnstimeofday(ts); | ||
| 104 | tomono = wall_to_monotonic; | ||
| 105 | |||
| 106 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 107 | |||
| 108 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, | ||
| 109 | ts->tv_nsec + tomono.tv_nsec); | ||
| 110 | } | ||
| 111 | EXPORT_SYMBOL_GPL(ktime_get_ts); | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Functions and macros which are different for UP/SMP systems are kept in a | ||
| 115 | * single place | ||
| 116 | */ | ||
| 117 | #ifdef CONFIG_SMP | ||
| 118 | |||
| 119 | #define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) | ||
| 120 | |||
| 121 | /* | ||
| 122 | * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock | ||
| 123 | * means that all timers which are tied to this base via timer->base are | ||
| 124 | * locked, and the base itself is locked too. | ||
| 125 | * | ||
| 126 | * So __run_timers/migrate_timers can safely modify all timers which could | ||
| 127 | * be found on the lists/queues. | ||
| 128 | * | ||
| 129 | * When the timer's base is locked, and the timer removed from list, it is | ||
| 130 | * possible to set timer->base = NULL and drop the lock: the timer remains | ||
| 131 | * locked. | ||
| 132 | */ | ||
| 133 | static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, | ||
| 134 | unsigned long *flags) | ||
| 135 | { | ||
| 136 | struct hrtimer_base *base; | ||
| 137 | |||
| 138 | for (;;) { | ||
| 139 | base = timer->base; | ||
| 140 | if (likely(base != NULL)) { | ||
| 141 | spin_lock_irqsave(&base->lock, *flags); | ||
| 142 | if (likely(base == timer->base)) | ||
| 143 | return base; | ||
| 144 | /* The timer has migrated to another CPU: */ | ||
| 145 | spin_unlock_irqrestore(&base->lock, *flags); | ||
| 146 | } | ||
| 147 | cpu_relax(); | ||
| 148 | } | ||
| 149 | } | ||
| 150 | |||
| 151 | /* | ||
| 152 | * Switch the timer base to the current CPU when possible. | ||
| 153 | */ | ||
| 154 | static inline struct hrtimer_base * | ||
| 155 | switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) | ||
| 156 | { | ||
| 157 | struct hrtimer_base *new_base; | ||
| 158 | |||
| 159 | new_base = &__get_cpu_var(hrtimer_bases[base->index]); | ||
| 160 | |||
| 161 | if (base != new_base) { | ||
| 162 | /* | ||
| 163 | * We are trying to schedule the timer on the local CPU. | ||
| 164 | * However we can't change timer's base while it is running, | ||
| 165 | * so we keep it on the same CPU. No hassle vs. reprogramming | ||
| 166 | * the event source in the high resolution case. The softirq | ||
| 167 | * code will take care of this when the timer function has | ||
| 168 | * completed. There is no conflict as we hold the lock until | ||
| 169 | * the timer is enqueued. | ||
| 170 | */ | ||
| 171 | if (unlikely(base->curr_timer == timer)) | ||
| 172 | return base; | ||
| 173 | |||
| 174 | /* See the comment in lock_timer_base() */ | ||
| 175 | timer->base = NULL; | ||
| 176 | spin_unlock(&base->lock); | ||
| 177 | spin_lock(&new_base->lock); | ||
| 178 | timer->base = new_base; | ||
| 179 | } | ||
| 180 | return new_base; | ||
| 181 | } | ||
| 182 | |||
| 183 | #else /* CONFIG_SMP */ | ||
| 184 | |||
| 185 | #define set_curr_timer(b, t) do { } while (0) | ||
| 186 | |||
| 187 | static inline struct hrtimer_base * | ||
| 188 | lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | ||
| 189 | { | ||
| 190 | struct hrtimer_base *base = timer->base; | ||
| 191 | |||
| 192 | spin_lock_irqsave(&base->lock, *flags); | ||
| 193 | |||
| 194 | return base; | ||
| 195 | } | ||
| 196 | |||
| 197 | #define switch_hrtimer_base(t, b) (b) | ||
| 198 | |||
| 199 | #endif /* !CONFIG_SMP */ | ||
| 200 | |||
| 201 | /* | ||
| 202 | * Functions for the union type storage format of ktime_t which are | ||
| 203 | * too large for inlining: | ||
| 204 | */ | ||
| 205 | #if BITS_PER_LONG < 64 | ||
| 206 | # ifndef CONFIG_KTIME_SCALAR | ||
| 207 | /** | ||
| 208 | * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable | ||
| 209 | * | ||
| 210 | * @kt: addend | ||
| 211 | * @nsec: the scalar nsec value to add | ||
| 212 | * | ||
| 213 | * Returns the sum of kt and nsec in ktime_t format | ||
| 214 | */ | ||
| 215 | ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) | ||
| 216 | { | ||
| 217 | ktime_t tmp; | ||
| 218 | |||
| 219 | if (likely(nsec < NSEC_PER_SEC)) { | ||
| 220 | tmp.tv64 = nsec; | ||
| 221 | } else { | ||
| 222 | unsigned long rem = do_div(nsec, NSEC_PER_SEC); | ||
| 223 | |||
| 224 | tmp = ktime_set((long)nsec, rem); | ||
| 225 | } | ||
| 226 | |||
| 227 | return ktime_add(kt, tmp); | ||
| 228 | } | ||
| 229 | |||
| 230 | #else /* CONFIG_KTIME_SCALAR */ | ||
| 231 | |||
| 232 | # endif /* !CONFIG_KTIME_SCALAR */ | ||
| 233 | |||
| 234 | /* | ||
| 235 | * Divide a ktime value by a nanosecond value | ||
| 236 | */ | ||
| 237 | static unsigned long ktime_divns(const ktime_t kt, nsec_t div) | ||
| 238 | { | ||
| 239 | u64 dclc, inc, dns; | ||
| 240 | int sft = 0; | ||
| 241 | |||
| 242 | dclc = dns = ktime_to_ns(kt); | ||
| 243 | inc = div; | ||
| 244 | /* Make sure the divisor is less than 2^32: */ | ||
| 245 | while (div >> 32) { | ||
| 246 | sft++; | ||
| 247 | div >>= 1; | ||
| 248 | } | ||
| 249 | dclc >>= sft; | ||
| 250 | do_div(dclc, (unsigned long) div); | ||
| 251 | |||
| 252 | return (unsigned long) dclc; | ||
| 253 | } | ||
| 254 | |||
| 255 | #else /* BITS_PER_LONG < 64 */ | ||
| 256 | # define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) | ||
| 257 | #endif /* BITS_PER_LONG >= 64 */ | ||
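do_div() only handles 32-bit divisors, so ktime_divns() shifts dividend and divisor right by the same amount until the divisor fits; the quotient is only approximate when the discarded low bits matter, which is acceptable for counting timer overruns at nanosecond scale. A small userspace illustration of the same trick (names and values are made up):

	#include <stdint.h>
	#include <stdio.h>

	/* Divide a 64-bit dividend by a divisor that may not fit in 32 bits,
	 * by shifting both down first, mirroring ktime_divns() above. */
	static uint64_t div_by_shift(uint64_t dividend, uint64_t divisor)
	{
		int sft = 0;

		while (divisor >> 32) {
			sft++;
			divisor >>= 1;
		}
		return (dividend >> sft) / (uint32_t)divisor;
	}

	int main(void)
	{
		uint64_t delta = 10000000000ULL;	/* 10s of nanoseconds     */
		uint64_t incr  =  6000000000ULL;	/* 6s interval, > 32 bits */

		/* Prints 1, the number of whole intervals in delta. */
		printf("%llu\n", (unsigned long long)div_by_shift(delta, incr));
		return 0;
	}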
| 258 | |||
| 259 | /* | ||
| 260 | * Counterpart to lock_timer_base above: | ||
| 261 | */ | ||
| 262 | static inline | ||
| 263 | void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | ||
| 264 | { | ||
| 265 | spin_unlock_irqrestore(&timer->base->lock, *flags); | ||
| 266 | } | ||
| 267 | |||
| 268 | /** | ||
| 269 | * hrtimer_forward - forward the timer expiry | ||
| 270 | * | ||
| 271 | * @timer: hrtimer to forward | ||
| 272 | * @interval: the interval to forward | ||
| 273 | * | ||
| 274 | * Forward the timer expiry so it will expire in the future. | ||
| 275 | * The number of overruns is returned. | ||
| 276 | */ | ||
| 277 | unsigned long | ||
| 278 | hrtimer_forward(struct hrtimer *timer, ktime_t interval) | ||
| 279 | { | ||
| 280 | unsigned long orun = 1; | ||
| 281 | ktime_t delta, now; | ||
| 282 | |||
| 283 | now = timer->base->get_time(); | ||
| 284 | |||
| 285 | delta = ktime_sub(now, timer->expires); | ||
| 286 | |||
| 287 | if (delta.tv64 < 0) | ||
| 288 | return 0; | ||
| 289 | |||
| 290 | if (interval.tv64 < timer->base->resolution.tv64) | ||
| 291 | interval.tv64 = timer->base->resolution.tv64; | ||
| 292 | |||
| 293 | if (unlikely(delta.tv64 >= interval.tv64)) { | ||
| 294 | nsec_t incr = ktime_to_ns(interval); | ||
| 295 | |||
| 296 | orun = ktime_divns(delta, incr); | ||
| 297 | timer->expires = ktime_add_ns(timer->expires, incr * orun); | ||
| 298 | if (timer->expires.tv64 > now.tv64) | ||
| 299 | return orun; | ||
| 300 | /* | ||
| 301 | * This (and the ktime_add() below) is the | ||
| 302 | * correction for exact: | ||
| 303 | */ | ||
| 304 | orun++; | ||
| 305 | } | ||
| 306 | timer->expires = ktime_add(timer->expires, interval); | ||
| 307 | |||
| 308 | return orun; | ||
| 309 | } | ||
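The forwarded expiry and the return value are what a callback uses to run periodically; a minimal sketch of that pattern follows (my_periodic_fn and the 500us period are invented — the same pattern appears in it_real_fn() later in this patch):

	/* Hypothetical periodic callback: advance the expiry by one interval
	 * and ask the softirq to re-enqueue the timer. */
	static int my_periodic_fn(void *data)
	{
		struct hrtimer *timer = data;	/* timer handed in via ->data */

		hrtimer_forward(timer, ktime_set(0, 500000));	/* 500us period */
		return HRTIMER_RESTART;
	}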
| 310 | |||
| 311 | /* | ||
| 312 | * enqueue_hrtimer - internal function to (re)start a timer | ||
| 313 | * | ||
| 314 | * The timer is inserted in expiry order. Insertion into the | ||
| 315 | * red black tree is O(log(n)). Must hold the base lock. | ||
| 316 | */ | ||
| 317 | static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | ||
| 318 | { | ||
| 319 | struct rb_node **link = &base->active.rb_node; | ||
| 320 | struct rb_node *parent = NULL; | ||
| 321 | struct hrtimer *entry; | ||
| 322 | |||
| 323 | /* | ||
| 324 | * Find the right place in the rbtree: | ||
| 325 | */ | ||
| 326 | while (*link) { | ||
| 327 | parent = *link; | ||
| 328 | entry = rb_entry(parent, struct hrtimer, node); | ||
| 329 | /* | ||
| 330 | * We don't care about collisions. Nodes with | ||
| 331 | * the same expiry time stay together. | ||
| 332 | */ | ||
| 333 | if (timer->expires.tv64 < entry->expires.tv64) | ||
| 334 | link = &(*link)->rb_left; | ||
| 335 | else | ||
| 336 | link = &(*link)->rb_right; | ||
| 337 | } | ||
| 338 | |||
| 339 | /* | ||
| 340 | * Insert the timer to the rbtree and check whether it | ||
| 341 | * replaces the first pending timer | ||
| 342 | */ | ||
| 343 | rb_link_node(&timer->node, parent, link); | ||
| 344 | rb_insert_color(&timer->node, &base->active); | ||
| 345 | |||
| 346 | timer->state = HRTIMER_PENDING; | ||
| 347 | |||
| 348 | if (!base->first || timer->expires.tv64 < | ||
| 349 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) | ||
| 350 | base->first = &timer->node; | ||
| 351 | } | ||
| 352 | |||
| 353 | /* | ||
| 354 | * __remove_hrtimer - internal function to remove a timer | ||
| 355 | * | ||
| 356 | * Caller must hold the base lock. | ||
| 357 | */ | ||
| 358 | static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | ||
| 359 | { | ||
| 360 | /* | ||
| 361 | * Remove the timer from the rbtree and replace the | ||
| 362 | * first entry pointer if necessary. | ||
| 363 | */ | ||
| 364 | if (base->first == &timer->node) | ||
| 365 | base->first = rb_next(&timer->node); | ||
| 366 | rb_erase(&timer->node, &base->active); | ||
| 367 | } | ||
| 368 | |||
| 369 | /* | ||
| 370 | * remove hrtimer, called with base lock held | ||
| 371 | */ | ||
| 372 | static inline int | ||
| 373 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | ||
| 374 | { | ||
| 375 | if (hrtimer_active(timer)) { | ||
| 376 | __remove_hrtimer(timer, base); | ||
| 377 | timer->state = HRTIMER_INACTIVE; | ||
| 378 | return 1; | ||
| 379 | } | ||
| 380 | return 0; | ||
| 381 | } | ||
| 382 | |||
| 383 | /** | ||
| 384 | * hrtimer_start - (re)start a relative timer on the current CPU | ||
| 385 | * | ||
| 386 | * @timer: the timer to be added | ||
| 387 | * @tim: expiry time | ||
| 388 | * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) | ||
| 389 | * | ||
| 390 | * Returns: | ||
| 391 | * 0 on success | ||
| 392 | * 1 when the timer was active | ||
| 393 | */ | ||
| 394 | int | ||
| 395 | hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | ||
| 396 | { | ||
| 397 | struct hrtimer_base *base, *new_base; | ||
| 398 | unsigned long flags; | ||
| 399 | int ret; | ||
| 400 | |||
| 401 | base = lock_hrtimer_base(timer, &flags); | ||
| 402 | |||
| 403 | /* Remove an active timer from the queue: */ | ||
| 404 | ret = remove_hrtimer(timer, base); | ||
| 405 | |||
| 406 | /* Switch the timer base, if necessary: */ | ||
| 407 | new_base = switch_hrtimer_base(timer, base); | ||
| 408 | |||
| 409 | if (mode == HRTIMER_REL) | ||
| 410 | tim = ktime_add(tim, new_base->get_time()); | ||
| 411 | timer->expires = tim; | ||
| 412 | |||
| 413 | enqueue_hrtimer(timer, new_base); | ||
| 414 | |||
| 415 | unlock_hrtimer_base(timer, &flags); | ||
| 416 | |||
| 417 | return ret; | ||
| 418 | } | ||
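Putting the pieces together, a one-shot in-kernel user of this new API would look roughly like the sketch below; struct my_ctx, my_timeout_fn() and my_arm_timeout() are invented for illustration, while the fields and constants are the ones introduced here.

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>
	#include <linux/wait.h>

	struct my_ctx {
		struct hrtimer timer;
		wait_queue_head_t wait;
		int expired;
	};

	/* Runs from the timer softirq; one-shot, so do not restart. */
	static int my_timeout_fn(void *data)
	{
		struct my_ctx *ctx = data;

		ctx->expired = 1;
		wake_up(&ctx->wait);
		return HRTIMER_NORESTART;
	}

	static void my_arm_timeout(struct my_ctx *ctx, unsigned long delay_ns)
	{
		hrtimer_init(&ctx->timer, CLOCK_MONOTONIC);
		ctx->timer.function = my_timeout_fn;
		ctx->timer.data = ctx;

		/* Relative expiry: delay_ns from now on the monotonic clock. */
		hrtimer_start(&ctx->timer, ktime_set(0, delay_ns), HRTIMER_REL);
	}

Teardown would call hrtimer_cancel(&ctx->timer), which spins until a callback that is already running has finished.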
| 419 | |||
| 420 | /** | ||
| 421 | * hrtimer_try_to_cancel - try to deactivate a timer | ||
| 422 | * | ||
| 423 | * @timer: hrtimer to stop | ||
| 424 | * | ||
| 425 | * Returns: | ||
| 426 | * 0 when the timer was not active | ||
| 427 | * 1 when the timer was active | ||
| 428 | * -1 when the timer is currently executing the callback function and | ||
| 429 | * cannot be stopped | ||
| 430 | */ | ||
| 431 | int hrtimer_try_to_cancel(struct hrtimer *timer) | ||
| 432 | { | ||
| 433 | struct hrtimer_base *base; | ||
| 434 | unsigned long flags; | ||
| 435 | int ret = -1; | ||
| 436 | |||
| 437 | base = lock_hrtimer_base(timer, &flags); | ||
| 438 | |||
| 439 | if (base->curr_timer != timer) | ||
| 440 | ret = remove_hrtimer(timer, base); | ||
| 441 | |||
| 442 | unlock_hrtimer_base(timer, &flags); | ||
| 443 | |||
| 444 | return ret; | ||
| 445 | |||
| 446 | } | ||
| 447 | |||
| 448 | /** | ||
| 449 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. | ||
| 450 | * | ||
| 451 | * @timer: the timer to be cancelled | ||
| 452 | * | ||
| 453 | * Returns: | ||
| 454 | * 0 when the timer was not active | ||
| 455 | * 1 when the timer was active | ||
| 456 | */ | ||
| 457 | int hrtimer_cancel(struct hrtimer *timer) | ||
| 458 | { | ||
| 459 | for (;;) { | ||
| 460 | int ret = hrtimer_try_to_cancel(timer); | ||
| 461 | |||
| 462 | if (ret >= 0) | ||
| 463 | return ret; | ||
| 464 | } | ||
| 465 | } | ||
| 466 | |||
| 467 | /** | ||
| 468 | * hrtimer_get_remaining - get remaining time for the timer | ||
| 469 | * | ||
| 470 | * @timer: the timer to read | ||
| 471 | */ | ||
| 472 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | ||
| 473 | { | ||
| 474 | struct hrtimer_base *base; | ||
| 475 | unsigned long flags; | ||
| 476 | ktime_t rem; | ||
| 477 | |||
| 478 | base = lock_hrtimer_base(timer, &flags); | ||
| 479 | rem = ktime_sub(timer->expires, timer->base->get_time()); | ||
| 480 | unlock_hrtimer_base(timer, &flags); | ||
| 481 | |||
| 482 | return rem; | ||
| 483 | } | ||
| 484 | |||
| 485 | /** | ||
| 486 | * hrtimer_rebase - rebase an initialized hrtimer to a different base | ||
| 487 | * | ||
| 488 | * @timer: the timer to be rebased | ||
| 489 | * @clock_id: the clock to be used | ||
| 490 | */ | ||
| 491 | void hrtimer_rebase(struct hrtimer *timer, const clockid_t clock_id) | ||
| 492 | { | ||
| 493 | struct hrtimer_base *bases; | ||
| 494 | |||
| 495 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | ||
| 496 | timer->base = &bases[clock_id]; | ||
| 497 | } | ||
| 498 | |||
| 499 | /** | ||
| 500 | * hrtimer_init - initialize a timer to the given clock | ||
| 501 | * | ||
| 502 | * @timer: the timer to be initialized | ||
| 503 | * @clock_id: the clock to be used | ||
| 504 | */ | ||
| 505 | void hrtimer_init(struct hrtimer *timer, const clockid_t clock_id) | ||
| 506 | { | ||
| 507 | memset(timer, 0, sizeof(struct hrtimer)); | ||
| 508 | hrtimer_rebase(timer, clock_id); | ||
| 509 | } | ||
| 510 | |||
| 511 | /** | ||
| 512 | * hrtimer_get_res - get the timer resolution for a clock | ||
| 513 | * | ||
| 514 | * @which_clock: which clock to query | ||
| 515 | * @tp: pointer to timespec variable to store the resolution | ||
| 516 | * | ||
| 517 | * Store the resolution of the clock selected by which_clock in the | ||
| 518 | * variable pointed to by tp. | ||
| 519 | */ | ||
| 520 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | ||
| 521 | { | ||
| 522 | struct hrtimer_base *bases; | ||
| 523 | |||
| 524 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | ||
| 525 | *tp = ktime_to_timespec(bases[which_clock].resolution); | ||
| 526 | |||
| 527 | return 0; | ||
| 528 | } | ||
| 529 | |||
| 530 | /* | ||
| 531 | * Expire the per base hrtimer-queue: | ||
| 532 | */ | ||
| 533 | static inline void run_hrtimer_queue(struct hrtimer_base *base) | ||
| 534 | { | ||
| 535 | ktime_t now = base->get_time(); | ||
| 536 | struct rb_node *node; | ||
| 537 | |||
| 538 | spin_lock_irq(&base->lock); | ||
| 539 | |||
| 540 | while ((node = base->first)) { | ||
| 541 | struct hrtimer *timer; | ||
| 542 | int (*fn)(void *); | ||
| 543 | int restart; | ||
| 544 | void *data; | ||
| 545 | |||
| 546 | timer = rb_entry(node, struct hrtimer, node); | ||
| 547 | if (now.tv64 <= timer->expires.tv64) | ||
| 548 | break; | ||
| 549 | |||
| 550 | fn = timer->function; | ||
| 551 | data = timer->data; | ||
| 552 | set_curr_timer(base, timer); | ||
| 553 | __remove_hrtimer(timer, base); | ||
| 554 | spin_unlock_irq(&base->lock); | ||
| 555 | |||
| 556 | /* | ||
| 557 | * fn == NULL is special case for the simplest timer | ||
| 558 | * variant - wake up process and do not restart: | ||
| 559 | */ | ||
| 560 | if (!fn) { | ||
| 561 | wake_up_process(data); | ||
| 562 | restart = HRTIMER_NORESTART; | ||
| 563 | } else | ||
| 564 | restart = fn(data); | ||
| 565 | |||
| 566 | spin_lock_irq(&base->lock); | ||
| 567 | |||
| 568 | if (restart == HRTIMER_RESTART) | ||
| 569 | enqueue_hrtimer(timer, base); | ||
| 570 | else | ||
| 571 | timer->state = HRTIMER_EXPIRED; | ||
| 572 | } | ||
| 573 | set_curr_timer(base, NULL); | ||
| 574 | spin_unlock_irq(&base->lock); | ||
| 575 | } | ||
| 576 | |||
| 577 | /* | ||
| 578 | * Called from timer softirq every jiffy, expire hrtimers: | ||
| 579 | */ | ||
| 580 | void hrtimer_run_queues(void) | ||
| 581 | { | ||
| 582 | struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); | ||
| 583 | int i; | ||
| 584 | |||
| 585 | for (i = 0; i < MAX_HRTIMER_BASES; i++) | ||
| 586 | run_hrtimer_queue(&base[i]); | ||
| 587 | } | ||
| 588 | |||
| 589 | /* | ||
| 590 | * Sleep related functions: | ||
| 591 | */ | ||
| 592 | |||
| 593 | /** | ||
| 594 | * schedule_hrtimer - sleep until timeout | ||
| 595 | * | ||
| 596 | * @timer: hrtimer variable initialized with the correct clock base | ||
| 597 | * @mode: timeout value is abs/rel | ||
| 598 | * | ||
| 599 | * Make the current task sleep until @timeout has | ||
| 600 | * elapsed. | ||
| 601 | * | ||
| 602 | * You can set the task state as follows - | ||
| 603 | * | ||
| 604 | * %TASK_UNINTERRUPTIBLE - at least @timeout is guaranteed to | ||
| 605 | * pass before the routine returns. The routine will return 0 | ||
| 606 | * | ||
| 607 | * %TASK_INTERRUPTIBLE - the routine may return early if a signal is | ||
| 608 | * delivered to the current task. In this case the remaining time | ||
| 609 | * will be returned | ||
| 610 | * | ||
| 611 | * The current task state is guaranteed to be TASK_RUNNING when this | ||
| 612 | * routine returns. | ||
| 613 | */ | ||
| 614 | static ktime_t __sched | ||
| 615 | schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode) | ||
| 616 | { | ||
| 617 | /* fn stays NULL, meaning single-shot wakeup: */ | ||
| 618 | timer->data = current; | ||
| 619 | |||
| 620 | hrtimer_start(timer, timer->expires, mode); | ||
| 621 | |||
| 622 | schedule(); | ||
| 623 | hrtimer_cancel(timer); | ||
| 624 | |||
| 625 | /* Return the remaining time: */ | ||
| 626 | if (timer->state != HRTIMER_EXPIRED) | ||
| 627 | return ktime_sub(timer->expires, timer->base->get_time()); | ||
| 628 | else | ||
| 629 | return (ktime_t) {.tv64 = 0 }; | ||
| 630 | } | ||
| 631 | |||
| 632 | static inline ktime_t __sched | ||
| 633 | schedule_hrtimer_interruptible(struct hrtimer *timer, | ||
| 634 | const enum hrtimer_mode mode) | ||
| 635 | { | ||
| 636 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 637 | |||
| 638 | return schedule_hrtimer(timer, mode); | ||
| 639 | } | ||
| 640 | |||
| 641 | static long __sched | ||
| 642 | nanosleep_restart(struct restart_block *restart, clockid_t clockid) | ||
| 643 | { | ||
| 644 | struct timespec __user *rmtp, tu; | ||
| 645 | void *rfn_save = restart->fn; | ||
| 646 | struct hrtimer timer; | ||
| 647 | ktime_t rem; | ||
| 648 | |||
| 649 | restart->fn = do_no_restart_syscall; | ||
| 650 | |||
| 651 | hrtimer_init(&timer, clockid); | ||
| 652 | |||
| 653 | timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; | ||
| 654 | |||
| 655 | rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS); | ||
| 656 | |||
| 657 | if (rem.tv64 <= 0) | ||
| 658 | return 0; | ||
| 659 | |||
| 660 | rmtp = (struct timespec __user *) restart->arg2; | ||
| 661 | tu = ktime_to_timespec(rem); | ||
| 662 | if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) | ||
| 663 | return -EFAULT; | ||
| 664 | |||
| 665 | restart->fn = rfn_save; | ||
| 666 | |||
| 667 | /* The other values in restart are already filled in */ | ||
| 668 | return -ERESTART_RESTARTBLOCK; | ||
| 669 | } | ||
| 670 | |||
| 671 | static long __sched nanosleep_restart_mono(struct restart_block *restart) | ||
| 672 | { | ||
| 673 | return nanosleep_restart(restart, CLOCK_MONOTONIC); | ||
| 674 | } | ||
| 675 | |||
| 676 | static long __sched nanosleep_restart_real(struct restart_block *restart) | ||
| 677 | { | ||
| 678 | return nanosleep_restart(restart, CLOCK_REALTIME); | ||
| 679 | } | ||
| 680 | |||
| 681 | long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | ||
| 682 | const enum hrtimer_mode mode, const clockid_t clockid) | ||
| 683 | { | ||
| 684 | struct restart_block *restart; | ||
| 685 | struct hrtimer timer; | ||
| 686 | struct timespec tu; | ||
| 687 | ktime_t rem; | ||
| 688 | |||
| 689 | hrtimer_init(&timer, clockid); | ||
| 690 | |||
| 691 | timer.expires = timespec_to_ktime(*rqtp); | ||
| 692 | |||
| 693 | rem = schedule_hrtimer_interruptible(&timer, mode); | ||
| 694 | if (rem.tv64 <= 0) | ||
| 695 | return 0; | ||
| 696 | |||
| 697 | /* Absolute timers do not update the rmtp value: */ | ||
| 698 | if (mode == HRTIMER_ABS) | ||
| 699 | return -ERESTARTNOHAND; | ||
| 700 | |||
| 701 | tu = ktime_to_timespec(rem); | ||
| 702 | |||
| 703 | if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) | ||
| 704 | return -EFAULT; | ||
| 705 | |||
| 706 | restart = ¤t_thread_info()->restart_block; | ||
| 707 | restart->fn = (clockid == CLOCK_MONOTONIC) ? | ||
| 708 | nanosleep_restart_mono : nanosleep_restart_real; | ||
| 709 | restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; | ||
| 710 | restart->arg1 = timer.expires.tv64 >> 32; | ||
| 711 | restart->arg2 = (unsigned long) rmtp; | ||
| 712 | |||
| 713 | return -ERESTART_RESTARTBLOCK; | ||
| 714 | } | ||
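Because restart_block only carries unsigned long arguments, the 64-bit absolute expiry is split across arg0/arg1 here and reassembled in nanosleep_restart(). A sketch of that round trip (helper names are illustrative):

	/* Pack and unpack the 64-bit expiry exactly as the two functions
	 * above do, so it survives a 32-bit restart_block. */
	static void pack_expiry(struct restart_block *restart, ktime_t expires)
	{
		restart->arg0 = expires.tv64 & 0xFFFFFFFF;	/* low 32 bits  */
		restart->arg1 = expires.tv64 >> 32;		/* high 32 bits */
	}

	static ktime_t unpack_expiry(const struct restart_block *restart)
	{
		ktime_t expires;

		expires.tv64 = ((u64)restart->arg1 << 32) | (u64)restart->arg0;
		return expires;
	}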
| 715 | |||
| 716 | asmlinkage long | ||
| 717 | sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | ||
| 718 | { | ||
| 719 | struct timespec tu; | ||
| 720 | |||
| 721 | if (copy_from_user(&tu, rqtp, sizeof(tu))) | ||
| 722 | return -EFAULT; | ||
| 723 | |||
| 724 | if (!timespec_valid(&tu)) | ||
| 725 | return -EINVAL; | ||
| 726 | |||
| 727 | return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); | ||
| 728 | } | ||
| 729 | |||
| 730 | /* | ||
| 731 | * Functions related to boot-time initialization: | ||
| 732 | */ | ||
| 733 | static void __devinit init_hrtimers_cpu(int cpu) | ||
| 734 | { | ||
| 735 | struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); | ||
| 736 | int i; | ||
| 737 | |||
| 738 | for (i = 0; i < MAX_HRTIMER_BASES; i++) { | ||
| 739 | spin_lock_init(&base->lock); | ||
| 740 | base++; | ||
| 741 | } | ||
| 742 | } | ||
| 743 | |||
| 744 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 745 | |||
| 746 | static void migrate_hrtimer_list(struct hrtimer_base *old_base, | ||
| 747 | struct hrtimer_base *new_base) | ||
| 748 | { | ||
| 749 | struct hrtimer *timer; | ||
| 750 | struct rb_node *node; | ||
| 751 | |||
| 752 | while ((node = rb_first(&old_base->active))) { | ||
| 753 | timer = rb_entry(node, struct hrtimer, node); | ||
| 754 | __remove_hrtimer(timer, old_base); | ||
| 755 | timer->base = new_base; | ||
| 756 | enqueue_hrtimer(timer, new_base); | ||
| 757 | } | ||
| 758 | } | ||
| 759 | |||
| 760 | static void migrate_hrtimers(int cpu) | ||
| 761 | { | ||
| 762 | struct hrtimer_base *old_base, *new_base; | ||
| 763 | int i; | ||
| 764 | |||
| 765 | BUG_ON(cpu_online(cpu)); | ||
| 766 | old_base = per_cpu(hrtimer_bases, cpu); | ||
| 767 | new_base = get_cpu_var(hrtimer_bases); | ||
| 768 | |||
| 769 | local_irq_disable(); | ||
| 770 | |||
| 771 | for (i = 0; i < MAX_HRTIMER_BASES; i++) { | ||
| 772 | |||
| 773 | spin_lock(&new_base->lock); | ||
| 774 | spin_lock(&old_base->lock); | ||
| 775 | |||
| 776 | BUG_ON(old_base->curr_timer); | ||
| 777 | |||
| 778 | migrate_hrtimer_list(old_base, new_base); | ||
| 779 | |||
| 780 | spin_unlock(&old_base->lock); | ||
| 781 | spin_unlock(&new_base->lock); | ||
| 782 | old_base++; | ||
| 783 | new_base++; | ||
| 784 | } | ||
| 785 | |||
| 786 | local_irq_enable(); | ||
| 787 | put_cpu_var(hrtimer_bases); | ||
| 788 | } | ||
| 789 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 790 | |||
| 791 | static int __devinit hrtimer_cpu_notify(struct notifier_block *self, | ||
| 792 | unsigned long action, void *hcpu) | ||
| 793 | { | ||
| 794 | long cpu = (long)hcpu; | ||
| 795 | |||
| 796 | switch (action) { | ||
| 797 | |||
| 798 | case CPU_UP_PREPARE: | ||
| 799 | init_hrtimers_cpu(cpu); | ||
| 800 | break; | ||
| 801 | |||
| 802 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 803 | case CPU_DEAD: | ||
| 804 | migrate_hrtimers(cpu); | ||
| 805 | break; | ||
| 806 | #endif | ||
| 807 | |||
| 808 | default: | ||
| 809 | break; | ||
| 810 | } | ||
| 811 | |||
| 812 | return NOTIFY_OK; | ||
| 813 | } | ||
| 814 | |||
| 815 | static struct notifier_block __devinitdata hrtimers_nb = { | ||
| 816 | .notifier_call = hrtimer_cpu_notify, | ||
| 817 | }; | ||
| 818 | |||
| 819 | void __init hrtimers_init(void) | ||
| 820 | { | ||
| 821 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, | ||
| 822 | (void *)(long)smp_processor_id()); | ||
| 823 | register_cpu_notifier(&hrtimers_nb); | ||
| 824 | } | ||
| 825 | |||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 81c49a4d679e..97d5559997d2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -366,6 +366,8 @@ int request_irq(unsigned int irq, | |||
| 366 | action->next = NULL; | 366 | action->next = NULL; |
| 367 | action->dev_id = dev_id; | 367 | action->dev_id = dev_id; |
| 368 | 368 | ||
| 369 | select_smp_affinity(irq); | ||
| 370 | |||
| 369 | retval = setup_irq(irq, action); | 371 | retval = setup_irq(irq, action); |
| 370 | if (retval) | 372 | if (retval) |
| 371 | kfree(action); | 373 | kfree(action); |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f26e534c6585..d03b5eef8ce0 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -10,6 +10,8 @@ | |||
| 10 | #include <linux/proc_fs.h> | 10 | #include <linux/proc_fs.h> |
| 11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
| 12 | 12 | ||
| 13 | #include "internals.h" | ||
| 14 | |||
| 13 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | 15 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; |
| 14 | 16 | ||
| 15 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
| @@ -68,7 +70,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
| 68 | */ | 70 | */ |
| 69 | cpus_and(tmp, new_value, cpu_online_map); | 71 | cpus_and(tmp, new_value, cpu_online_map); |
| 70 | if (cpus_empty(tmp)) | 72 | if (cpus_empty(tmp)) |
| 71 | return -EINVAL; | 73 | /* Special case for empty set - allow the architecture |
| 74 | code to set default SMP affinity. */ | ||
| 75 | return select_smp_affinity(irq) ? -EINVAL : full_count; | ||
| 72 | 76 | ||
| 73 | proc_set_irq_affinity(irq, new_value); | 77 | proc_set_irq_affinity(irq, new_value); |
| 74 | 78 | ||
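For reference, the proc interface this handler backs is driven from user space by writing a hex CPU mask; a minimal sketch (the IRQ number and mask are arbitrary examples):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *mask = "3\n";	/* CPUs 0 and 1 */
		int fd = open("/proc/irq/19/smp_affinity", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* With this hunk, a mask containing no online CPUs is handed to
		 * the architecture's select_smp_affinity() default instead of
		 * always failing with EINVAL. */
		if (write(fd, mask, strlen(mask)) < 0)
			perror("write");
		close(fd);
		return 0;
	}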
diff --git a/kernel/itimer.c b/kernel/itimer.c index 7c1b25e25e47..c2c05c4ff28d 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c | |||
| @@ -12,36 +12,46 @@ | |||
| 12 | #include <linux/syscalls.h> | 12 | #include <linux/syscalls.h> |
| 13 | #include <linux/time.h> | 13 | #include <linux/time.h> |
| 14 | #include <linux/posix-timers.h> | 14 | #include <linux/posix-timers.h> |
| 15 | #include <linux/hrtimer.h> | ||
| 15 | 16 | ||
| 16 | #include <asm/uaccess.h> | 17 | #include <asm/uaccess.h> |
| 17 | 18 | ||
| 18 | static unsigned long it_real_value(struct signal_struct *sig) | 19 | /** |
| 20 | * itimer_get_remtime - get remaining time for the timer | ||
| 21 | * | ||
| 22 | * @timer: the timer to read | ||
| 23 | * | ||
| 24 | * Returns the delta between the expiry time and now, which can be | ||
| 25 | * less than zero or 1usec for a pending expired timer | ||
| 26 | */ | ||
| 27 | static struct timeval itimer_get_remtime(struct hrtimer *timer) | ||
| 19 | { | 28 | { |
| 20 | unsigned long val = 0; | 29 | ktime_t rem = hrtimer_get_remaining(timer); |
| 21 | if (timer_pending(&sig->real_timer)) { | ||
| 22 | val = sig->real_timer.expires - jiffies; | ||
| 23 | 30 | ||
| 24 | /* look out for negative/zero itimer.. */ | 31 | /* |
| 25 | if ((long) val <= 0) | 32 | * Racy but safe: if the itimer expires after the above |
| 26 | val = 1; | 33 | * hrtimer_get_remaining() call but before this condition |
| 27 | } | 34 | * then we return 0 - which is correct. |
| 28 | return val; | 35 | */ |
| 36 | if (hrtimer_active(timer)) { | ||
| 37 | if (rem.tv64 <= 0) | ||
| 38 | rem.tv64 = NSEC_PER_USEC; | ||
| 39 | } else | ||
| 40 | rem.tv64 = 0; | ||
| 41 | |||
| 42 | return ktime_to_timeval(rem); | ||
| 29 | } | 43 | } |
| 30 | 44 | ||
| 31 | int do_getitimer(int which, struct itimerval *value) | 45 | int do_getitimer(int which, struct itimerval *value) |
| 32 | { | 46 | { |
| 33 | struct task_struct *tsk = current; | 47 | struct task_struct *tsk = current; |
| 34 | unsigned long interval, val; | ||
| 35 | cputime_t cinterval, cval; | 48 | cputime_t cinterval, cval; |
| 36 | 49 | ||
| 37 | switch (which) { | 50 | switch (which) { |
| 38 | case ITIMER_REAL: | 51 | case ITIMER_REAL: |
| 39 | spin_lock_irq(&tsk->sighand->siglock); | 52 | value->it_value = itimer_get_remtime(&tsk->signal->real_timer); |
| 40 | interval = tsk->signal->it_real_incr; | 53 | value->it_interval = |
| 41 | val = it_real_value(tsk->signal); | 54 | ktime_to_timeval(tsk->signal->it_real_incr); |
| 42 | spin_unlock_irq(&tsk->sighand->siglock); | ||
| 43 | jiffies_to_timeval(val, &value->it_value); | ||
| 44 | jiffies_to_timeval(interval, &value->it_interval); | ||
| 45 | break; | 55 | break; |
| 46 | case ITIMER_VIRTUAL: | 56 | case ITIMER_VIRTUAL: |
| 47 | read_lock(&tasklist_lock); | 57 | read_lock(&tasklist_lock); |
| @@ -113,59 +123,45 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) | |||
| 113 | } | 123 | } |
| 114 | 124 | ||
| 115 | 125 | ||
| 116 | void it_real_fn(unsigned long __data) | 126 | /* |
| 127 | * The timer is automagically restarted when interval != 0 | ||
| 128 | */ | ||
| 129 | int it_real_fn(void *data) | ||
| 117 | { | 130 | { |
| 118 | struct task_struct * p = (struct task_struct *) __data; | 131 | struct task_struct *tsk = (struct task_struct *) data; |
| 119 | unsigned long inc = p->signal->it_real_incr; | ||
| 120 | 132 | ||
| 121 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); | 133 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); |
| 122 | 134 | ||
| 123 | /* | 135 | if (tsk->signal->it_real_incr.tv64 != 0) { |
| 124 | * Now restart the timer if necessary. We don't need any locking | 136 | hrtimer_forward(&tsk->signal->real_timer, |
| 125 | * here because do_setitimer makes sure we have finished running | 137 | tsk->signal->it_real_incr); |
| 126 | * before it touches anything. | 138 | |
| 127 | * Note, we KNOW we are (or should be) at a jiffie edge here so | 139 | return HRTIMER_RESTART; |
| 128 | * we don't need the +1 stuff. Also, we want to use the prior | 140 | } |
| 129 | * expire value so as to not "slip" a jiffie if we are late. | 141 | return HRTIMER_NORESTART; |
| 130 | * Deal with requesting a time prior to "now" here rather than | ||
| 131 | * in add_timer. | ||
| 132 | */ | ||
| 133 | if (!inc) | ||
| 134 | return; | ||
| 135 | while (time_before_eq(p->signal->real_timer.expires, jiffies)) | ||
| 136 | p->signal->real_timer.expires += inc; | ||
| 137 | add_timer(&p->signal->real_timer); | ||
| 138 | } | 142 | } |
| 139 | 143 | ||
| 140 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) | 144 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) |
| 141 | { | 145 | { |
| 142 | struct task_struct *tsk = current; | 146 | struct task_struct *tsk = current; |
| 143 | unsigned long val, interval, expires; | 147 | struct hrtimer *timer; |
| 148 | ktime_t expires; | ||
| 144 | cputime_t cval, cinterval, nval, ninterval; | 149 | cputime_t cval, cinterval, nval, ninterval; |
| 145 | 150 | ||
| 146 | switch (which) { | 151 | switch (which) { |
| 147 | case ITIMER_REAL: | 152 | case ITIMER_REAL: |
| 148 | again: | 153 | timer = &tsk->signal->real_timer; |
| 149 | spin_lock_irq(&tsk->sighand->siglock); | 154 | hrtimer_cancel(timer); |
| 150 | interval = tsk->signal->it_real_incr; | ||
| 151 | val = it_real_value(tsk->signal); | ||
| 152 | /* We are sharing ->siglock with it_real_fn() */ | ||
| 153 | if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) { | ||
| 154 | spin_unlock_irq(&tsk->sighand->siglock); | ||
| 155 | goto again; | ||
| 156 | } | ||
| 157 | tsk->signal->it_real_incr = | ||
| 158 | timeval_to_jiffies(&value->it_interval); | ||
| 159 | expires = timeval_to_jiffies(&value->it_value); | ||
| 160 | if (expires) | ||
| 161 | mod_timer(&tsk->signal->real_timer, | ||
| 162 | jiffies + 1 + expires); | ||
| 163 | spin_unlock_irq(&tsk->sighand->siglock); | ||
| 164 | if (ovalue) { | 155 | if (ovalue) { |
| 165 | jiffies_to_timeval(val, &ovalue->it_value); | 156 | ovalue->it_value = itimer_get_remtime(timer); |
| 166 | jiffies_to_timeval(interval, | 157 | ovalue->it_interval |
| 167 | &ovalue->it_interval); | 158 | = ktime_to_timeval(tsk->signal->it_real_incr); |
| 168 | } | 159 | } |
| 160 | tsk->signal->it_real_incr = | ||
| 161 | timeval_to_ktime(value->it_interval); | ||
| 162 | expires = timeval_to_ktime(value->it_value); | ||
| 163 | if (expires.tv64 != 0) | ||
| 164 | hrtimer_start(timer, expires, HRTIMER_REL); | ||
| 169 | break; | 165 | break; |
| 170 | case ITIMER_VIRTUAL: | 166 | case ITIMER_VIRTUAL: |
| 171 | nval = timeval_to_cputime(&value->it_value); | 167 | nval = timeval_to_cputime(&value->it_value); |
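The user-visible ITIMER_REAL behaviour is meant to be unchanged by the hrtimer conversion; a plain userspace consumer still looks like the sketch below (standard C library calls, not part of this patch). Once the first it_value expires, it_real_fn() above keeps the timer going with hrtimer_forward() for as long as it_real_incr is non-zero.

	#include <signal.h>
	#include <sys/time.h>
	#include <unistd.h>

	static void on_alarm(int sig)
	{
		(void)sig;
		/* write() is async-signal-safe, unlike printf(). */
		write(STDOUT_FILENO, "tick\n", 5);
	}

	int main(void)
	{
		struct itimerval val = {
			.it_value    = { .tv_sec = 1, .tv_usec = 0 },	/* first expiry */
			.it_interval = { .tv_sec = 1, .tv_usec = 0 },	/* then every 1s */
		};

		signal(SIGALRM, on_alarm);
		setitimer(ITIMER_REAL, &val, NULL);

		for (;;)
			pause();	/* each SIGALRM interrupts pause() */
		return 0;
	}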
diff --git a/kernel/kexec.c b/kernel/kexec.c index 2c95848fbce8..bf39d28e4c0e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | * Version 2. See the file COPYING for more details. | 6 | * Version 2. See the file COPYING for more details. |
| 7 | */ | 7 | */ |
| 8 | 8 | ||
| 9 | #include <linux/capability.h> | ||
| 9 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
| 10 | #include <linux/file.h> | 11 | #include <linux/file.h> |
| 11 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
| @@ -26,6 +27,9 @@ | |||
| 26 | #include <asm/system.h> | 27 | #include <asm/system.h> |
| 27 | #include <asm/semaphore.h> | 28 | #include <asm/semaphore.h> |
| 28 | 29 | ||
| 30 | /* Per cpu memory for storing cpu states in case of system crash. */ | ||
| 31 | note_buf_t* crash_notes; | ||
| 32 | |||
| 29 | /* Location of the reserved area for the crash kernel */ | 33 | /* Location of the reserved area for the crash kernel */ |
| 30 | struct resource crashk_res = { | 34 | struct resource crashk_res = { |
| 31 | .name = "Crash kernel", | 35 | .name = "Crash kernel", |
| @@ -1054,9 +1058,24 @@ void crash_kexec(struct pt_regs *regs) | |||
| 1054 | if (!locked) { | 1058 | if (!locked) { |
| 1055 | image = xchg(&kexec_crash_image, NULL); | 1059 | image = xchg(&kexec_crash_image, NULL); |
| 1056 | if (image) { | 1060 | if (image) { |
| 1057 | machine_crash_shutdown(regs); | 1061 | struct pt_regs fixed_regs; |
| 1062 | crash_setup_regs(&fixed_regs, regs); | ||
| 1063 | machine_crash_shutdown(&fixed_regs); | ||
| 1058 | machine_kexec(image); | 1064 | machine_kexec(image); |
| 1059 | } | 1065 | } |
| 1060 | xchg(&kexec_lock, 0); | 1066 | xchg(&kexec_lock, 0); |
| 1061 | } | 1067 | } |
| 1062 | } | 1068 | } |
| 1069 | |||
| 1070 | static int __init crash_notes_memory_init(void) | ||
| 1071 | { | ||
| 1072 | /* Allocate memory for saving cpu registers. */ | ||
| 1073 | crash_notes = alloc_percpu(note_buf_t); | ||
| 1074 | if (!crash_notes) { | ||
| 1075 | printk("Kexec: Memory allocation for saving cpu register" | ||
| 1076 | " states failed\n"); | ||
| 1077 | return -ENOMEM; | ||
| 1078 | } | ||
| 1079 | return 0; | ||
| 1080 | } | ||
| 1081 | module_init(crash_notes_memory_init) | ||
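crash_notes is an alloc_percpu() buffer, so the architecture crash path would reach a given CPU's slot via per_cpu_ptr(); a hedged sketch of such an accessor (not part of this hunk):

	/* Illustrative only: return the note buffer reserved for @cpu, or NULL
	 * if the boot-time allocation above failed. */
	static note_buf_t *get_crash_notes_for_cpu(int cpu)
	{
		if (!crash_notes)
			return NULL;
		return per_cpu_ptr(crash_notes, cpu);
	}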
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3bb71e63a37e..3ea6325228da 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -48,10 +48,11 @@ | |||
| 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
| 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
| 50 | 50 | ||
| 51 | static DEFINE_SPINLOCK(kprobe_lock); /* Protects kprobe_table */ | 51 | DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
| 52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
| 53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
| 54 | 54 | ||
| 55 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT | ||
| 55 | /* | 56 | /* |
| 56 | * kprobe->ainsn.insn points to the copy of the instruction to be | 57 | * kprobe->ainsn.insn points to the copy of the instruction to be |
| 57 | * single-stepped. x86_64, POWER4 and above have no-exec support and | 58 | * single-stepped. x86_64, POWER4 and above have no-exec support and |
| @@ -151,6 +152,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) | |||
| 151 | } | 152 | } |
| 152 | } | 153 | } |
| 153 | } | 154 | } |
| 155 | #endif | ||
| 154 | 156 | ||
| 155 | /* We have preemption disabled.. so it is safe to use __ versions */ | 157 | /* We have preemption disabled.. so it is safe to use __ versions */ |
| 156 | static inline void set_kprobe_instance(struct kprobe *kp) | 158 | static inline void set_kprobe_instance(struct kprobe *kp) |
| @@ -165,7 +167,7 @@ static inline void reset_kprobe_instance(void) | |||
| 165 | 167 | ||
| 166 | /* | 168 | /* |
| 167 | * This routine is called either: | 169 | * This routine is called either: |
| 168 | * - under the kprobe_lock spinlock - during kprobe_[un]register() | 170 | * - under the kprobe_mutex - during kprobe_[un]register() |
| 169 | * OR | 171 | * OR |
| 170 | * - with preemption disabled - from arch/xxx/kernel/kprobes.c | 172 | * - with preemption disabled - from arch/xxx/kernel/kprobes.c |
| 171 | */ | 173 | */ |
| @@ -418,7 +420,6 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 418 | /* | 420 | /* |
| 419 | * This is the second or subsequent kprobe at the address - handle | 421 | * This is the second or subsequent kprobe at the address - handle |
| 420 | * the intricacies | 422 | * the intricacies |
| 421 | * TODO: Move kcalloc outside the spin_lock | ||
| 422 | */ | 423 | */ |
| 423 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | 424 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, |
| 424 | struct kprobe *p) | 425 | struct kprobe *p) |
| @@ -430,7 +431,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
| 430 | copy_kprobe(old_p, p); | 431 | copy_kprobe(old_p, p); |
| 431 | ret = add_new_kprobe(old_p, p); | 432 | ret = add_new_kprobe(old_p, p); |
| 432 | } else { | 433 | } else { |
| 433 | ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); | 434 | ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); |
| 434 | if (!ap) | 435 | if (!ap) |
| 435 | return -ENOMEM; | 436 | return -ENOMEM; |
| 436 | add_aggr_kprobe(ap, old_p); | 437 | add_aggr_kprobe(ap, old_p); |
| @@ -440,25 +441,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
| 440 | return ret; | 441 | return ret; |
| 441 | } | 442 | } |
| 442 | 443 | ||
| 443 | /* kprobe removal house-keeping routines */ | ||
| 444 | static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) | ||
| 445 | { | ||
| 446 | arch_disarm_kprobe(p); | ||
| 447 | hlist_del_rcu(&p->hlist); | ||
| 448 | spin_unlock_irqrestore(&kprobe_lock, flags); | ||
| 449 | arch_remove_kprobe(p); | ||
| 450 | } | ||
| 451 | |||
| 452 | static inline void cleanup_aggr_kprobe(struct kprobe *old_p, | ||
| 453 | struct kprobe *p, unsigned long flags) | ||
| 454 | { | ||
| 455 | list_del_rcu(&p->list); | ||
| 456 | if (list_empty(&old_p->list)) | ||
| 457 | cleanup_kprobe(old_p, flags); | ||
| 458 | else | ||
| 459 | spin_unlock_irqrestore(&kprobe_lock, flags); | ||
| 460 | } | ||
| 461 | |||
| 462 | static int __kprobes in_kprobes_functions(unsigned long addr) | 444 | static int __kprobes in_kprobes_functions(unsigned long addr) |
| 463 | { | 445 | { |
| 464 | if (addr >= (unsigned long)__kprobes_text_start | 446 | if (addr >= (unsigned long)__kprobes_text_start |
| @@ -467,33 +449,44 @@ static int __kprobes in_kprobes_functions(unsigned long addr) | |||
| 467 | return 0; | 449 | return 0; |
| 468 | } | 450 | } |
| 469 | 451 | ||
| 470 | int __kprobes register_kprobe(struct kprobe *p) | 452 | static int __kprobes __register_kprobe(struct kprobe *p, |
| 453 | unsigned long called_from) | ||
| 471 | { | 454 | { |
| 472 | int ret = 0; | 455 | int ret = 0; |
| 473 | unsigned long flags = 0; | ||
| 474 | struct kprobe *old_p; | 456 | struct kprobe *old_p; |
| 475 | struct module *mod; | 457 | struct module *probed_mod; |
| 476 | 458 | ||
| 477 | if ((!kernel_text_address((unsigned long) p->addr)) || | 459 | if ((!kernel_text_address((unsigned long) p->addr)) || |
| 478 | in_kprobes_functions((unsigned long) p->addr)) | 460 | in_kprobes_functions((unsigned long) p->addr)) |
| 479 | return -EINVAL; | 461 | return -EINVAL; |
| 480 | 462 | ||
| 481 | if ((mod = module_text_address((unsigned long) p->addr)) && | 463 | p->mod_refcounted = 0; |
| 482 | (unlikely(!try_module_get(mod)))) | 464 | /* Check are we probing a module */ |
| 483 | return -EINVAL; | 465 | if ((probed_mod = module_text_address((unsigned long) p->addr))) { |
| 484 | 466 | struct module *calling_mod = module_text_address(called_from); | |
| 485 | if ((ret = arch_prepare_kprobe(p)) != 0) | 467 | /* We must allow modules to probe themself and |
| 486 | goto rm_kprobe; | 468 | * in this case avoid incrementing the module refcount, |
| 469 | * so as to allow unloading of self probing modules. | ||
| 470 | */ | ||
| 471 | if (calling_mod && (calling_mod != probed_mod)) { | ||
| 472 | if (unlikely(!try_module_get(probed_mod))) | ||
| 473 | return -EINVAL; | ||
| 474 | p->mod_refcounted = 1; | ||
| 475 | } else | ||
| 476 | probed_mod = NULL; | ||
| 477 | } | ||
| 487 | 478 | ||
| 488 | p->nmissed = 0; | 479 | p->nmissed = 0; |
| 489 | spin_lock_irqsave(&kprobe_lock, flags); | 480 | down(&kprobe_mutex); |
| 490 | old_p = get_kprobe(p->addr); | 481 | old_p = get_kprobe(p->addr); |
| 491 | if (old_p) { | 482 | if (old_p) { |
| 492 | ret = register_aggr_kprobe(old_p, p); | 483 | ret = register_aggr_kprobe(old_p, p); |
| 493 | goto out; | 484 | goto out; |
| 494 | } | 485 | } |
| 495 | 486 | ||
| 496 | arch_copy_kprobe(p); | 487 | if ((ret = arch_prepare_kprobe(p)) != 0) |
| 488 | goto out; | ||
| 489 | |||
| 497 | INIT_HLIST_NODE(&p->hlist); | 490 | INIT_HLIST_NODE(&p->hlist); |
| 498 | hlist_add_head_rcu(&p->hlist, | 491 | hlist_add_head_rcu(&p->hlist, |
| 499 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 492 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
| @@ -501,40 +494,66 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
| 501 | arch_arm_kprobe(p); | 494 | arch_arm_kprobe(p); |
| 502 | 495 | ||
| 503 | out: | 496 | out: |
| 504 | spin_unlock_irqrestore(&kprobe_lock, flags); | 497 | up(&kprobe_mutex); |
| 505 | rm_kprobe: | 498 | |
| 506 | if (ret == -EEXIST) | 499 | if (ret && probed_mod) |
| 507 | arch_remove_kprobe(p); | 500 | module_put(probed_mod); |
| 508 | if (ret && mod) | ||
| 509 | module_put(mod); | ||
| 510 | return ret; | 501 | return ret; |
| 511 | } | 502 | } |
| 512 | 503 | ||
| 504 | int __kprobes register_kprobe(struct kprobe *p) | ||
| 505 | { | ||
| 506 | return __register_kprobe(p, | ||
| 507 | (unsigned long)__builtin_return_address(0)); | ||
| 508 | } | ||
| 509 | |||
| 513 | void __kprobes unregister_kprobe(struct kprobe *p) | 510 | void __kprobes unregister_kprobe(struct kprobe *p) |
| 514 | { | 511 | { |
| 515 | unsigned long flags; | ||
| 516 | struct kprobe *old_p; | ||
| 517 | struct module *mod; | 512 | struct module *mod; |
| 513 | struct kprobe *old_p, *list_p; | ||
| 514 | int cleanup_p; | ||
| 518 | 515 | ||
| 519 | spin_lock_irqsave(&kprobe_lock, flags); | 516 | down(&kprobe_mutex); |
| 520 | old_p = get_kprobe(p->addr); | 517 | old_p = get_kprobe(p->addr); |
| 521 | if (old_p) { | 518 | if (unlikely(!old_p)) { |
| 522 | /* cleanup_*_kprobe() does the spin_unlock_irqrestore */ | 519 | up(&kprobe_mutex); |
| 523 | if (old_p->pre_handler == aggr_pre_handler) | 520 | return; |
| 524 | cleanup_aggr_kprobe(old_p, p, flags); | 521 | } |
| 525 | else | 522 | if (p != old_p) { |
| 526 | cleanup_kprobe(p, flags); | 523 | list_for_each_entry_rcu(list_p, &old_p->list, list) |
| 524 | if (list_p == p) | ||
| 525 | /* kprobe p is a valid probe */ | ||
| 526 | goto valid_p; | ||
| 527 | up(&kprobe_mutex); | ||
| 528 | return; | ||
| 529 | } | ||
| 530 | valid_p: | ||
| 531 | if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && | ||
| 532 | (p->list.next == &old_p->list) && | ||
| 533 | (p->list.prev == &old_p->list))) { | ||
| 534 | /* Only probe on the hash list */ | ||
| 535 | arch_disarm_kprobe(p); | ||
| 536 | hlist_del_rcu(&old_p->hlist); | ||
| 537 | cleanup_p = 1; | ||
| 538 | } else { | ||
| 539 | list_del_rcu(&p->list); | ||
| 540 | cleanup_p = 0; | ||
| 541 | } | ||
| 527 | 542 | ||
| 528 | synchronize_sched(); | 543 | up(&kprobe_mutex); |
| 529 | 544 | ||
| 530 | if ((mod = module_text_address((unsigned long)p->addr))) | 545 | synchronize_sched(); |
| 531 | module_put(mod); | 546 | if (p->mod_refcounted && |
| 547 | (mod = module_text_address((unsigned long)p->addr))) | ||
| 548 | module_put(mod); | ||
| 532 | 549 | ||
| 533 | if (old_p->pre_handler == aggr_pre_handler && | 550 | if (cleanup_p) { |
| 534 | list_empty(&old_p->list)) | 551 | if (p != old_p) { |
| 552 | list_del_rcu(&p->list); | ||
| 535 | kfree(old_p); | 553 | kfree(old_p); |
| 536 | } else | 554 | } |
| 537 | spin_unlock_irqrestore(&kprobe_lock, flags); | 555 | arch_remove_kprobe(p); |
| 556 | } | ||
| 538 | } | 557 | } |
| 539 | 558 | ||
| 540 | static struct notifier_block kprobe_exceptions_nb = { | 559 | static struct notifier_block kprobe_exceptions_nb = { |
| @@ -548,7 +567,8 @@ int __kprobes register_jprobe(struct jprobe *jp) | |||
| 548 | jp->kp.pre_handler = setjmp_pre_handler; | 567 | jp->kp.pre_handler = setjmp_pre_handler; |
| 549 | jp->kp.break_handler = longjmp_break_handler; | 568 | jp->kp.break_handler = longjmp_break_handler; |
| 550 | 569 | ||
| 551 | return register_kprobe(&jp->kp); | 570 | return __register_kprobe(&jp->kp, |
| 571 | (unsigned long)__builtin_return_address(0)); | ||
| 552 | } | 572 | } |
| 553 | 573 | ||
| 554 | void __kprobes unregister_jprobe(struct jprobe *jp) | 574 | void __kprobes unregister_jprobe(struct jprobe *jp) |
| @@ -588,7 +608,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
| 588 | 608 | ||
| 589 | rp->nmissed = 0; | 609 | rp->nmissed = 0; |
| 590 | /* Establish function entry probe point */ | 610 | /* Establish function entry probe point */ |
| 591 | if ((ret = register_kprobe(&rp->kp)) != 0) | 611 | if ((ret = __register_kprobe(&rp->kp, |
| 612 | (unsigned long)__builtin_return_address(0))) != 0) | ||
| 592 | free_rp_inst(rp); | 613 | free_rp_inst(rp); |
| 593 | return ret; | 614 | return ret; |
| 594 | } | 615 | } |
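The kprobes changes above do three related things: the kprobe_table spinlock becomes a sleeping kprobe_mutex (taken with down()/up()), which in turn lets register_aggr_kprobe() allocate with GFP_KERNEL instead of GFP_ATOMIC, and registration is funneled through __register_kprobe() so the caller's return address can tell whether a module is probing itself, in which case no reference is taken and the module stays unloadable. A hedged, stand-alone sketch of that refcount decision follows; the helper name and out-parameter are invented for illustration and are not the kernel's actual interface.

        #include <linux/kernel.h>
        #include <linux/module.h>

        /*
         * Decide whether a probe at 'addr', registered from the code at
         * 'called_from', must pin the probed module.  On success *modp is
         * the module to put at unregister time, or NULL if no reference
         * is held (built-in kernel text, or a module probing itself).
         */
        static int probe_take_module_ref(void *addr, unsigned long called_from,
                                         struct module **modp)
        {
                struct module *probed = module_text_address((unsigned long)addr);
                struct module *caller = module_text_address(called_from);

                *modp = NULL;
                if (!probed)
                        return 0;       /* probing built-in kernel text */
                if (caller == probed)
                        return 0;       /* self-probe: keep the module unloadable */
                if (unlikely(!try_module_get(probed)))
                        return -EINVAL; /* module is on its way out */
                *modp = probed;
                return 0;
        }

The matching unregister path only calls module_put() when a reference was actually taken, which is what the new mod_refcounted flag in the diff records.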
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 015fb69ad94d..d5eeae0fa5bc 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -15,6 +15,9 @@ | |||
| 15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
| 16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
| 17 | 17 | ||
| 18 | u64 uevent_seqnum; | ||
| 19 | char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug"; | ||
| 20 | |||
| 18 | #define KERNEL_ATTR_RO(_name) \ | 21 | #define KERNEL_ATTR_RO(_name) \ |
| 19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 22 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) |
| 20 | 23 | ||
| @@ -23,21 +26,29 @@ static struct subsys_attribute _name##_attr = \ | |||
| 23 | __ATTR(_name, 0644, _name##_show, _name##_store) | 26 | __ATTR(_name, 0644, _name##_show, _name##_store) |
| 24 | 27 | ||
| 25 | #ifdef CONFIG_HOTPLUG | 28 | #ifdef CONFIG_HOTPLUG |
| 26 | static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) | 29 | /* current uevent sequence number */ |
| 30 | static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) | ||
| 27 | { | 31 | { |
| 28 | return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); | 32 | return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); |
| 29 | } | 33 | } |
| 30 | KERNEL_ATTR_RO(hotplug_seqnum); | 34 | KERNEL_ATTR_RO(uevent_seqnum); |
| 31 | #endif | ||
| 32 | |||
| 33 | #ifdef CONFIG_KEXEC | ||
| 34 | #include <asm/kexec.h> | ||
| 35 | 35 | ||
| 36 | static ssize_t crash_notes_show(struct subsystem *subsys, char *page) | 36 | /* uevent helper program, used during early boot */ |
| 37 | static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) | ||
| 37 | { | 38 | { |
| 38 | return sprintf(page, "%p\n", (void *)crash_notes); | 39 | return sprintf(page, "%s\n", uevent_helper); |
| 39 | } | 40 | } |
| 40 | KERNEL_ATTR_RO(crash_notes); | 41 | static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) |
| 42 | { | ||
| 43 | if (count+1 > UEVENT_HELPER_PATH_LEN) | ||
| 44 | return -ENOENT; | ||
| 45 | memcpy(uevent_helper, page, count); | ||
| 46 | uevent_helper[count] = '\0'; | ||
| 47 | if (count && uevent_helper[count-1] == '\n') | ||
| 48 | uevent_helper[count-1] = '\0'; | ||
| 49 | return count; | ||
| 50 | } | ||
| 51 | KERNEL_ATTR_RW(uevent_helper); | ||
| 41 | #endif | 52 | #endif |
| 42 | 53 | ||
| 43 | decl_subsys(kernel, NULL, NULL); | 54 | decl_subsys(kernel, NULL, NULL); |
| @@ -45,10 +56,8 @@ EXPORT_SYMBOL_GPL(kernel_subsys); | |||
| 45 | 56 | ||
| 46 | static struct attribute * kernel_attrs[] = { | 57 | static struct attribute * kernel_attrs[] = { |
| 47 | #ifdef CONFIG_HOTPLUG | 58 | #ifdef CONFIG_HOTPLUG |
| 48 | &hotplug_seqnum_attr.attr, | 59 | &uevent_seqnum_attr.attr, |
| 49 | #endif | 60 | &uevent_helper_attr.attr, |
| 50 | #ifdef CONFIG_KEXEC | ||
| 51 | &crash_notes_attr.attr, | ||
| 52 | #endif | 61 | #endif |
| 53 | NULL | 62 | NULL |
| 54 | }; | 63 | }; |
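In ksysfs.c the kexec crash_notes attribute moves out (the notes are now per-CPU, see the kexec hunk earlier) and the hotplug attributes are renamed to uevent_*, with uevent_helper becoming writable. Its store handler bounds the copy, NUL-terminates the string and strips the trailing newline that a shell echo appends. The same trimming idiom, lifted out as a generic store callback, could look like the following; the buffer name, its size and the -ENAMETOOLONG return are placeholders (the kernel code above returns -ENOENT on overflow).

        #include <linux/errno.h>
        #include <linux/string.h>
        #include <linux/types.h>

        #define HELPER_PATH_MAX 256             /* placeholder size */

        static char helper_path[HELPER_PATH_MAX];

        /*
         * Copy 'count' bytes from a sysfs write, keep the result
         * NUL-terminated, and drop the trailing newline that
         * "echo /sbin/foo > file" adds.
         */
        static ssize_t helper_path_store(const char *page, size_t count)
        {
                if (count + 1 > HELPER_PATH_MAX)
                        return -ENAMETOOLONG;
                memcpy(helper_path, page, count);
                helper_path[count] = '\0';
                if (count && helper_path[count - 1] == '\n')
                        helper_path[count - 1] = '\0';
                return count;
        }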
diff --git a/kernel/module.c b/kernel/module.c index 2ea929d51ad0..618ed6e23ecc 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/syscalls.h> | 28 | #include <linux/syscalls.h> |
| 29 | #include <linux/fcntl.h> | 29 | #include <linux/fcntl.h> |
| 30 | #include <linux/rcupdate.h> | 30 | #include <linux/rcupdate.h> |
| 31 | #include <linux/capability.h> | ||
| 31 | #include <linux/cpu.h> | 32 | #include <linux/cpu.h> |
| 32 | #include <linux/moduleparam.h> | 33 | #include <linux/moduleparam.h> |
| 33 | #include <linux/errno.h> | 34 | #include <linux/errno.h> |
| @@ -496,15 +497,15 @@ static void module_unload_free(struct module *mod) | |||
| 496 | } | 497 | } |
| 497 | 498 | ||
| 498 | #ifdef CONFIG_MODULE_FORCE_UNLOAD | 499 | #ifdef CONFIG_MODULE_FORCE_UNLOAD |
| 499 | static inline int try_force(unsigned int flags) | 500 | static inline int try_force_unload(unsigned int flags) |
| 500 | { | 501 | { |
| 501 | int ret = (flags & O_TRUNC); | 502 | int ret = (flags & O_TRUNC); |
| 502 | if (ret) | 503 | if (ret) |
| 503 | add_taint(TAINT_FORCED_MODULE); | 504 | add_taint(TAINT_FORCED_RMMOD); |
| 504 | return ret; | 505 | return ret; |
| 505 | } | 506 | } |
| 506 | #else | 507 | #else |
| 507 | static inline int try_force(unsigned int flags) | 508 | static inline int try_force_unload(unsigned int flags) |
| 508 | { | 509 | { |
| 509 | return 0; | 510 | return 0; |
| 510 | } | 511 | } |
| @@ -524,7 +525,7 @@ static int __try_stop_module(void *_sref) | |||
| 524 | 525 | ||
| 525 | /* If it's not unused, quit unless we are told to block. */ | 526 | /* If it's not unused, quit unless we are told to block. */ |
| 526 | if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { | 527 | if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { |
| 527 | if (!(*sref->forced = try_force(sref->flags))) | 528 | if (!(*sref->forced = try_force_unload(sref->flags))) |
| 528 | return -EWOULDBLOCK; | 529 | return -EWOULDBLOCK; |
| 529 | } | 530 | } |
| 530 | 531 | ||
| @@ -609,7 +610,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
| 609 | /* If it has an init func, it must have an exit func to unload */ | 610 | /* If it has an init func, it must have an exit func to unload */ |
| 610 | if ((mod->init != NULL && mod->exit == NULL) | 611 | if ((mod->init != NULL && mod->exit == NULL) |
| 611 | || mod->unsafe) { | 612 | || mod->unsafe) { |
| 612 | forced = try_force(flags); | 613 | forced = try_force_unload(flags); |
| 613 | if (!forced) { | 614 | if (!forced) { |
| 614 | /* This module can't be removed */ | 615 | /* This module can't be removed */ |
| 615 | ret = -EBUSY; | 616 | ret = -EBUSY; |
| @@ -958,7 +959,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, | |||
| 958 | unsigned long ret; | 959 | unsigned long ret; |
| 959 | const unsigned long *crc; | 960 | const unsigned long *crc; |
| 960 | 961 | ||
| 961 | spin_lock_irq(&modlist_lock); | ||
| 962 | ret = __find_symbol(name, &owner, &crc, mod->license_gplok); | 962 | ret = __find_symbol(name, &owner, &crc, mod->license_gplok); |
| 963 | if (ret) { | 963 | if (ret) { |
| 964 | /* use_module can fail due to OOM, or module unloading */ | 964 | /* use_module can fail due to OOM, or module unloading */ |
| @@ -966,7 +966,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, | |||
| 966 | !use_module(mod, owner)) | 966 | !use_module(mod, owner)) |
| 967 | ret = 0; | 967 | ret = 0; |
| 968 | } | 968 | } |
| 969 | spin_unlock_irq(&modlist_lock); | ||
| 970 | return ret; | 969 | return ret; |
| 971 | } | 970 | } |
| 972 | 971 | ||
| @@ -1204,6 +1203,39 @@ void *__symbol_get(const char *symbol) | |||
| 1204 | } | 1203 | } |
| 1205 | EXPORT_SYMBOL_GPL(__symbol_get); | 1204 | EXPORT_SYMBOL_GPL(__symbol_get); |
| 1206 | 1205 | ||
| 1206 | /* | ||
| 1207 | * Ensure that an exported symbol [global namespace] does not already exist | ||
| 1208 | * in the Kernel or in some other modules exported symbol table. | ||
| 1209 | */ | ||
| 1210 | static int verify_export_symbols(struct module *mod) | ||
| 1211 | { | ||
| 1212 | const char *name = NULL; | ||
| 1213 | unsigned long i, ret = 0; | ||
| 1214 | struct module *owner; | ||
| 1215 | const unsigned long *crc; | ||
| 1216 | |||
| 1217 | for (i = 0; i < mod->num_syms; i++) | ||
| 1218 | if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { | ||
| 1219 | name = mod->syms[i].name; | ||
| 1220 | ret = -ENOEXEC; | ||
| 1221 | goto dup; | ||
| 1222 | } | ||
| 1223 | |||
| 1224 | for (i = 0; i < mod->num_gpl_syms; i++) | ||
| 1225 | if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { | ||
| 1226 | name = mod->gpl_syms[i].name; | ||
| 1227 | ret = -ENOEXEC; | ||
| 1228 | goto dup; | ||
| 1229 | } | ||
| 1230 | |||
| 1231 | dup: | ||
| 1232 | if (ret) | ||
| 1233 | printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n", | ||
| 1234 | mod->name, name, module_name(owner)); | ||
| 1235 | |||
| 1236 | return ret; | ||
| 1237 | } | ||
| 1238 | |||
| 1207 | /* Change all symbols so that sh_value encodes the pointer directly. */ | 1239 | /* Change all symbols so that sh_value encodes the pointer directly. */ |
| 1208 | static int simplify_symbols(Elf_Shdr *sechdrs, | 1240 | static int simplify_symbols(Elf_Shdr *sechdrs, |
| 1209 | unsigned int symindex, | 1241 | unsigned int symindex, |
| @@ -1715,6 +1747,11 @@ static struct module *load_module(void __user *umod, | |||
| 1715 | /* Set up license info based on the info section */ | 1747 | /* Set up license info based on the info section */ |
| 1716 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1748 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
| 1717 | 1749 | ||
| 1750 | if (strcmp(mod->name, "ndiswrapper") == 0) | ||
| 1751 | add_taint(TAINT_PROPRIETARY_MODULE); | ||
| 1752 | if (strcmp(mod->name, "driverloader") == 0) | ||
| 1753 | add_taint(TAINT_PROPRIETARY_MODULE); | ||
| 1754 | |||
| 1718 | #ifdef CONFIG_MODULE_UNLOAD | 1755 | #ifdef CONFIG_MODULE_UNLOAD |
| 1719 | /* Set up MODINFO_ATTR fields */ | 1756 | /* Set up MODINFO_ATTR fields */ |
| 1720 | setup_modinfo(mod, sechdrs, infoindex); | 1757 | setup_modinfo(mod, sechdrs, infoindex); |
| @@ -1767,6 +1804,12 @@ static struct module *load_module(void __user *umod, | |||
| 1767 | goto cleanup; | 1804 | goto cleanup; |
| 1768 | } | 1805 | } |
| 1769 | 1806 | ||
| 1807 | /* Find duplicate symbols */ | ||
| 1808 | err = verify_export_symbols(mod); | ||
| 1809 | |||
| 1810 | if (err < 0) | ||
| 1811 | goto cleanup; | ||
| 1812 | |||
| 1770 | /* Set up and sort exception table */ | 1813 | /* Set up and sort exception table */ |
| 1771 | mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); | 1814 | mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); |
| 1772 | mod->extable = extable = (void *)sechdrs[exindex].sh_addr; | 1815 | mod->extable = extable = (void *)sechdrs[exindex].sh_addr; |
| @@ -1854,8 +1897,7 @@ static struct module *load_module(void __user *umod, | |||
| 1854 | kfree(args); | 1897 | kfree(args); |
| 1855 | free_hdr: | 1898 | free_hdr: |
| 1856 | vfree(hdr); | 1899 | vfree(hdr); |
| 1857 | if (err < 0) return ERR_PTR(err); | 1900 | return ERR_PTR(err); |
| 1858 | else return ptr; | ||
| 1859 | 1901 | ||
| 1860 | truncated: | 1902 | truncated: |
| 1861 | printk(KERN_ERR "Module len %lu truncated\n", len); | 1903 | printk(KERN_ERR "Module len %lu truncated\n", len); |
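Beyond renaming try_force() to try_force_unload() (now tainting with TAINT_FORCED_RMMOD rather than TAINT_FORCED_MODULE), the module.c changes taint known wrapper modules by name, drop the modlist_lock around resolve_symbol(), and add verify_export_symbols(), which refuses to load a module whose exported symbols collide with ones already known to the kernel or another module. The shape of that check, reduced to plain C so it can stand alone, is roughly the two-pass scan below; it is an illustration only, not the kernel routine, and the symbol tables are simple string arrays here.

        #include <stdio.h>
        #include <string.h>

        /*
         * Reject a new symbol table if any name collides with one that is
         * already exported; the caller unwinds the whole load on failure,
         * mirroring the single error path in verify_export_symbols().
         */
        static int verify_no_duplicates(const char **new_syms, int n_new,
                                        const char **known, int n_known)
        {
                int i, j;

                for (i = 0; i < n_new; i++)
                        for (j = 0; j < n_known; j++)
                                if (strcmp(new_syms[i], known[j]) == 0) {
                                        fprintf(stderr, "duplicate symbol %s\n",
                                                new_syms[i]);
                                        return -1;
                                }
                return 0;
        }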
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c new file mode 100644 index 000000000000..f4913c376950 --- /dev/null +++ b/kernel/mutex-debug.c | |||
| @@ -0,0 +1,462 @@ | |||
| 1 | /* | ||
| 2 | * kernel/mutex-debug.c | ||
| 3 | * | ||
| 4 | * Debugging code for mutexes | ||
| 5 | * | ||
| 6 | * Started by Ingo Molnar: | ||
| 7 | * | ||
| 8 | * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 9 | * | ||
| 10 | * lock debugging, locking tree, deadlock detection started by: | ||
| 11 | * | ||
| 12 | * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey | ||
| 13 | * Released under the General Public License (GPL). | ||
| 14 | */ | ||
| 15 | #include <linux/mutex.h> | ||
| 16 | #include <linux/sched.h> | ||
| 17 | #include <linux/delay.h> | ||
| 18 | #include <linux/module.h> | ||
| 19 | #include <linux/spinlock.h> | ||
| 20 | #include <linux/kallsyms.h> | ||
| 21 | #include <linux/interrupt.h> | ||
| 22 | |||
| 23 | #include "mutex-debug.h" | ||
| 24 | |||
| 25 | /* | ||
| 26 | * We need a global lock when we walk through the multi-process | ||
| 27 | * lock tree. Only used in the deadlock-debugging case. | ||
| 28 | */ | ||
| 29 | DEFINE_SPINLOCK(debug_mutex_lock); | ||
| 30 | |||
| 31 | /* | ||
| 32 | * All locks held by all tasks, in a single global list: | ||
| 33 | */ | ||
| 34 | LIST_HEAD(debug_mutex_held_locks); | ||
| 35 | |||
| 36 | /* | ||
| 37 | * In the debug case we carry the caller's instruction pointer into | ||
| 38 | * other functions, but we dont want the function argument overhead | ||
| 39 | * in the nondebug case - hence these macros: | ||
| 40 | */ | ||
| 41 | #define __IP_DECL__ , unsigned long ip | ||
| 42 | #define __IP__ , ip | ||
| 43 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
| 44 | |||
| 45 | /* | ||
| 46 | * "mutex debugging enabled" flag. We turn it off when we detect | ||
| 47 | * the first problem because we dont want to recurse back | ||
| 48 | * into the tracing code when doing error printk or | ||
| 49 | * executing a BUG(): | ||
| 50 | */ | ||
| 51 | int debug_mutex_on = 1; | ||
| 52 | |||
| 53 | static void printk_task(struct task_struct *p) | ||
| 54 | { | ||
| 55 | if (p) | ||
| 56 | printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
| 57 | else | ||
| 58 | printk("<none>"); | ||
| 59 | } | ||
| 60 | |||
| 61 | static void printk_ti(struct thread_info *ti) | ||
| 62 | { | ||
| 63 | if (ti) | ||
| 64 | printk_task(ti->task); | ||
| 65 | else | ||
| 66 | printk("<none>"); | ||
| 67 | } | ||
| 68 | |||
| 69 | static void printk_task_short(struct task_struct *p) | ||
| 70 | { | ||
| 71 | if (p) | ||
| 72 | printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
| 73 | else | ||
| 74 | printk("<none>"); | ||
| 75 | } | ||
| 76 | |||
| 77 | static void printk_lock(struct mutex *lock, int print_owner) | ||
| 78 | { | ||
| 79 | printk(" [%p] {%s}\n", lock, lock->name); | ||
| 80 | |||
| 81 | if (print_owner && lock->owner) { | ||
| 82 | printk(".. held by: "); | ||
| 83 | printk_ti(lock->owner); | ||
| 84 | printk("\n"); | ||
| 85 | } | ||
| 86 | if (lock->owner) { | ||
| 87 | printk("... acquired at: "); | ||
| 88 | print_symbol("%s\n", lock->acquire_ip); | ||
| 89 | } | ||
| 90 | } | ||
| 91 | |||
| 92 | /* | ||
| 93 | * printk locks held by a task: | ||
| 94 | */ | ||
| 95 | static void show_task_locks(struct task_struct *p) | ||
| 96 | { | ||
| 97 | switch (p->state) { | ||
| 98 | case TASK_RUNNING: printk("R"); break; | ||
| 99 | case TASK_INTERRUPTIBLE: printk("S"); break; | ||
| 100 | case TASK_UNINTERRUPTIBLE: printk("D"); break; | ||
| 101 | case TASK_STOPPED: printk("T"); break; | ||
| 102 | case EXIT_ZOMBIE: printk("Z"); break; | ||
| 103 | case EXIT_DEAD: printk("X"); break; | ||
| 104 | default: printk("?"); break; | ||
| 105 | } | ||
| 106 | printk_task(p); | ||
| 107 | if (p->blocked_on) { | ||
| 108 | struct mutex *lock = p->blocked_on->lock; | ||
| 109 | |||
| 110 | printk(" blocked on mutex:"); | ||
| 111 | printk_lock(lock, 1); | ||
| 112 | } else | ||
| 113 | printk(" (not blocked on mutex)\n"); | ||
| 114 | } | ||
| 115 | |||
| 116 | /* | ||
| 117 | * printk all locks held in the system (if filter == NULL), | ||
| 118 | * or all locks belonging to a single task (if filter != NULL): | ||
| 119 | */ | ||
| 120 | void show_held_locks(struct task_struct *filter) | ||
| 121 | { | ||
| 122 | struct list_head *curr, *cursor = NULL; | ||
| 123 | struct mutex *lock; | ||
| 124 | struct thread_info *t; | ||
| 125 | unsigned long flags; | ||
| 126 | int count = 0; | ||
| 127 | |||
| 128 | if (filter) { | ||
| 129 | printk("------------------------------\n"); | ||
| 130 | printk("| showing all locks held by: | ("); | ||
| 131 | printk_task_short(filter); | ||
| 132 | printk("):\n"); | ||
| 133 | printk("------------------------------\n"); | ||
| 134 | } else { | ||
| 135 | printk("---------------------------\n"); | ||
| 136 | printk("| showing all locks held: |\n"); | ||
| 137 | printk("---------------------------\n"); | ||
| 138 | } | ||
| 139 | |||
| 140 | /* | ||
| 141 | * Play safe and acquire the global trace lock. We | ||
| 142 | * cannot printk with that lock held so we iterate | ||
| 143 | * very carefully: | ||
| 144 | */ | ||
| 145 | next: | ||
| 146 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
| 147 | list_for_each(curr, &debug_mutex_held_locks) { | ||
| 148 | if (cursor && curr != cursor) | ||
| 149 | continue; | ||
| 150 | lock = list_entry(curr, struct mutex, held_list); | ||
| 151 | t = lock->owner; | ||
| 152 | if (filter && (t != filter->thread_info)) | ||
| 153 | continue; | ||
| 154 | count++; | ||
| 155 | cursor = curr->next; | ||
| 156 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 157 | |||
| 158 | printk("\n#%03d: ", count); | ||
| 159 | printk_lock(lock, filter ? 0 : 1); | ||
| 160 | goto next; | ||
| 161 | } | ||
| 162 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 163 | printk("\n"); | ||
| 164 | } | ||
| 165 | |||
| 166 | void mutex_debug_show_all_locks(void) | ||
| 167 | { | ||
| 168 | struct task_struct *g, *p; | ||
| 169 | int count = 10; | ||
| 170 | int unlock = 1; | ||
| 171 | |||
| 172 | printk("\nShowing all blocking locks in the system:\n"); | ||
| 173 | |||
| 174 | /* | ||
| 175 | * Here we try to get the tasklist_lock as hard as possible, | ||
| 176 | * if not successful after 2 seconds we ignore it (but keep | ||
| 177 | * trying). This is to enable a debug printout even if a | ||
| 178 | * tasklist_lock-holding task deadlocks or crashes. | ||
| 179 | */ | ||
| 180 | retry: | ||
| 181 | if (!read_trylock(&tasklist_lock)) { | ||
| 182 | if (count == 10) | ||
| 183 | printk("hm, tasklist_lock locked, retrying... "); | ||
| 184 | if (count) { | ||
| 185 | count--; | ||
| 186 | printk(" #%d", 10-count); | ||
| 187 | mdelay(200); | ||
| 188 | goto retry; | ||
| 189 | } | ||
| 190 | printk(" ignoring it.\n"); | ||
| 191 | unlock = 0; | ||
| 192 | } | ||
| 193 | if (count != 10) | ||
| 194 | printk(" locked it.\n"); | ||
| 195 | |||
| 196 | do_each_thread(g, p) { | ||
| 197 | show_task_locks(p); | ||
| 198 | if (!unlock) | ||
| 199 | if (read_trylock(&tasklist_lock)) | ||
| 200 | unlock = 1; | ||
| 201 | } while_each_thread(g, p); | ||
| 202 | |||
| 203 | printk("\n"); | ||
| 204 | show_held_locks(NULL); | ||
| 205 | printk("=============================================\n\n"); | ||
| 206 | |||
| 207 | if (unlock) | ||
| 208 | read_unlock(&tasklist_lock); | ||
| 209 | } | ||
| 210 | |||
| 211 | static void report_deadlock(struct task_struct *task, struct mutex *lock, | ||
| 212 | struct mutex *lockblk, unsigned long ip) | ||
| 213 | { | ||
| 214 | printk("\n%s/%d is trying to acquire this lock:\n", | ||
| 215 | current->comm, current->pid); | ||
| 216 | printk_lock(lock, 1); | ||
| 217 | printk("... trying at: "); | ||
| 218 | print_symbol("%s\n", ip); | ||
| 219 | show_held_locks(current); | ||
| 220 | |||
| 221 | if (lockblk) { | ||
| 222 | printk("but %s/%d is deadlocking current task %s/%d!\n\n", | ||
| 223 | task->comm, task->pid, current->comm, current->pid); | ||
| 224 | printk("\n%s/%d is blocked on this lock:\n", | ||
| 225 | task->comm, task->pid); | ||
| 226 | printk_lock(lockblk, 1); | ||
| 227 | |||
| 228 | show_held_locks(task); | ||
| 229 | |||
| 230 | printk("\n%s/%d's [blocked] stackdump:\n\n", | ||
| 231 | task->comm, task->pid); | ||
| 232 | show_stack(task, NULL); | ||
| 233 | } | ||
| 234 | |||
| 235 | printk("\n%s/%d's [current] stackdump:\n\n", | ||
| 236 | current->comm, current->pid); | ||
| 237 | dump_stack(); | ||
| 238 | mutex_debug_show_all_locks(); | ||
| 239 | printk("[ turning off deadlock detection. Please report this. ]\n\n"); | ||
| 240 | local_irq_disable(); | ||
| 241 | } | ||
| 242 | |||
| 243 | /* | ||
| 244 | * Recursively check for mutex deadlocks: | ||
| 245 | */ | ||
| 246 | static int check_deadlock(struct mutex *lock, int depth, | ||
| 247 | struct thread_info *ti, unsigned long ip) | ||
| 248 | { | ||
| 249 | struct mutex *lockblk; | ||
| 250 | struct task_struct *task; | ||
| 251 | |||
| 252 | if (!debug_mutex_on) | ||
| 253 | return 0; | ||
| 254 | |||
| 255 | ti = lock->owner; | ||
| 256 | if (!ti) | ||
| 257 | return 0; | ||
| 258 | |||
| 259 | task = ti->task; | ||
| 260 | lockblk = NULL; | ||
| 261 | if (task->blocked_on) | ||
| 262 | lockblk = task->blocked_on->lock; | ||
| 263 | |||
| 264 | /* Self-deadlock: */ | ||
| 265 | if (current == task) { | ||
| 266 | DEBUG_OFF(); | ||
| 267 | if (depth) | ||
| 268 | return 1; | ||
| 269 | printk("\n==========================================\n"); | ||
| 270 | printk( "[ BUG: lock recursion deadlock detected! |\n"); | ||
| 271 | printk( "------------------------------------------\n"); | ||
| 272 | report_deadlock(task, lock, NULL, ip); | ||
| 273 | return 0; | ||
| 274 | } | ||
| 275 | |||
| 276 | /* Ugh, something corrupted the lock data structure? */ | ||
| 277 | if (depth > 20) { | ||
| 278 | DEBUG_OFF(); | ||
| 279 | printk("\n===========================================\n"); | ||
| 280 | printk( "[ BUG: infinite lock dependency detected!? |\n"); | ||
| 281 | printk( "-------------------------------------------\n"); | ||
| 282 | report_deadlock(task, lock, lockblk, ip); | ||
| 283 | return 0; | ||
| 284 | } | ||
| 285 | |||
| 286 | /* Recursively check for dependencies: */ | ||
| 287 | if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) { | ||
| 288 | printk("\n============================================\n"); | ||
| 289 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | ||
| 290 | printk( "--------------------------------------------\n"); | ||
| 291 | report_deadlock(task, lock, lockblk, ip); | ||
| 292 | return 0; | ||
| 293 | } | ||
| 294 | return 0; | ||
| 295 | } | ||
| 296 | |||
| 297 | /* | ||
| 298 | * Called when a task exits, this function checks whether the | ||
| 299 | * task is holding any locks, and reports the first one if so: | ||
| 300 | */ | ||
| 301 | void mutex_debug_check_no_locks_held(struct task_struct *task) | ||
| 302 | { | ||
| 303 | struct list_head *curr, *next; | ||
| 304 | struct thread_info *t; | ||
| 305 | unsigned long flags; | ||
| 306 | struct mutex *lock; | ||
| 307 | |||
| 308 | if (!debug_mutex_on) | ||
| 309 | return; | ||
| 310 | |||
| 311 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
| 312 | list_for_each_safe(curr, next, &debug_mutex_held_locks) { | ||
| 313 | lock = list_entry(curr, struct mutex, held_list); | ||
| 314 | t = lock->owner; | ||
| 315 | if (t != task->thread_info) | ||
| 316 | continue; | ||
| 317 | list_del_init(curr); | ||
| 318 | DEBUG_OFF(); | ||
| 319 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 320 | |||
| 321 | printk("BUG: %s/%d, lock held at task exit time!\n", | ||
| 322 | task->comm, task->pid); | ||
| 323 | printk_lock(lock, 1); | ||
| 324 | if (lock->owner != task->thread_info) | ||
| 325 | printk("exiting task is not even the owner??\n"); | ||
| 326 | return; | ||
| 327 | } | ||
| 328 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 329 | } | ||
| 330 | |||
| 331 | /* | ||
| 332 | * Called when kernel memory is freed (or unmapped), or if a mutex | ||
| 333 | * is destroyed or reinitialized - this code checks whether there is | ||
| 334 | * any held lock in the memory range of <from> to <to>: | ||
| 335 | */ | ||
| 336 | void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | ||
| 337 | { | ||
| 338 | struct list_head *curr, *next; | ||
| 339 | const void *to = from + len; | ||
| 340 | unsigned long flags; | ||
| 341 | struct mutex *lock; | ||
| 342 | void *lock_addr; | ||
| 343 | |||
| 344 | if (!debug_mutex_on) | ||
| 345 | return; | ||
| 346 | |||
| 347 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
| 348 | list_for_each_safe(curr, next, &debug_mutex_held_locks) { | ||
| 349 | lock = list_entry(curr, struct mutex, held_list); | ||
| 350 | lock_addr = lock; | ||
| 351 | if (lock_addr < from || lock_addr >= to) | ||
| 352 | continue; | ||
| 353 | list_del_init(curr); | ||
| 354 | DEBUG_OFF(); | ||
| 355 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 356 | |||
| 357 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", | ||
| 358 | current->comm, current->pid, lock, from, to); | ||
| 359 | dump_stack(); | ||
| 360 | printk_lock(lock, 1); | ||
| 361 | if (lock->owner != current_thread_info()) | ||
| 362 | printk("freeing task is not even the owner??\n"); | ||
| 363 | return; | ||
| 364 | } | ||
| 365 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
| 366 | } | ||
| 367 | |||
| 368 | /* | ||
| 369 | * Must be called with lock->wait_lock held. | ||
| 370 | */ | ||
| 371 | void debug_mutex_set_owner(struct mutex *lock, | ||
| 372 | struct thread_info *new_owner __IP_DECL__) | ||
| 373 | { | ||
| 374 | lock->owner = new_owner; | ||
| 375 | DEBUG_WARN_ON(!list_empty(&lock->held_list)); | ||
| 376 | if (debug_mutex_on) { | ||
| 377 | list_add_tail(&lock->held_list, &debug_mutex_held_locks); | ||
| 378 | lock->acquire_ip = ip; | ||
| 379 | } | ||
| 380 | } | ||
| 381 | |||
| 382 | void debug_mutex_init_waiter(struct mutex_waiter *waiter) | ||
| 383 | { | ||
| 384 | memset(waiter, 0x11, sizeof(*waiter)); | ||
| 385 | waiter->magic = waiter; | ||
| 386 | INIT_LIST_HEAD(&waiter->list); | ||
| 387 | } | ||
| 388 | |||
| 389 | void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) | ||
| 390 | { | ||
| 391 | SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); | ||
| 392 | DEBUG_WARN_ON(list_empty(&lock->wait_list)); | ||
| 393 | DEBUG_WARN_ON(waiter->magic != waiter); | ||
| 394 | DEBUG_WARN_ON(list_empty(&waiter->list)); | ||
| 395 | } | ||
| 396 | |||
| 397 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) | ||
| 398 | { | ||
| 399 | DEBUG_WARN_ON(!list_empty(&waiter->list)); | ||
| 400 | memset(waiter, 0x22, sizeof(*waiter)); | ||
| 401 | } | ||
| 402 | |||
| 403 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, | ||
| 404 | struct thread_info *ti __IP_DECL__) | ||
| 405 | { | ||
| 406 | SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); | ||
| 407 | check_deadlock(lock, 0, ti, ip); | ||
| 408 | /* Mark the current thread as blocked on the lock: */ | ||
| 409 | ti->task->blocked_on = waiter; | ||
| 410 | waiter->lock = lock; | ||
| 411 | } | ||
| 412 | |||
| 413 | void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | ||
| 414 | struct thread_info *ti) | ||
| 415 | { | ||
| 416 | DEBUG_WARN_ON(list_empty(&waiter->list)); | ||
| 417 | DEBUG_WARN_ON(waiter->task != ti->task); | ||
| 418 | DEBUG_WARN_ON(ti->task->blocked_on != waiter); | ||
| 419 | ti->task->blocked_on = NULL; | ||
| 420 | |||
| 421 | list_del_init(&waiter->list); | ||
| 422 | waiter->task = NULL; | ||
| 423 | } | ||
| 424 | |||
| 425 | void debug_mutex_unlock(struct mutex *lock) | ||
| 426 | { | ||
| 427 | DEBUG_WARN_ON(lock->magic != lock); | ||
| 428 | DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | ||
| 429 | DEBUG_WARN_ON(lock->owner != current_thread_info()); | ||
| 430 | if (debug_mutex_on) { | ||
| 431 | DEBUG_WARN_ON(list_empty(&lock->held_list)); | ||
| 432 | list_del_init(&lock->held_list); | ||
| 433 | } | ||
| 434 | } | ||
| 435 | |||
| 436 | void debug_mutex_init(struct mutex *lock, const char *name) | ||
| 437 | { | ||
| 438 | /* | ||
| 439 | * Make sure we are not reinitializing a held lock: | ||
| 440 | */ | ||
| 441 | mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | ||
| 442 | lock->owner = NULL; | ||
| 443 | INIT_LIST_HEAD(&lock->held_list); | ||
| 444 | lock->name = name; | ||
| 445 | lock->magic = lock; | ||
| 446 | } | ||
| 447 | |||
| 448 | /*** | ||
| 449 | * mutex_destroy - mark a mutex unusable | ||
| 450 | * @lock: the mutex to be destroyed | ||
| 451 | * | ||
| 452 | * This function marks the mutex uninitialized, and any subsequent | ||
| 453 | * use of the mutex is forbidden. The mutex must not be locked when | ||
| 454 | * this function is called. | ||
| 455 | */ | ||
| 456 | void fastcall mutex_destroy(struct mutex *lock) | ||
| 457 | { | ||
| 458 | DEBUG_WARN_ON(mutex_is_locked(lock)); | ||
| 459 | lock->magic = NULL; | ||
| 460 | } | ||
| 461 | |||
| 462 | EXPORT_SYMBOL_GPL(mutex_destroy); | ||
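The new mutex-debug.c keeps every held mutex on the global debug_mutex_held_locks list (under debug_mutex_lock), records the owner thread and the acquisition IP, and, when a task is about to block, walks the chain of "lock I want, its owner, the lock that owner is blocked on" to catch self-deadlocks, runaway chains and cycles. That walk is the heart of the detector and can be sketched without any kernel types; the structures below are invented for the example and carry only the two fields the algorithm needs.

        #include <stddef.h>

        struct dl_task;

        struct dl_lock {
                struct dl_task *owner;          /* NULL if not held */
        };

        struct dl_task {
                struct dl_lock *blocked_on;     /* lock this task waits for */
        };

        /* Return 1 if 'self' waiting on 'lock' would close a cycle. */
        static int would_deadlock(struct dl_task *self, struct dl_lock *lock)
        {
                int depth;

                for (depth = 0; lock && depth < 20; depth++) {
                        struct dl_task *owner = lock->owner;

                        if (!owner)
                                return 0;       /* chain ends at a free lock */
                        if (owner == self)
                                return 1;       /* we would wait on ourselves */
                        lock = owner->blocked_on;
                }
                return 0;                       /* no cycle within the depth cap */
        }

The depth cap plays the same role as the depth > 20 check in check_deadlock() above: it treats an implausibly long chain as corrupted lock data rather than looping forever.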
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h new file mode 100644 index 000000000000..fd384050acb1 --- /dev/null +++ b/kernel/mutex-debug.h | |||
| @@ -0,0 +1,134 @@ | |||
| 1 | /* | ||
| 2 | * Mutexes: blocking mutual exclusion locks | ||
| 3 | * | ||
| 4 | * started by Ingo Molnar: | ||
| 5 | * | ||
| 6 | * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 7 | * | ||
| 8 | * This file contains mutex debugging related internal declarations, | ||
| 9 | * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. | ||
| 10 | * More details are in kernel/mutex-debug.c. | ||
| 11 | */ | ||
| 12 | |||
| 13 | extern spinlock_t debug_mutex_lock; | ||
| 14 | extern struct list_head debug_mutex_held_locks; | ||
| 15 | extern int debug_mutex_on; | ||
| 16 | |||
| 17 | /* | ||
| 18 | * In the debug case we carry the caller's instruction pointer into | ||
| 19 | * other functions, but we dont want the function argument overhead | ||
| 20 | * in the nondebug case - hence these macros: | ||
| 21 | */ | ||
| 22 | #define __IP_DECL__ , unsigned long ip | ||
| 23 | #define __IP__ , ip | ||
| 24 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
| 25 | |||
| 26 | /* | ||
| 27 | * This must be called with lock->wait_lock held. | ||
| 28 | */ | ||
| 29 | extern void debug_mutex_set_owner(struct mutex *lock, | ||
| 30 | struct thread_info *new_owner __IP_DECL__); | ||
| 31 | |||
| 32 | static inline void debug_mutex_clear_owner(struct mutex *lock) | ||
| 33 | { | ||
| 34 | lock->owner = NULL; | ||
| 35 | } | ||
| 36 | |||
| 37 | extern void debug_mutex_init_waiter(struct mutex_waiter *waiter); | ||
| 38 | extern void debug_mutex_wake_waiter(struct mutex *lock, | ||
| 39 | struct mutex_waiter *waiter); | ||
| 40 | extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); | ||
| 41 | extern void debug_mutex_add_waiter(struct mutex *lock, | ||
| 42 | struct mutex_waiter *waiter, | ||
| 43 | struct thread_info *ti __IP_DECL__); | ||
| 44 | extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | ||
| 45 | struct thread_info *ti); | ||
| 46 | extern void debug_mutex_unlock(struct mutex *lock); | ||
| 47 | extern void debug_mutex_init(struct mutex *lock, const char *name); | ||
| 48 | |||
| 49 | #define debug_spin_lock(lock) \ | ||
| 50 | do { \ | ||
| 51 | local_irq_disable(); \ | ||
| 52 | if (debug_mutex_on) \ | ||
| 53 | spin_lock(lock); \ | ||
| 54 | } while (0) | ||
| 55 | |||
| 56 | #define debug_spin_unlock(lock) \ | ||
| 57 | do { \ | ||
| 58 | if (debug_mutex_on) \ | ||
| 59 | spin_unlock(lock); \ | ||
| 60 | local_irq_enable(); \ | ||
| 61 | preempt_check_resched(); \ | ||
| 62 | } while (0) | ||
| 63 | |||
| 64 | #define debug_spin_lock_save(lock, flags) \ | ||
| 65 | do { \ | ||
| 66 | local_irq_save(flags); \ | ||
| 67 | if (debug_mutex_on) \ | ||
| 68 | spin_lock(lock); \ | ||
| 69 | } while (0) | ||
| 70 | |||
| 71 | #define debug_spin_lock_restore(lock, flags) \ | ||
| 72 | do { \ | ||
| 73 | if (debug_mutex_on) \ | ||
| 74 | spin_unlock(lock); \ | ||
| 75 | local_irq_restore(flags); \ | ||
| 76 | preempt_check_resched(); \ | ||
| 77 | } while (0) | ||
| 78 | |||
| 79 | #define spin_lock_mutex(lock) \ | ||
| 80 | do { \ | ||
| 81 | struct mutex *l = container_of(lock, struct mutex, wait_lock); \ | ||
| 82 | \ | ||
| 83 | DEBUG_WARN_ON(in_interrupt()); \ | ||
| 84 | debug_spin_lock(&debug_mutex_lock); \ | ||
| 85 | spin_lock(lock); \ | ||
| 86 | DEBUG_WARN_ON(l->magic != l); \ | ||
| 87 | } while (0) | ||
| 88 | |||
| 89 | #define spin_unlock_mutex(lock) \ | ||
| 90 | do { \ | ||
| 91 | spin_unlock(lock); \ | ||
| 92 | debug_spin_unlock(&debug_mutex_lock); \ | ||
| 93 | } while (0) | ||
| 94 | |||
| 95 | #define DEBUG_OFF() \ | ||
| 96 | do { \ | ||
| 97 | if (debug_mutex_on) { \ | ||
| 98 | debug_mutex_on = 0; \ | ||
| 99 | console_verbose(); \ | ||
| 100 | if (spin_is_locked(&debug_mutex_lock)) \ | ||
| 101 | spin_unlock(&debug_mutex_lock); \ | ||
| 102 | } \ | ||
| 103 | } while (0) | ||
| 104 | |||
| 105 | #define DEBUG_BUG() \ | ||
| 106 | do { \ | ||
| 107 | if (debug_mutex_on) { \ | ||
| 108 | DEBUG_OFF(); \ | ||
| 109 | BUG(); \ | ||
| 110 | } \ | ||
| 111 | } while (0) | ||
| 112 | |||
| 113 | #define DEBUG_WARN_ON(c) \ | ||
| 114 | do { \ | ||
| 115 | if (unlikely(c && debug_mutex_on)) { \ | ||
| 116 | DEBUG_OFF(); \ | ||
| 117 | WARN_ON(1); \ | ||
| 118 | } \ | ||
| 119 | } while (0) | ||
| 120 | |||
| 121 | # define DEBUG_BUG_ON(c) \ | ||
| 122 | do { \ | ||
| 123 | if (unlikely(c)) \ | ||
| 124 | DEBUG_BUG(); \ | ||
| 125 | } while (0) | ||
| 126 | |||
| 127 | #ifdef CONFIG_SMP | ||
| 128 | # define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c) | ||
| 129 | # define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c) | ||
| 130 | #else | ||
| 131 | # define SMP_DEBUG_WARN_ON(c) do { } while (0) | ||
| 132 | # define SMP_DEBUG_BUG_ON(c) do { } while (0) | ||
| 133 | #endif | ||
| 134 | |||
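mutex-debug.h pairs each per-mutex wait_lock acquisition with the global debug_mutex_lock (spin_lock_mutex/spin_unlock_mutex) so the debug bookkeeping and the lock state cannot drift apart, and DEBUG_OFF()/DEBUG_WARN_ON() latch the checks off after the first failure so the error path never recurses into the very code it is reporting on. That latch-off-on-first-failure pattern, detached from the kernel macros, is roughly the following; the names are illustrative.

        #include <stdio.h>

        static int checks_enabled = 1;          /* cleared after the first failure */

        #define CHECK_WARN_ON(cond)                                            \
                do {                                                           \
                        if ((cond) && checks_enabled) {                        \
                                checks_enabled = 0; /* stop further reports */ \
                                fprintf(stderr, "check failed: %s (%s:%d)\n",  \
                                        #cond, __FILE__, __LINE__);            \
                        }                                                      \
                } while (0)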
diff --git a/kernel/mutex.c b/kernel/mutex.c new file mode 100644 index 000000000000..5449b210d9ed --- /dev/null +++ b/kernel/mutex.c | |||
| @@ -0,0 +1,315 @@ | |||
| 1 | /* | ||
| 2 | * kernel/mutex.c | ||
| 3 | * | ||
| 4 | * Mutexes: blocking mutual exclusion locks | ||
| 5 | * | ||
| 6 | * Started by Ingo Molnar: | ||
| 7 | * | ||
| 8 | * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 9 | * | ||
| 10 | * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and | ||
| 11 | * David Howells for suggestions and improvements. | ||
| 12 | * | ||
| 13 | * Also see Documentation/mutex-design.txt. | ||
| 14 | */ | ||
| 15 | #include <linux/mutex.h> | ||
| 16 | #include <linux/sched.h> | ||
| 17 | #include <linux/module.h> | ||
| 18 | #include <linux/spinlock.h> | ||
| 19 | #include <linux/interrupt.h> | ||
| 20 | |||
| 21 | /* | ||
| 22 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, | ||
| 23 | * which forces all calls into the slowpath: | ||
| 24 | */ | ||
| 25 | #ifdef CONFIG_DEBUG_MUTEXES | ||
| 26 | # include "mutex-debug.h" | ||
| 27 | # include <asm-generic/mutex-null.h> | ||
| 28 | #else | ||
| 29 | # include "mutex.h" | ||
| 30 | # include <asm/mutex.h> | ||
| 31 | #endif | ||
| 32 | |||
| 33 | /*** | ||
| 34 | * mutex_init - initialize the mutex | ||
| 35 | * @lock: the mutex to be initialized | ||
| 36 | * | ||
| 37 | * Initialize the mutex to unlocked state. | ||
| 38 | * | ||
| 39 | * It is not allowed to initialize an already locked mutex. | ||
| 40 | */ | ||
| 41 | void fastcall __mutex_init(struct mutex *lock, const char *name) | ||
| 42 | { | ||
| 43 | atomic_set(&lock->count, 1); | ||
| 44 | spin_lock_init(&lock->wait_lock); | ||
| 45 | INIT_LIST_HEAD(&lock->wait_list); | ||
| 46 | |||
| 47 | debug_mutex_init(lock, name); | ||
| 48 | } | ||
| 49 | |||
| 50 | EXPORT_SYMBOL(__mutex_init); | ||
| 51 | |||
| 52 | /* | ||
| 53 | * We split the mutex lock/unlock logic into separate fastpath and | ||
| 54 | * slowpath functions, to reduce the register pressure on the fastpath. | ||
| 55 | * We also put the fastpath first in the kernel image, to make sure the | ||
| 56 | * branch is predicted by the CPU as default-untaken. | ||
| 57 | */ | ||
| 58 | static void fastcall noinline __sched | ||
| 59 | __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); | ||
| 60 | |||
| 61 | /*** | ||
| 62 | * mutex_lock - acquire the mutex | ||
| 63 | * @lock: the mutex to be acquired | ||
| 64 | * | ||
| 65 | * Lock the mutex exclusively for this task. If the mutex is not | ||
| 66 | * available right now, it will sleep until it can get it. | ||
| 67 | * | ||
| 68 | * The mutex must later on be released by the same task that | ||
| 69 | * acquired it. Recursive locking is not allowed. The task | ||
| 70 | * may not exit without first unlocking the mutex. Also, kernel | ||
| 71 | * memory where the mutex resides mutex must not be freed with | ||
| 72 | * the mutex still locked. The mutex must first be initialized | ||
| 73 | * (or statically defined) before it can be locked. memset()-ing | ||
| 74 | * the mutex to 0 is not allowed. | ||
| 75 | * | ||
| 76 | * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging | ||
| 77 | * checks that will enforce the restrictions and will also do | ||
| 78 | * deadlock debugging. ) | ||
| 79 | * | ||
| 80 | * This function is similar to (but not equivalent to) down(). | ||
| 81 | */ | ||
| 82 | void fastcall __sched mutex_lock(struct mutex *lock) | ||
| 83 | { | ||
| 84 | might_sleep(); | ||
| 85 | /* | ||
| 86 | * The locking fastpath is the 1->0 transition from | ||
| 87 | * 'unlocked' into 'locked' state. | ||
| 88 | */ | ||
| 89 | __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); | ||
| 90 | } | ||
| 91 | |||
| 92 | EXPORT_SYMBOL(mutex_lock); | ||
| 93 | |||
| 94 | static void fastcall noinline __sched | ||
| 95 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__); | ||
| 96 | |||
| 97 | /*** | ||
| 98 | * mutex_unlock - release the mutex | ||
| 99 | * @lock: the mutex to be released | ||
| 100 | * | ||
| 101 | * Unlock a mutex that has been locked by this task previously. | ||
| 102 | * | ||
| 103 | * This function must not be used in interrupt context. Unlocking | ||
| 104 | * of a not locked mutex is not allowed. | ||
| 105 | * | ||
| 106 | * This function is similar to (but not equivalent to) up(). | ||
| 107 | */ | ||
| 108 | void fastcall __sched mutex_unlock(struct mutex *lock) | ||
| 109 | { | ||
| 110 | /* | ||
| 111 | * The unlocking fastpath is the 0->1 transition from 'locked' | ||
| 112 | * into 'unlocked' state: | ||
| 113 | */ | ||
| 114 | __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); | ||
| 115 | } | ||
| 116 | |||
| 117 | EXPORT_SYMBOL(mutex_unlock); | ||
| 118 | |||
| 119 | /* | ||
| 120 | * Lock a mutex (possibly interruptible), slowpath: | ||
| 121 | */ | ||
| 122 | static inline int __sched | ||
| 123 | __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | ||
| 124 | { | ||
| 125 | struct task_struct *task = current; | ||
| 126 | struct mutex_waiter waiter; | ||
| 127 | unsigned int old_val; | ||
| 128 | |||
| 129 | debug_mutex_init_waiter(&waiter); | ||
| 130 | |||
| 131 | spin_lock_mutex(&lock->wait_lock); | ||
| 132 | |||
| 133 | debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); | ||
| 134 | |||
| 135 | /* add waiting tasks to the end of the waitqueue (FIFO): */ | ||
| 136 | list_add_tail(&waiter.list, &lock->wait_list); | ||
| 137 | waiter.task = task; | ||
| 138 | |||
| 139 | for (;;) { | ||
| 140 | /* | ||
| 141 | * Lets try to take the lock again - this is needed even if | ||
| 142 | * we get here for the first time (shortly after failing to | ||
| 143 | * acquire the lock), to make sure that we get a wakeup once | ||
| 144 | * it's unlocked. Later on, if we sleep, this is the | ||
| 145 | * operation that gives us the lock. We xchg it to -1, so | ||
| 146 | * that when we release the lock, we properly wake up the | ||
| 147 | * other waiters: | ||
| 148 | */ | ||
| 149 | old_val = atomic_xchg(&lock->count, -1); | ||
| 150 | if (old_val == 1) | ||
| 151 | break; | ||
| 152 | |||
| 153 | /* | ||
| 154 | * got a signal? (This code gets eliminated in the | ||
| 155 | * TASK_UNINTERRUPTIBLE case.) | ||
| 156 | */ | ||
| 157 | if (unlikely(state == TASK_INTERRUPTIBLE && | ||
| 158 | signal_pending(task))) { | ||
| 159 | mutex_remove_waiter(lock, &waiter, task->thread_info); | ||
| 160 | spin_unlock_mutex(&lock->wait_lock); | ||
| 161 | |||
| 162 | debug_mutex_free_waiter(&waiter); | ||
| 163 | return -EINTR; | ||
| 164 | } | ||
| 165 | __set_task_state(task, state); | ||
| 166 | |||
| 167 | /* didnt get the lock, go to sleep: */ | ||
| 168 | spin_unlock_mutex(&lock->wait_lock); | ||
| 169 | schedule(); | ||
| 170 | spin_lock_mutex(&lock->wait_lock); | ||
| 171 | } | ||
| 172 | |||
| 173 | /* got the lock - rejoice! */ | ||
| 174 | mutex_remove_waiter(lock, &waiter, task->thread_info); | ||
| 175 | debug_mutex_set_owner(lock, task->thread_info __IP__); | ||
| 176 | |||
| 177 | /* set it to 0 if there are no waiters left: */ | ||
| 178 | if (likely(list_empty(&lock->wait_list))) | ||
| 179 | atomic_set(&lock->count, 0); | ||
| 180 | |||
| 181 | spin_unlock_mutex(&lock->wait_lock); | ||
| 182 | |||
| 183 | debug_mutex_free_waiter(&waiter); | ||
| 184 | |||
| 185 | DEBUG_WARN_ON(list_empty(&lock->held_list)); | ||
| 186 | DEBUG_WARN_ON(lock->owner != task->thread_info); | ||
| 187 | |||
| 188 | return 0; | ||
| 189 | } | ||
| 190 | |||
| 191 | static void fastcall noinline __sched | ||
| 192 | __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__) | ||
| 193 | { | ||
| 194 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
| 195 | |||
| 196 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__); | ||
| 197 | } | ||
| 198 | |||
| 199 | /* | ||
| 200 | * Release the lock, slowpath: | ||
| 201 | */ | ||
| 202 | static fastcall noinline void | ||
| 203 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | ||
| 204 | { | ||
| 205 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
| 206 | |||
| 207 | DEBUG_WARN_ON(lock->owner != current_thread_info()); | ||
| 208 | |||
| 209 | spin_lock_mutex(&lock->wait_lock); | ||
| 210 | |||
| 211 | /* | ||
| 212 | * some architectures leave the lock unlocked in the fastpath failure | ||
| 213 | * case, others need to leave it locked. In the later case we have to | ||
| 214 | * unlock it here | ||
| 215 | */ | ||
| 216 | if (__mutex_slowpath_needs_to_unlock()) | ||
| 217 | atomic_set(&lock->count, 1); | ||
| 218 | |||
| 219 | debug_mutex_unlock(lock); | ||
| 220 | |||
| 221 | if (!list_empty(&lock->wait_list)) { | ||
| 222 | /* get the first entry from the wait-list: */ | ||
| 223 | struct mutex_waiter *waiter = | ||
| 224 | list_entry(lock->wait_list.next, | ||
| 225 | struct mutex_waiter, list); | ||
| 226 | |||
| 227 | debug_mutex_wake_waiter(lock, waiter); | ||
| 228 | |||
| 229 | wake_up_process(waiter->task); | ||
| 230 | } | ||
| 231 | |||
| 232 | debug_mutex_clear_owner(lock); | ||
| 233 | |||
| 234 | spin_unlock_mutex(&lock->wait_lock); | ||
| 235 | } | ||
| 236 | |||
| 237 | /* | ||
| 238 | * Here come the less common (and hence less performance-critical) APIs: | ||
| 239 | * mutex_lock_interruptible() and mutex_trylock(). | ||
| 240 | */ | ||
| 241 | static int fastcall noinline __sched | ||
| 242 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__); | ||
| 243 | |||
| 244 | /*** | ||
| 245 | * mutex_lock_interruptible - acquire the mutex, interruptible | ||
| 246 | * @lock: the mutex to be acquired | ||
| 247 | * | ||
| 248 | * Lock the mutex like mutex_lock(), and return 0 if the mutex has | ||
| 249 | * been acquired or sleep until the mutex becomes available. If a | ||
| 250 | * signal arrives while waiting for the lock then this function | ||
| 251 | * returns -EINTR. | ||
| 252 | * | ||
| 253 | * This function is similar to (but not equivalent to) down_interruptible(). | ||
| 254 | */ | ||
| 255 | int fastcall __sched mutex_lock_interruptible(struct mutex *lock) | ||
| 256 | { | ||
| 257 | might_sleep(); | ||
| 258 | return __mutex_fastpath_lock_retval | ||
| 259 | (&lock->count, __mutex_lock_interruptible_slowpath); | ||
| 260 | } | ||
| 261 | |||
| 262 | EXPORT_SYMBOL(mutex_lock_interruptible); | ||
| 263 | |||
| 264 | static int fastcall noinline __sched | ||
| 265 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) | ||
| 266 | { | ||
| 267 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
| 268 | |||
| 269 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__); | ||
| 270 | } | ||
| 271 | |||
| 272 | /* | ||
| 273 | * Spinlock based trylock, we take the spinlock and check whether we | ||
| 274 | * can get the lock: | ||
| 275 | */ | ||
| 276 | static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | ||
| 277 | { | ||
| 278 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
| 279 | int prev; | ||
| 280 | |||
| 281 | spin_lock_mutex(&lock->wait_lock); | ||
| 282 | |||
| 283 | prev = atomic_xchg(&lock->count, -1); | ||
| 284 | if (likely(prev == 1)) | ||
| 285 | debug_mutex_set_owner(lock, current_thread_info() __RET_IP__); | ||
| 286 | /* Set it back to 0 if there are no waiters: */ | ||
| 287 | if (likely(list_empty(&lock->wait_list))) | ||
| 288 | atomic_set(&lock->count, 0); | ||
| 289 | |||
| 290 | spin_unlock_mutex(&lock->wait_lock); | ||
| 291 | |||
| 292 | return prev == 1; | ||
| 293 | } | ||
| 294 | |||
| 295 | /*** | ||
| 296 | * mutex_trylock - try acquire the mutex, without waiting | ||
| 297 | * @lock: the mutex to be acquired | ||
| 298 | * | ||
| 299 | * Try to acquire the mutex atomically. Returns 1 if the mutex | ||
| 300 | * has been acquired successfully, and 0 on contention. | ||
| 301 | * | ||
| 302 | * NOTE: this function follows the spin_trylock() convention, so | ||
| 303 | * it is negated to the down_trylock() return values! Be careful | ||
| 304 | * about this when converting semaphore users to mutexes. | ||
| 305 | * | ||
| 306 | * This function must not be used in interrupt context. The | ||
| 307 | * mutex must be released by the same task that acquired it. | ||
| 308 | */ | ||
| 309 | int fastcall mutex_trylock(struct mutex *lock) | ||
| 310 | { | ||
| 311 | return __mutex_fastpath_trylock(&lock->count, | ||
| 312 | __mutex_trylock_slowpath); | ||
| 313 | } | ||
| 314 | |||
| 315 | EXPORT_SYMBOL(mutex_trylock); | ||
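mutex.c gives the counter three meaningful values: 1 is unlocked, 0 is locked with no waiters, and negative means locked with possible waiters. The fastpath is a single atomic 1 -> 0 (lock) or 0 -> 1 (unlock) transition; everything else drops into the slowpath, where a waiter xchg()s the count to -1 so that the eventual unlocker knows it has to wake somebody. Below is a user-space sketch of just that counter protocol, built on GCC's __atomic builtins. It has no wait list, so the "sleep" is a yield loop, and it is not the kernel's fastpath code, only an illustration of the state machine.

        #include <sched.h>

        /*
         * Counter protocol only: 1 = unlocked, 0 = locked/no waiters,
         * negative = locked/maybe waiters.  A real mutex sleeps on a
         * FIFO wait list instead of spinning.
         */
        struct toy_mutex {
                int count;
        };

        static void toy_lock(struct toy_mutex *m)
        {
                /* Fastpath: the 1 -> 0 transition. */
                if (__atomic_fetch_sub(&m->count, 1, __ATOMIC_ACQUIRE) == 1)
                        return;
                /* Slowpath: publish "maybe waiters" (-1) and retry until we
                 * are the thread that swaps out a 1. */
                while (__atomic_exchange_n(&m->count, -1, __ATOMIC_ACQUIRE) != 1)
                        sched_yield();
        }

        static void toy_unlock(struct toy_mutex *m)
        {
                /* Fastpath: 0 -> 1.  A negative old value means waiters may
                 * exist; the kernel wakes the first one here, the toy just
                 * republishes 1 so a spinner can grab it. */
                if (__atomic_fetch_add(&m->count, 1, __ATOMIC_RELEASE) < 0)
                        __atomic_store_n(&m->count, 1, __ATOMIC_RELEASE);
        }

Note also the trylock convention spelled out above mutex_trylock(): it returns 1 on success like spin_trylock(), the opposite of down_trylock(), which is an easy mistake to make when converting semaphore users.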
diff --git a/kernel/mutex.h b/kernel/mutex.h new file mode 100644 index 000000000000..00fe84e7b672 --- /dev/null +++ b/kernel/mutex.h | |||
| @@ -0,0 +1,35 @@ | |||
| 1 | /* | ||
| 2 | * Mutexes: blocking mutual exclusion locks | ||
| 3 | * | ||
| 4 | * started by Ingo Molnar: | ||
| 5 | * | ||
| 6 | * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 7 | * | ||
| 8 | * This file contains mutex debugging related internal prototypes, for the | ||
| 9 | * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: | ||
| 10 | */ | ||
| 11 | |||
| 12 | #define spin_lock_mutex(lock) spin_lock(lock) | ||
| 13 | #define spin_unlock_mutex(lock) spin_unlock(lock) | ||
| 14 | #define mutex_remove_waiter(lock, waiter, ti) \ | ||
| 15 | __list_del((waiter)->list.prev, (waiter)->list.next) | ||
| 16 | |||
| 17 | #define DEBUG_WARN_ON(c) do { } while (0) | ||
| 18 | #define debug_mutex_set_owner(lock, new_owner) do { } while (0) | ||
| 19 | #define debug_mutex_clear_owner(lock) do { } while (0) | ||
| 20 | #define debug_mutex_init_waiter(waiter) do { } while (0) | ||
| 21 | #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) | ||
| 22 | #define debug_mutex_free_waiter(waiter) do { } while (0) | ||
| 23 | #define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0) | ||
| 24 | #define debug_mutex_unlock(lock) do { } while (0) | ||
| 25 | #define debug_mutex_init(lock, name) do { } while (0) | ||
| 26 | |||
| 27 | /* | ||
| 28 | * Return-address parameters/declarations. They are very useful for | ||
| 29 | * debugging, but add overhead in the !DEBUG case - so we go the | ||
| 30 | * trouble of using this not too elegant but zero-cost solution: | ||
| 31 | */ | ||
| 32 | #define __IP_DECL__ | ||
| 33 | #define __IP__ | ||
| 34 | #define __RET_IP__ | ||
| 35 | |||
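mutex.h is the !CONFIG_DEBUG_MUTEXES counterpart: every debug hook expands to do { } while (0), and the __IP_DECL__/__IP__/__RET_IP__ trio turns the caller's return address into an extra function argument only in the debug build, so the release build carries no overhead for it. The trick in isolation looks roughly like this; TRACE_LOCKS and the function names stand in for the kernel's configuration symbol and APIs.

        #include <stdio.h>

        struct some_lock {
                int taken;
        };

        #ifdef TRACE_LOCKS
        # define IP_DECL        , unsigned long ip
        # define IP_ARG         , ip
        # define RET_IP         , (unsigned long)__builtin_return_address(0)
        #else
        # define IP_DECL
        # define IP_ARG
        # define RET_IP
        #endif

        /* The slowpath sees the caller's return address only in the
         * tracing build; the release build has no extra parameter at all. */
        static void take_lock_slowpath(struct some_lock *lock IP_DECL)
        {
                lock->taken = 1;
        #ifdef TRACE_LOCKS
                printf("lock %p taken from %#lx\n", (void *)lock, ip);
        #endif
        }

        static inline void take_lock(struct some_lock *lock)
        {
                /* RET_IP expands to nothing when TRACE_LOCKS is off. */
                take_lock_slowpath(lock RET_IP);
        }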
diff --git a/kernel/panic.c b/kernel/panic.c index aabc5f86fa3f..c5c4ab255834 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
| 60 | long i; | 60 | long i; |
| 61 | static char buf[1024]; | 61 | static char buf[1024]; |
| 62 | va_list args; | 62 | va_list args; |
| 63 | #if defined(CONFIG_ARCH_S390) | 63 | #if defined(CONFIG_S390) |
| 64 | unsigned long caller = (unsigned long) __builtin_return_address(0); | 64 | unsigned long caller = (unsigned long) __builtin_return_address(0); |
| 65 | #endif | 65 | #endif |
| 66 | 66 | ||
| @@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
| 125 | printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); | 125 | printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); |
| 126 | } | 126 | } |
| 127 | #endif | 127 | #endif |
| 128 | #if defined(CONFIG_ARCH_S390) | 128 | #if defined(CONFIG_S390) |
| 129 | disabled_wait(caller); | 129 | disabled_wait(caller); |
| 130 | #endif | 130 | #endif |
| 131 | local_irq_enable(); | 131 | local_irq_enable(); |
diff --git a/kernel/pid.c b/kernel/pid.c index edba31c681ac..1acc07246991 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr) | |||
| 136 | struct hlist_node *elem; | 136 | struct hlist_node *elem; |
| 137 | struct pid *pid; | 137 | struct pid *pid; |
| 138 | 138 | ||
| 139 | hlist_for_each_entry(pid, elem, | 139 | hlist_for_each_entry_rcu(pid, elem, |
| 140 | &pid_hash[type][pid_hashfn(nr)], pid_chain) { | 140 | &pid_hash[type][pid_hashfn(nr)], pid_chain) { |
| 141 | if (pid->nr == nr) | 141 | if (pid->nr == nr) |
| 142 | return pid; | 142 | return pid; |
| @@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) | |||
| 150 | 150 | ||
| 151 | task_pid = &task->pids[type]; | 151 | task_pid = &task->pids[type]; |
| 152 | pid = find_pid(type, nr); | 152 | pid = find_pid(type, nr); |
| 153 | task_pid->nr = nr; | ||
| 153 | if (pid == NULL) { | 154 | if (pid == NULL) { |
| 154 | hlist_add_head(&task_pid->pid_chain, | ||
| 155 | &pid_hash[type][pid_hashfn(nr)]); | ||
| 156 | INIT_LIST_HEAD(&task_pid->pid_list); | 155 | INIT_LIST_HEAD(&task_pid->pid_list); |
| 156 | hlist_add_head_rcu(&task_pid->pid_chain, | ||
| 157 | &pid_hash[type][pid_hashfn(nr)]); | ||
| 157 | } else { | 158 | } else { |
| 158 | INIT_HLIST_NODE(&task_pid->pid_chain); | 159 | INIT_HLIST_NODE(&task_pid->pid_chain); |
| 159 | list_add_tail(&task_pid->pid_list, &pid->pid_list); | 160 | list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list); |
| 160 | } | 161 | } |
| 161 | task_pid->nr = nr; | ||
| 162 | 162 | ||
| 163 | return 0; | 163 | return 0; |
| 164 | } | 164 | } |
| @@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type) | |||
| 170 | 170 | ||
| 171 | pid = &task->pids[type]; | 171 | pid = &task->pids[type]; |
| 172 | if (!hlist_unhashed(&pid->pid_chain)) { | 172 | if (!hlist_unhashed(&pid->pid_chain)) { |
| 173 | hlist_del(&pid->pid_chain); | ||
| 174 | 173 | ||
| 175 | if (list_empty(&pid->pid_list)) | 174 | if (list_empty(&pid->pid_list)) { |
| 176 | nr = pid->nr; | 175 | nr = pid->nr; |
| 177 | else { | 176 | hlist_del_rcu(&pid->pid_chain); |
| 177 | } else { | ||
| 178 | pid_next = list_entry(pid->pid_list.next, | 178 | pid_next = list_entry(pid->pid_list.next, |
| 179 | struct pid, pid_list); | 179 | struct pid, pid_list); |
| 180 | /* insert next pid from pid_list to hash */ | 180 | /* insert next pid from pid_list to hash */ |
| 181 | hlist_add_head(&pid_next->pid_chain, | 181 | hlist_replace_rcu(&pid->pid_chain, |
| 182 | &pid_hash[type][pid_hashfn(pid_next->nr)]); | 182 | &pid_next->pid_chain); |
| 183 | } | 183 | } |
| 184 | } | 184 | } |
| 185 | 185 | ||
| 186 | list_del(&pid->pid_list); | 186 | list_del_rcu(&pid->pid_list); |
| 187 | pid->nr = 0; | 187 | pid->nr = 0; |
| 188 | 188 | ||
| 189 | return nr; | 189 | return nr; |
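The reordering in attach_pid() (filling in task_pid->nr and pid_list before hlist_add_head_rcu()) follows the usual lockless-publication rule: a node must be fully initialized before it becomes reachable by readers that hold no lock. A hedged sketch of that ordering, using C11 atomics rather than the kernel's RCU primitives:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct pid_node {
	int nr;
	struct pid_node *next;
};

static _Atomic(struct pid_node *) head;

static void publish(int nr)
{
	struct pid_node *n = malloc(sizeof(*n));

	if (!n)
		return;
	n->nr = nr;				/* initialize the node first ... */
	n->next = atomic_load_explicit(&head, memory_order_relaxed);
	/* ... then make it reachable with release semantics (cf. hlist_add_head_rcu) */
	atomic_store_explicit(&head, n, memory_order_release);
}

static struct pid_node *lookup(int nr)
{
	/* acquire pairs with the release above (cf. hlist_for_each_entry_rcu) */
	struct pid_node *n = atomic_load_explicit(&head, memory_order_acquire);

	for (; n; n = n->next)
		if (n->nr == nr)
			return n;
	return NULL;
}

int main(void)
{
	publish(42);
	printf("lookup(42) %s\n", lookup(42) ? "found" : "missing");
	return 0;
}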
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index cae4f5728997..520f6c59948d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
| 8 | #include <linux/errno.h> | 8 | #include <linux/errno.h> |
| 9 | 9 | ||
| 10 | static int check_clock(clockid_t which_clock) | 10 | static int check_clock(const clockid_t which_clock) |
| 11 | { | 11 | { |
| 12 | int error = 0; | 12 | int error = 0; |
| 13 | struct task_struct *p; | 13 | struct task_struct *p; |
| @@ -31,7 +31,7 @@ static int check_clock(clockid_t which_clock) | |||
| 31 | } | 31 | } |
| 32 | 32 | ||
| 33 | static inline union cpu_time_count | 33 | static inline union cpu_time_count |
| 34 | timespec_to_sample(clockid_t which_clock, const struct timespec *tp) | 34 | timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) |
| 35 | { | 35 | { |
| 36 | union cpu_time_count ret; | 36 | union cpu_time_count ret; |
| 37 | ret.sched = 0; /* high half always zero when .cpu used */ | 37 | ret.sched = 0; /* high half always zero when .cpu used */ |
| @@ -43,7 +43,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp) | |||
| 43 | return ret; | 43 | return ret; |
| 44 | } | 44 | } |
| 45 | 45 | ||
| 46 | static void sample_to_timespec(clockid_t which_clock, | 46 | static void sample_to_timespec(const clockid_t which_clock, |
| 47 | union cpu_time_count cpu, | 47 | union cpu_time_count cpu, |
| 48 | struct timespec *tp) | 48 | struct timespec *tp) |
| 49 | { | 49 | { |
| @@ -55,7 +55,7 @@ static void sample_to_timespec(clockid_t which_clock, | |||
| 55 | } | 55 | } |
| 56 | } | 56 | } |
| 57 | 57 | ||
| 58 | static inline int cpu_time_before(clockid_t which_clock, | 58 | static inline int cpu_time_before(const clockid_t which_clock, |
| 59 | union cpu_time_count now, | 59 | union cpu_time_count now, |
| 60 | union cpu_time_count then) | 60 | union cpu_time_count then) |
| 61 | { | 61 | { |
| @@ -65,7 +65,7 @@ static inline int cpu_time_before(clockid_t which_clock, | |||
| 65 | return cputime_lt(now.cpu, then.cpu); | 65 | return cputime_lt(now.cpu, then.cpu); |
| 66 | } | 66 | } |
| 67 | } | 67 | } |
| 68 | static inline void cpu_time_add(clockid_t which_clock, | 68 | static inline void cpu_time_add(const clockid_t which_clock, |
| 69 | union cpu_time_count *acc, | 69 | union cpu_time_count *acc, |
| 70 | union cpu_time_count val) | 70 | union cpu_time_count val) |
| 71 | { | 71 | { |
| @@ -75,7 +75,7 @@ static inline void cpu_time_add(clockid_t which_clock, | |||
| 75 | acc->cpu = cputime_add(acc->cpu, val.cpu); | 75 | acc->cpu = cputime_add(acc->cpu, val.cpu); |
| 76 | } | 76 | } |
| 77 | } | 77 | } |
| 78 | static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, | 78 | static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, |
| 79 | union cpu_time_count a, | 79 | union cpu_time_count a, |
| 80 | union cpu_time_count b) | 80 | union cpu_time_count b) |
| 81 | { | 81 | { |
| @@ -151,7 +151,7 @@ static inline unsigned long long sched_ns(struct task_struct *p) | |||
| 151 | return (p == current) ? current_sched_time(p) : p->sched_time; | 151 | return (p == current) ? current_sched_time(p) : p->sched_time; |
| 152 | } | 152 | } |
| 153 | 153 | ||
| 154 | int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | 154 | int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) |
| 155 | { | 155 | { |
| 156 | int error = check_clock(which_clock); | 156 | int error = check_clock(which_clock); |
| 157 | if (!error) { | 157 | if (!error) { |
| @@ -169,7 +169,7 @@ int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | |||
| 169 | return error; | 169 | return error; |
| 170 | } | 170 | } |
| 171 | 171 | ||
| 172 | int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) | 172 | int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) |
| 173 | { | 173 | { |
| 174 | /* | 174 | /* |
| 175 | * You can never reset a CPU clock, but we check for other errors | 175 | * You can never reset a CPU clock, but we check for other errors |
| @@ -186,7 +186,7 @@ int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) | |||
| 186 | /* | 186 | /* |
| 187 | * Sample a per-thread clock for the given task. | 187 | * Sample a per-thread clock for the given task. |
| 188 | */ | 188 | */ |
| 189 | static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p, | 189 | static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, |
| 190 | union cpu_time_count *cpu) | 190 | union cpu_time_count *cpu) |
| 191 | { | 191 | { |
| 192 | switch (CPUCLOCK_WHICH(which_clock)) { | 192 | switch (CPUCLOCK_WHICH(which_clock)) { |
| @@ -238,18 +238,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, | |||
| 238 | while ((t = next_thread(t)) != p) { | 238 | while ((t = next_thread(t)) != p) { |
| 239 | cpu->sched += t->sched_time; | 239 | cpu->sched += t->sched_time; |
| 240 | } | 240 | } |
| 241 | if (p->tgid == current->tgid) { | 241 | cpu->sched += sched_ns(p); |
| 242 | /* | ||
| 243 | * We're sampling ourselves, so include the | ||
| 244 | * cycles not yet banked. We still omit | ||
| 245 | * other threads running on other CPUs, | ||
| 246 | * so the total can always be behind as | ||
| 247 | * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). | ||
| 248 | */ | ||
| 249 | cpu->sched += current_sched_time(current); | ||
| 250 | } else { | ||
| 251 | cpu->sched += p->sched_time; | ||
| 252 | } | ||
| 253 | break; | 242 | break; |
| 254 | } | 243 | } |
| 255 | return 0; | 244 | return 0; |
| @@ -259,7 +248,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, | |||
| 259 | * Sample a process (thread group) clock for the given group_leader task. | 248 | * Sample a process (thread group) clock for the given group_leader task. |
| 260 | * Must be called with tasklist_lock held for reading. | 249 | * Must be called with tasklist_lock held for reading. |
| 261 | */ | 250 | */ |
| 262 | static int cpu_clock_sample_group(clockid_t which_clock, | 251 | static int cpu_clock_sample_group(const clockid_t which_clock, |
| 263 | struct task_struct *p, | 252 | struct task_struct *p, |
| 264 | union cpu_time_count *cpu) | 253 | union cpu_time_count *cpu) |
| 265 | { | 254 | { |
| @@ -273,7 +262,7 @@ static int cpu_clock_sample_group(clockid_t which_clock, | |||
| 273 | } | 262 | } |
| 274 | 263 | ||
| 275 | 264 | ||
| 276 | int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp) | 265 | int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) |
| 277 | { | 266 | { |
| 278 | const pid_t pid = CPUCLOCK_PID(which_clock); | 267 | const pid_t pid = CPUCLOCK_PID(which_clock); |
| 279 | int error = -EINVAL; | 268 | int error = -EINVAL; |
| @@ -1410,8 +1399,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
| 1410 | 1399 | ||
| 1411 | static long posix_cpu_clock_nanosleep_restart(struct restart_block *); | 1400 | static long posix_cpu_clock_nanosleep_restart(struct restart_block *); |
| 1412 | 1401 | ||
| 1413 | int posix_cpu_nsleep(clockid_t which_clock, int flags, | 1402 | int posix_cpu_nsleep(const clockid_t which_clock, int flags, |
| 1414 | struct timespec *rqtp) | 1403 | struct timespec *rqtp, struct timespec __user *rmtp) |
| 1415 | { | 1404 | { |
| 1416 | struct restart_block *restart_block = | 1405 | struct restart_block *restart_block = |
| 1417 | &current_thread_info()->restart_block; | 1406 | &current_thread_info()->restart_block; |
| @@ -1436,7 +1425,6 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags, | |||
| 1436 | error = posix_cpu_timer_create(&timer); | 1425 | error = posix_cpu_timer_create(&timer); |
| 1437 | timer.it_process = current; | 1426 | timer.it_process = current; |
| 1438 | if (!error) { | 1427 | if (!error) { |
| 1439 | struct timespec __user *rmtp; | ||
| 1440 | static struct itimerspec zero_it; | 1428 | static struct itimerspec zero_it; |
| 1441 | struct itimerspec it = { .it_value = *rqtp, | 1429 | struct itimerspec it = { .it_value = *rqtp, |
| 1442 | .it_interval = {} }; | 1430 | .it_interval = {} }; |
| @@ -1483,7 +1471,6 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags, | |||
| 1483 | /* | 1471 | /* |
| 1484 | * Report back to the user the time still remaining. | 1472 | * Report back to the user the time still remaining. |
| 1485 | */ | 1473 | */ |
| 1486 | rmtp = (struct timespec __user *) restart_block->arg1; | ||
| 1487 | if (rmtp != NULL && !(flags & TIMER_ABSTIME) && | 1474 | if (rmtp != NULL && !(flags & TIMER_ABSTIME) && |
| 1488 | copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | 1475 | copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) |
| 1489 | return -EFAULT; | 1476 | return -EFAULT; |
| @@ -1491,6 +1478,7 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags, | |||
| 1491 | restart_block->fn = posix_cpu_clock_nanosleep_restart; | 1478 | restart_block->fn = posix_cpu_clock_nanosleep_restart; |
| 1492 | /* Caller already set restart_block->arg1 */ | 1479 | /* Caller already set restart_block->arg1 */ |
| 1493 | restart_block->arg0 = which_clock; | 1480 | restart_block->arg0 = which_clock; |
| 1481 | restart_block->arg1 = (unsigned long) rmtp; | ||
| 1494 | restart_block->arg2 = rqtp->tv_sec; | 1482 | restart_block->arg2 = rqtp->tv_sec; |
| 1495 | restart_block->arg3 = rqtp->tv_nsec; | 1483 | restart_block->arg3 = rqtp->tv_nsec; |
| 1496 | 1484 | ||
| @@ -1504,21 +1492,28 @@ static long | |||
| 1504 | posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) | 1492 | posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) |
| 1505 | { | 1493 | { |
| 1506 | clockid_t which_clock = restart_block->arg0; | 1494 | clockid_t which_clock = restart_block->arg0; |
| 1507 | struct timespec t = { .tv_sec = restart_block->arg2, | 1495 | struct timespec __user *rmtp; |
| 1508 | .tv_nsec = restart_block->arg3 }; | 1496 | struct timespec t; |
| 1497 | |||
| 1498 | rmtp = (struct timespec __user *) restart_block->arg1; | ||
| 1499 | t.tv_sec = restart_block->arg2; | ||
| 1500 | t.tv_nsec = restart_block->arg3; | ||
| 1501 | |||
| 1509 | restart_block->fn = do_no_restart_syscall; | 1502 | restart_block->fn = do_no_restart_syscall; |
| 1510 | return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t); | 1503 | return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); |
| 1511 | } | 1504 | } |
| 1512 | 1505 | ||
| 1513 | 1506 | ||
| 1514 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) | 1507 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) |
| 1515 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) | 1508 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) |
| 1516 | 1509 | ||
| 1517 | static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | 1510 | static int process_cpu_clock_getres(const clockid_t which_clock, |
| 1511 | struct timespec *tp) | ||
| 1518 | { | 1512 | { |
| 1519 | return posix_cpu_clock_getres(PROCESS_CLOCK, tp); | 1513 | return posix_cpu_clock_getres(PROCESS_CLOCK, tp); |
| 1520 | } | 1514 | } |
| 1521 | static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp) | 1515 | static int process_cpu_clock_get(const clockid_t which_clock, |
| 1516 | struct timespec *tp) | ||
| 1522 | { | 1517 | { |
| 1523 | return posix_cpu_clock_get(PROCESS_CLOCK, tp); | 1518 | return posix_cpu_clock_get(PROCESS_CLOCK, tp); |
| 1524 | } | 1519 | } |
| @@ -1527,16 +1522,19 @@ static int process_cpu_timer_create(struct k_itimer *timer) | |||
| 1527 | timer->it_clock = PROCESS_CLOCK; | 1522 | timer->it_clock = PROCESS_CLOCK; |
| 1528 | return posix_cpu_timer_create(timer); | 1523 | return posix_cpu_timer_create(timer); |
| 1529 | } | 1524 | } |
| 1530 | static int process_cpu_nsleep(clockid_t which_clock, int flags, | 1525 | static int process_cpu_nsleep(const clockid_t which_clock, int flags, |
| 1531 | struct timespec *rqtp) | 1526 | struct timespec *rqtp, |
| 1527 | struct timespec __user *rmtp) | ||
| 1532 | { | 1528 | { |
| 1533 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); | 1529 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); |
| 1534 | } | 1530 | } |
| 1535 | static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | 1531 | static int thread_cpu_clock_getres(const clockid_t which_clock, |
| 1532 | struct timespec *tp) | ||
| 1536 | { | 1533 | { |
| 1537 | return posix_cpu_clock_getres(THREAD_CLOCK, tp); | 1534 | return posix_cpu_clock_getres(THREAD_CLOCK, tp); |
| 1538 | } | 1535 | } |
| 1539 | static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp) | 1536 | static int thread_cpu_clock_get(const clockid_t which_clock, |
| 1537 | struct timespec *tp) | ||
| 1540 | { | 1538 | { |
| 1541 | return posix_cpu_clock_get(THREAD_CLOCK, tp); | 1539 | return posix_cpu_clock_get(THREAD_CLOCK, tp); |
| 1542 | } | 1540 | } |
| @@ -1545,8 +1543,8 @@ static int thread_cpu_timer_create(struct k_itimer *timer) | |||
| 1545 | timer->it_clock = THREAD_CLOCK; | 1543 | timer->it_clock = THREAD_CLOCK; |
| 1546 | return posix_cpu_timer_create(timer); | 1544 | return posix_cpu_timer_create(timer); |
| 1547 | } | 1545 | } |
| 1548 | static int thread_cpu_nsleep(clockid_t which_clock, int flags, | 1546 | static int thread_cpu_nsleep(const clockid_t which_clock, int flags, |
| 1549 | struct timespec *rqtp) | 1547 | struct timespec *rqtp, struct timespec __user *rmtp) |
| 1550 | { | 1548 | { |
| 1551 | return -EINVAL; | 1549 | return -EINVAL; |
| 1552 | } | 1550 | } |
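Passing rmtp into posix_cpu_nsleep() (and stashing it in restart_block->arg1 there) is plumbing for the same userland contract as the other clocks: a relative clock_nanosleep() that is interrupted reports the unslept time through rmtp. A user-space illustration of that contract, using CLOCK_MONOTONIC for simplicity (the CPU-time clocks honor rmtp the same way; link with -lrt on older glibc):

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static void on_alrm(int sig) { (void)sig; }	/* only purpose: interrupt the sleep */

int main(void)
{
	struct timespec req = { .tv_sec = 5, .tv_nsec = 0 };
	struct timespec rem = { 0, 0 };

	signal(SIGALRM, on_alrm);
	alarm(1);				/* deliver SIGALRM after ~1s */

	/* clock_nanosleep() returns the error number directly, not -1/errno */
	if (clock_nanosleep(CLOCK_MONOTONIC, 0, &req, &rem) == EINTR)
		printf("interrupted with %ld.%09ld s left\n",
		       (long)rem.tv_sec, rem.tv_nsec);
	return 0;
}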
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5870efb3e200..9e66e614862a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -48,21 +48,6 @@ | |||
| 48 | #include <linux/workqueue.h> | 48 | #include <linux/workqueue.h> |
| 49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
| 50 | 50 | ||
| 51 | #ifndef div_long_long_rem | ||
| 52 | #include <asm/div64.h> | ||
| 53 | |||
| 54 | #define div_long_long_rem(dividend,divisor,remainder) ({ \ | ||
| 55 | u64 result = dividend; \ | ||
| 56 | *remainder = do_div(result,divisor); \ | ||
| 57 | result; }) | ||
| 58 | |||
| 59 | #endif | ||
| 60 | #define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. */ | ||
| 61 | |||
| 62 | static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2) | ||
| 63 | { | ||
| 64 | return (u64)mpy1 * mpy2; | ||
| 65 | } | ||
| 66 | /* | 51 | /* |
| 67 | * Management arrays for POSIX timers. Timers are kept in slab memory | 52 | * Management arrays for POSIX timers. Timers are kept in slab memory |
| 68 | * Timer ids are allocated by an external routine that keeps track of the | 53 | * Timer ids are allocated by an external routine that keeps track of the |
| @@ -148,18 +133,18 @@ static DEFINE_SPINLOCK(idr_lock); | |||
| 148 | */ | 133 | */ |
| 149 | 134 | ||
| 150 | static struct k_clock posix_clocks[MAX_CLOCKS]; | 135 | static struct k_clock posix_clocks[MAX_CLOCKS]; |
| 136 | |||
| 151 | /* | 137 | /* |
| 152 | * We only have one real clock that can be set so we need only one abs list, | 138 | * These ones are defined below. |
| 153 | * even if we should want to have several clocks with differing resolutions. | ||
| 154 | */ | 139 | */ |
| 155 | static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list), | 140 | static int common_nsleep(const clockid_t, int flags, struct timespec *t, |
| 156 | .lock = SPIN_LOCK_UNLOCKED}; | 141 | struct timespec __user *rmtp); |
| 142 | static void common_timer_get(struct k_itimer *, struct itimerspec *); | ||
| 143 | static int common_timer_set(struct k_itimer *, int, | ||
| 144 | struct itimerspec *, struct itimerspec *); | ||
| 145 | static int common_timer_del(struct k_itimer *timer); | ||
| 157 | 146 | ||
| 158 | static void posix_timer_fn(unsigned long); | 147 | static int posix_timer_fn(void *data); |
| 159 | static u64 do_posix_clock_monotonic_gettime_parts( | ||
| 160 | struct timespec *tp, struct timespec *mo); | ||
| 161 | int do_posix_clock_monotonic_gettime(struct timespec *tp); | ||
| 162 | static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp); | ||
| 163 | 148 | ||
| 164 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); | 149 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); |
| 165 | 150 | ||
| @@ -184,7 +169,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | |||
| 184 | * the function pointer CALL in struct k_clock. | 169 | * the function pointer CALL in struct k_clock. |
| 185 | */ | 170 | */ |
| 186 | 171 | ||
| 187 | static inline int common_clock_getres(clockid_t which_clock, | 172 | static inline int common_clock_getres(const clockid_t which_clock, |
| 188 | struct timespec *tp) | 173 | struct timespec *tp) |
| 189 | { | 174 | { |
| 190 | tp->tv_sec = 0; | 175 | tp->tv_sec = 0; |
| @@ -192,39 +177,33 @@ static inline int common_clock_getres(clockid_t which_clock, | |||
| 192 | return 0; | 177 | return 0; |
| 193 | } | 178 | } |
| 194 | 179 | ||
| 195 | static inline int common_clock_get(clockid_t which_clock, struct timespec *tp) | 180 | /* |
| 181 | * Get real time for posix timers | ||
| 182 | */ | ||
| 183 | static int common_clock_get(clockid_t which_clock, struct timespec *tp) | ||
| 196 | { | 184 | { |
| 197 | getnstimeofday(tp); | 185 | ktime_get_real_ts(tp); |
| 198 | return 0; | 186 | return 0; |
| 199 | } | 187 | } |
| 200 | 188 | ||
| 201 | static inline int common_clock_set(clockid_t which_clock, struct timespec *tp) | 189 | static inline int common_clock_set(const clockid_t which_clock, |
| 190 | struct timespec *tp) | ||
| 202 | { | 191 | { |
| 203 | return do_sys_settimeofday(tp, NULL); | 192 | return do_sys_settimeofday(tp, NULL); |
| 204 | } | 193 | } |
| 205 | 194 | ||
| 206 | static inline int common_timer_create(struct k_itimer *new_timer) | 195 | static inline int common_timer_create(struct k_itimer *new_timer) |
| 207 | { | 196 | { |
| 208 | INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry); | 197 | hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock); |
| 209 | init_timer(&new_timer->it.real.timer); | 198 | new_timer->it.real.timer.data = new_timer; |
| 210 | new_timer->it.real.timer.data = (unsigned long) new_timer; | ||
| 211 | new_timer->it.real.timer.function = posix_timer_fn; | 199 | new_timer->it.real.timer.function = posix_timer_fn; |
| 212 | return 0; | 200 | return 0; |
| 213 | } | 201 | } |
| 214 | 202 | ||
| 215 | /* | 203 | /* |
| 216 | * These ones are defined below. | 204 | * Return nonzero if we know a priori this clockid_t value is bogus. |
| 217 | */ | ||
| 218 | static int common_nsleep(clockid_t, int flags, struct timespec *t); | ||
| 219 | static void common_timer_get(struct k_itimer *, struct itimerspec *); | ||
| 220 | static int common_timer_set(struct k_itimer *, int, | ||
| 221 | struct itimerspec *, struct itimerspec *); | ||
| 222 | static int common_timer_del(struct k_itimer *timer); | ||
| 223 | |||
| 224 | /* | ||
| 225 | * Return nonzero iff we know a priori this clockid_t value is bogus. | ||
| 226 | */ | 205 | */ |
| 227 | static inline int invalid_clockid(clockid_t which_clock) | 206 | static inline int invalid_clockid(const clockid_t which_clock) |
| 228 | { | 207 | { |
| 229 | if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ | 208 | if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ |
| 230 | return 0; | 209 | return 0; |
| @@ -232,26 +211,32 @@ static inline int invalid_clockid(clockid_t which_clock) | |||
| 232 | return 1; | 211 | return 1; |
| 233 | if (posix_clocks[which_clock].clock_getres != NULL) | 212 | if (posix_clocks[which_clock].clock_getres != NULL) |
| 234 | return 0; | 213 | return 0; |
| 235 | #ifndef CLOCK_DISPATCH_DIRECT | ||
| 236 | if (posix_clocks[which_clock].res != 0) | 214 | if (posix_clocks[which_clock].res != 0) |
| 237 | return 0; | 215 | return 0; |
| 238 | #endif | ||
| 239 | return 1; | 216 | return 1; |
| 240 | } | 217 | } |
| 241 | 218 | ||
| 219 | /* | ||
| 220 | * Get monotonic time for posix timers | ||
| 221 | */ | ||
| 222 | static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) | ||
| 223 | { | ||
| 224 | ktime_get_ts(tp); | ||
| 225 | return 0; | ||
| 226 | } | ||
| 242 | 227 | ||
| 243 | /* | 228 | /* |
| 244 | * Initialize everything, well, just everything in Posix clocks/timers ;) | 229 | * Initialize everything, well, just everything in Posix clocks/timers ;) |
| 245 | */ | 230 | */ |
| 246 | static __init int init_posix_timers(void) | 231 | static __init int init_posix_timers(void) |
| 247 | { | 232 | { |
| 248 | struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES, | 233 | struct k_clock clock_realtime = { |
| 249 | .abs_struct = &abs_list | 234 | .clock_getres = hrtimer_get_res, |
| 250 | }; | 235 | }; |
| 251 | struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES, | 236 | struct k_clock clock_monotonic = { |
| 252 | .abs_struct = NULL, | 237 | .clock_getres = hrtimer_get_res, |
| 253 | .clock_get = do_posix_clock_monotonic_get, | 238 | .clock_get = posix_ktime_get_ts, |
| 254 | .clock_set = do_posix_clock_nosettime | 239 | .clock_set = do_posix_clock_nosettime, |
| 255 | }; | 240 | }; |
| 256 | 241 | ||
| 257 | register_posix_clock(CLOCK_REALTIME, &clock_realtime); | 242 | register_posix_clock(CLOCK_REALTIME, &clock_realtime); |
| @@ -265,117 +250,17 @@ static __init int init_posix_timers(void) | |||
| 265 | 250 | ||
| 266 | __initcall(init_posix_timers); | 251 | __initcall(init_posix_timers); |
| 267 | 252 | ||
| 268 | static void tstojiffie(struct timespec *tp, int res, u64 *jiff) | ||
| 269 | { | ||
| 270 | long sec = tp->tv_sec; | ||
| 271 | long nsec = tp->tv_nsec + res - 1; | ||
| 272 | |||
| 273 | if (nsec >= NSEC_PER_SEC) { | ||
| 274 | sec++; | ||
| 275 | nsec -= NSEC_PER_SEC; | ||
| 276 | } | ||
| 277 | |||
| 278 | /* | ||
| 279 | * The scaling constants are defined in <linux/time.h> | ||
| 280 | * The difference between there and here is that we do the | ||
| 281 | * res rounding and compute a 64-bit result (well so does that | ||
| 282 | * but it then throws away the high bits). | ||
| 283 | */ | ||
| 284 | *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) + | ||
| 285 | (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >> | ||
| 286 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | ||
| 287 | } | ||
| 288 | |||
| 289 | /* | ||
| 290 | * This function adjusts the timer as needed as a result of the clock | ||
| 291 | * being set. It should only be called for absolute timers, and then | ||
| 292 | * under the abs_list lock. It computes the time difference and sets | ||
| 293 | * the new jiffies value in the timer. It also updates the timers | ||
| 294 | * reference wall_to_monotonic value. It is complicated by the fact | ||
| 295 | * that tstojiffies() only handles positive times and it needs to work | ||
| 296 | * with both positive and negative times. Also, for negative offsets, | ||
| 297 | * we need to defeat the res round up. | ||
| 298 | * | ||
| 299 | * Return is true if there is a new time, else false. | ||
| 300 | */ | ||
| 301 | static long add_clockset_delta(struct k_itimer *timr, | ||
| 302 | struct timespec *new_wall_to) | ||
| 303 | { | ||
| 304 | struct timespec delta; | ||
| 305 | int sign = 0; | ||
| 306 | u64 exp; | ||
| 307 | |||
| 308 | set_normalized_timespec(&delta, | ||
| 309 | new_wall_to->tv_sec - | ||
| 310 | timr->it.real.wall_to_prev.tv_sec, | ||
| 311 | new_wall_to->tv_nsec - | ||
| 312 | timr->it.real.wall_to_prev.tv_nsec); | ||
| 313 | if (likely(!(delta.tv_sec | delta.tv_nsec))) | ||
| 314 | return 0; | ||
| 315 | if (delta.tv_sec < 0) { | ||
| 316 | set_normalized_timespec(&delta, | ||
| 317 | -delta.tv_sec, | ||
| 318 | 1 - delta.tv_nsec - | ||
| 319 | posix_clocks[timr->it_clock].res); | ||
| 320 | sign++; | ||
| 321 | } | ||
| 322 | tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp); | ||
| 323 | timr->it.real.wall_to_prev = *new_wall_to; | ||
| 324 | timr->it.real.timer.expires += (sign ? -exp : exp); | ||
| 325 | return 1; | ||
| 326 | } | ||
| 327 | |||
| 328 | static void remove_from_abslist(struct k_itimer *timr) | ||
| 329 | { | ||
| 330 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | ||
| 331 | spin_lock(&abs_list.lock); | ||
| 332 | list_del_init(&timr->it.real.abs_timer_entry); | ||
| 333 | spin_unlock(&abs_list.lock); | ||
| 334 | } | ||
| 335 | } | ||
| 336 | |||
| 337 | static void schedule_next_timer(struct k_itimer *timr) | 253 | static void schedule_next_timer(struct k_itimer *timr) |
| 338 | { | 254 | { |
| 339 | struct timespec new_wall_to; | 255 | if (timr->it.real.interval.tv64 == 0) |
| 340 | struct now_struct now; | ||
| 341 | unsigned long seq; | ||
| 342 | |||
| 343 | /* | ||
| 344 | * Set up the timer for the next interval (if there is one). | ||
| 345 | * Note: this code uses the abs_timer_lock to protect | ||
| 346 | * it.real.wall_to_prev and must hold it until exp is set, not exactly | ||
| 347 | * obvious... | ||
| 348 | |||
| 349 | * This function is used for CLOCK_REALTIME* and | ||
| 350 | * CLOCK_MONOTONIC* timers. If we ever want to handle other | ||
| 351 | * CLOCKs, the calling code (do_schedule_next_timer) would need | ||
| 352 | * to pull the "clock" info from the timer and dispatch the | ||
| 353 | * "other" CLOCKs "next timer" code (which, I suppose should | ||
| 354 | * also be added to the k_clock structure). | ||
| 355 | */ | ||
| 356 | if (!timr->it.real.incr) | ||
| 357 | return; | 256 | return; |
| 358 | 257 | ||
| 359 | do { | 258 | timr->it_overrun += hrtimer_forward(&timr->it.real.timer, |
| 360 | seq = read_seqbegin(&xtime_lock); | 259 | timr->it.real.interval); |
| 361 | new_wall_to = wall_to_monotonic; | ||
| 362 | posix_get_now(&now); | ||
| 363 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 364 | |||
| 365 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | ||
| 366 | spin_lock(&abs_list.lock); | ||
| 367 | add_clockset_delta(timr, &new_wall_to); | ||
| 368 | |||
| 369 | posix_bump_timer(timr, now); | ||
| 370 | |||
| 371 | spin_unlock(&abs_list.lock); | ||
| 372 | } else { | ||
| 373 | posix_bump_timer(timr, now); | ||
| 374 | } | ||
| 375 | timr->it_overrun_last = timr->it_overrun; | 260 | timr->it_overrun_last = timr->it_overrun; |
| 376 | timr->it_overrun = -1; | 261 | timr->it_overrun = -1; |
| 377 | ++timr->it_requeue_pending; | 262 | ++timr->it_requeue_pending; |
| 378 | add_timer(&timr->it.real.timer); | 263 | hrtimer_restart(&timr->it.real.timer); |
| 379 | } | 264 | } |
| 380 | 265 | ||
| 381 | /* | 266 | /* |
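The new schedule_next_timer() leans on hrtimer_forward(), which pushes the expiry forward by whole intervals until it lies in the future and returns the number of intervals skipped; that count is what feeds it_overrun. A hedged sketch of the arithmetic in plain C (times as int64_t nanoseconds, not ktime_t; overrun_forward() is a made-up name):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t overrun_forward(int64_t *expires, int64_t now, int64_t interval)
{
	uint64_t overruns;

	if (*expires > now)
		return 0;		/* still in the future: nothing to forward */

	overruns = (uint64_t)(now - *expires) / (uint64_t)interval + 1;
	*expires += (int64_t)overruns * interval;
	return overruns;		/* caller accumulates this into it_overrun */
}

int main(void)
{
	int64_t expires = 1000, now = 4600, interval = 1000;
	uint64_t o = overrun_forward(&expires, now, interval);

	/* 3600 ns late with a 1000 ns period: 4 intervals skipped, new expiry 5000 */
	printf("overruns=%" PRIu64 " new expiry=%" PRId64 "\n", o, expires);
	return 0;
}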
| @@ -396,31 +281,23 @@ void do_schedule_next_timer(struct siginfo *info) | |||
| 396 | 281 | ||
| 397 | timr = lock_timer(info->si_tid, &flags); | 282 | timr = lock_timer(info->si_tid, &flags); |
| 398 | 283 | ||
| 399 | if (!timr || timr->it_requeue_pending != info->si_sys_private) | 284 | if (timr && timr->it_requeue_pending == info->si_sys_private) { |
| 400 | goto exit; | 285 | if (timr->it_clock < 0) |
| 286 | posix_cpu_timer_schedule(timr); | ||
| 287 | else | ||
| 288 | schedule_next_timer(timr); | ||
| 401 | 289 | ||
| 402 | if (timr->it_clock < 0) /* CPU clock */ | 290 | info->si_overrun = timr->it_overrun_last; |
| 403 | posix_cpu_timer_schedule(timr); | 291 | } |
| 404 | else | 292 | |
| 405 | schedule_next_timer(timr); | 293 | unlock_timer(timr, flags); |
| 406 | info->si_overrun = timr->it_overrun_last; | ||
| 407 | exit: | ||
| 408 | if (timr) | ||
| 409 | unlock_timer(timr, flags); | ||
| 410 | } | 294 | } |
| 411 | 295 | ||
| 412 | int posix_timer_event(struct k_itimer *timr,int si_private) | 296 | int posix_timer_event(struct k_itimer *timr,int si_private) |
| 413 | { | 297 | { |
| 414 | memset(&timr->sigq->info, 0, sizeof(siginfo_t)); | 298 | memset(&timr->sigq->info, 0, sizeof(siginfo_t)); |
| 415 | timr->sigq->info.si_sys_private = si_private; | 299 | timr->sigq->info.si_sys_private = si_private; |
| 416 | /* | 300 | /* Send signal to the process that owns this timer.*/ |
| 417 | * Send signal to the process that owns this timer. | ||
| 418 | |||
| 419 | * This code assumes that all the possible abs_lists share the | ||
| 420 | * same lock (there is only one list at this time). If this is | ||
| 421 | * not the case, the CLOCK info would need to be used to find | ||
| 422 | * the proper abs list lock. | ||
| 423 | */ | ||
| 424 | 301 | ||
| 425 | timr->sigq->info.si_signo = timr->it_sigev_signo; | 302 | timr->sigq->info.si_signo = timr->it_sigev_signo; |
| 426 | timr->sigq->info.si_errno = 0; | 303 | timr->sigq->info.si_errno = 0; |
| @@ -454,64 +331,35 @@ EXPORT_SYMBOL_GPL(posix_timer_event); | |||
| 454 | 331 | ||
| 455 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. | 332 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. |
| 456 | */ | 333 | */ |
| 457 | static void posix_timer_fn(unsigned long __data) | 334 | static int posix_timer_fn(void *data) |
| 458 | { | 335 | { |
| 459 | struct k_itimer *timr = (struct k_itimer *) __data; | 336 | struct k_itimer *timr = data; |
| 460 | unsigned long flags; | 337 | unsigned long flags; |
| 461 | unsigned long seq; | 338 | int si_private = 0; |
| 462 | struct timespec delta, new_wall_to; | 339 | int ret = HRTIMER_NORESTART; |
| 463 | u64 exp = 0; | ||
| 464 | int do_notify = 1; | ||
| 465 | 340 | ||
| 466 | spin_lock_irqsave(&timr->it_lock, flags); | 341 | spin_lock_irqsave(&timr->it_lock, flags); |
| 467 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | ||
| 468 | spin_lock(&abs_list.lock); | ||
| 469 | do { | ||
| 470 | seq = read_seqbegin(&xtime_lock); | ||
| 471 | new_wall_to = wall_to_monotonic; | ||
| 472 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 473 | set_normalized_timespec(&delta, | ||
| 474 | new_wall_to.tv_sec - | ||
| 475 | timr->it.real.wall_to_prev.tv_sec, | ||
| 476 | new_wall_to.tv_nsec - | ||
| 477 | timr->it.real.wall_to_prev.tv_nsec); | ||
| 478 | if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) { | ||
| 479 | /* do nothing, timer is on time */ | ||
| 480 | } else if (delta.tv_sec < 0) { | ||
| 481 | /* do nothing, timer is already late */ | ||
| 482 | } else { | ||
| 483 | /* timer is early due to a clock set */ | ||
| 484 | tstojiffie(&delta, | ||
| 485 | posix_clocks[timr->it_clock].res, | ||
| 486 | &exp); | ||
| 487 | timr->it.real.wall_to_prev = new_wall_to; | ||
| 488 | timr->it.real.timer.expires += exp; | ||
| 489 | add_timer(&timr->it.real.timer); | ||
| 490 | do_notify = 0; | ||
| 491 | } | ||
| 492 | spin_unlock(&abs_list.lock); | ||
| 493 | 342 | ||
| 494 | } | 343 | if (timr->it.real.interval.tv64 != 0) |
| 495 | if (do_notify) { | 344 | si_private = ++timr->it_requeue_pending; |
| 496 | int si_private=0; | ||
| 497 | 345 | ||
| 498 | if (timr->it.real.incr) | 346 | if (posix_timer_event(timr, si_private)) { |
| 499 | si_private = ++timr->it_requeue_pending; | 347 | /* |
| 500 | else { | 348 | * signal was not sent because the signal is ignored |
| 501 | remove_from_abslist(timr); | 349 | * we will not get a call back to restart it AND |
| 350 | * it should be restarted. | ||
| 351 | */ | ||
| 352 | if (timr->it.real.interval.tv64 != 0) { | ||
| 353 | timr->it_overrun += | ||
| 354 | hrtimer_forward(&timr->it.real.timer, | ||
| 355 | timr->it.real.interval); | ||
| 356 | ret = HRTIMER_RESTART; | ||
| 502 | } | 357 | } |
| 503 | |||
| 504 | if (posix_timer_event(timr, si_private)) | ||
| 505 | /* | ||
| 506 | * signal was not sent because of sig_ignor | ||
| 507 | * we will not get a call back to restart it AND | ||
| 508 | * it should be restarted. | ||
| 509 | */ | ||
| 510 | schedule_next_timer(timr); | ||
| 511 | } | 358 | } |
| 512 | unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */ | ||
| 513 | } | ||
| 514 | 359 | ||
| 360 | unlock_timer(timr, flags); | ||
| 361 | return ret; | ||
| 362 | } | ||
| 515 | 363 | ||
| 516 | static inline struct task_struct * good_sigevent(sigevent_t * event) | 364 | static inline struct task_struct * good_sigevent(sigevent_t * event) |
| 517 | { | 365 | { |
| @@ -530,7 +378,7 @@ static inline struct task_struct * good_sigevent(sigevent_t * event) | |||
| 530 | return rtn; | 378 | return rtn; |
| 531 | } | 379 | } |
| 532 | 380 | ||
| 533 | void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock) | 381 | void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) |
| 534 | { | 382 | { |
| 535 | if ((unsigned) clock_id >= MAX_CLOCKS) { | 383 | if ((unsigned) clock_id >= MAX_CLOCKS) { |
| 536 | printk("POSIX clock register failed for clock_id %d\n", | 384 | printk("POSIX clock register failed for clock_id %d\n", |
| @@ -576,7 +424,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | |||
| 576 | /* Create a POSIX.1b interval timer. */ | 424 | /* Create a POSIX.1b interval timer. */ |
| 577 | 425 | ||
| 578 | asmlinkage long | 426 | asmlinkage long |
| 579 | sys_timer_create(clockid_t which_clock, | 427 | sys_timer_create(const clockid_t which_clock, |
| 580 | struct sigevent __user *timer_event_spec, | 428 | struct sigevent __user *timer_event_spec, |
| 581 | timer_t __user * created_timer_id) | 429 | timer_t __user * created_timer_id) |
| 582 | { | 430 | { |
| @@ -602,8 +450,7 @@ sys_timer_create(clockid_t which_clock, | |||
| 602 | goto out; | 450 | goto out; |
| 603 | } | 451 | } |
| 604 | spin_lock_irq(&idr_lock); | 452 | spin_lock_irq(&idr_lock); |
| 605 | error = idr_get_new(&posix_timers_id, | 453 | error = idr_get_new(&posix_timers_id, (void *) new_timer, |
| 606 | (void *) new_timer, | ||
| 607 | &new_timer_id); | 454 | &new_timer_id); |
| 608 | spin_unlock_irq(&idr_lock); | 455 | spin_unlock_irq(&idr_lock); |
| 609 | if (error == -EAGAIN) | 456 | if (error == -EAGAIN) |
| @@ -704,27 +551,6 @@ out: | |||
| 704 | } | 551 | } |
| 705 | 552 | ||
| 706 | /* | 553 | /* |
| 707 | * good_timespec | ||
| 708 | * | ||
| 709 | * This function checks the elements of a timespec structure. | ||
| 710 | * | ||
| 711 | * Arguments: | ||
| 712 | * ts : Pointer to the timespec structure to check | ||
| 713 | * | ||
| 714 | * Return value: | ||
| 715 | * If a NULL pointer was passed in, or the tv_nsec field was less than 0 | ||
| 716 | * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0, | ||
| 717 | * this function returns 0. Otherwise it returns 1. | ||
| 718 | */ | ||
| 719 | static int good_timespec(const struct timespec *ts) | ||
| 720 | { | ||
| 721 | if ((!ts) || (ts->tv_sec < 0) || | ||
| 722 | ((unsigned) ts->tv_nsec >= NSEC_PER_SEC)) | ||
| 723 | return 0; | ||
| 724 | return 1; | ||
| 725 | } | ||
| 726 | |||
| 727 | /* | ||
| 728 | * Locking issues: We need to protect the result of the id look up until | 554 | * Locking issues: We need to protect the result of the id look up until |
| 729 | * we get the timer locked down so it is not deleted under us. The | 555 | * we get the timer locked down so it is not deleted under us. The |
| 730 | * removal is done under the idr spinlock so we use that here to bridge | 556 | * removal is done under the idr spinlock so we use that here to bridge |
| @@ -776,39 +602,39 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) | |||
| 776 | static void | 602 | static void |
| 777 | common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) | 603 | common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) |
| 778 | { | 604 | { |
| 779 | unsigned long expires; | 605 | ktime_t remaining; |
| 780 | struct now_struct now; | 606 | struct hrtimer *timer = &timr->it.real.timer; |
| 781 | 607 | ||
| 782 | do | 608 | memset(cur_setting, 0, sizeof(struct itimerspec)); |
| 783 | expires = timr->it.real.timer.expires; | 609 | remaining = hrtimer_get_remaining(timer); |
| 784 | while ((volatile long) (timr->it.real.timer.expires) != expires); | ||
| 785 | |||
| 786 | posix_get_now(&now); | ||
| 787 | |||
| 788 | if (expires && | ||
| 789 | ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) && | ||
| 790 | !timr->it.real.incr && | ||
| 791 | posix_time_before(&timr->it.real.timer, &now)) | ||
| 792 | timr->it.real.timer.expires = expires = 0; | ||
| 793 | if (expires) { | ||
| 794 | if (timr->it_requeue_pending & REQUEUE_PENDING || | ||
| 795 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { | ||
| 796 | posix_bump_timer(timr, now); | ||
| 797 | expires = timr->it.real.timer.expires; | ||
| 798 | } | ||
| 799 | else | ||
| 800 | if (!timer_pending(&timr->it.real.timer)) | ||
| 801 | expires = 0; | ||
| 802 | if (expires) | ||
| 803 | expires -= now.jiffies; | ||
| 804 | } | ||
| 805 | jiffies_to_timespec(expires, &cur_setting->it_value); | ||
| 806 | jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval); | ||
| 807 | 610 | ||
| 808 | if (cur_setting->it_value.tv_sec < 0) { | 611 | /* Time left ? or timer pending */ |
| 612 | if (remaining.tv64 > 0 || hrtimer_active(timer)) | ||
| 613 | goto calci; | ||
| 614 | /* interval timer ? */ | ||
| 615 | if (timr->it.real.interval.tv64 == 0) | ||
| 616 | return; | ||
| 617 | /* | ||
| 618 | * When a requeue is pending or this is a SIGEV_NONE timer | ||
| 619 | * move the expiry time forward by intervals, so expiry is > | ||
| 620 | * now. | ||
| 621 | */ | ||
| 622 | if (timr->it_requeue_pending & REQUEUE_PENDING || | ||
| 623 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { | ||
| 624 | timr->it_overrun += | ||
| 625 | hrtimer_forward(timer, timr->it.real.interval); | ||
| 626 | remaining = hrtimer_get_remaining(timer); | ||
| 627 | } | ||
| 628 | calci: | ||
| 629 | /* interval timer ? */ | ||
| 630 | if (timr->it.real.interval.tv64 != 0) | ||
| 631 | cur_setting->it_interval = | ||
| 632 | ktime_to_timespec(timr->it.real.interval); | ||
| 633 | /* Return 0 only, when the timer is expired and not pending */ | ||
| 634 | if (remaining.tv64 <= 0) | ||
| 809 | cur_setting->it_value.tv_nsec = 1; | 635 | cur_setting->it_value.tv_nsec = 1; |
| 810 | cur_setting->it_value.tv_sec = 0; | 636 | else |
| 811 | } | 637 | cur_setting->it_value = ktime_to_timespec(remaining); |
| 812 | } | 638 | } |
| 813 | 639 | ||
| 814 | /* Get the time remaining on a POSIX.1b interval timer. */ | 640 | /* Get the time remaining on a POSIX.1b interval timer. */ |
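The rewritten common_timer_get() reads the remaining time straight from hrtimer_get_remaining() and, for a timer that has expired but whose signal delivery is still pending, reports a 1 ns residue rather than zero so it stays distinguishable from a disarmed timer. From user space this is simply the timer_gettime() contract; a small illustrative program (assumes a POSIX system, link with -lrt on older glibc):

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	timer_t tid;
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };
	struct itimerspec its = {
		.it_value    = { .tv_sec = 2, .tv_nsec = 0 },	/* first expiry in 2s */
		.it_interval = { .tv_sec = 1, .tv_nsec = 0 },	/* then every second */
	};
	struct itimerspec cur;

	if (timer_create(CLOCK_MONOTONIC, &sev, &tid) ||
	    timer_settime(tid, 0, &its, NULL) ||
	    timer_gettime(tid, &cur))
		return 1;

	printf("value=%ld.%09ld interval=%ld.%09ld\n",
	       (long)cur.it_value.tv_sec, cur.it_value.tv_nsec,
	       (long)cur.it_interval.tv_sec, cur.it_interval.tv_nsec);

	timer_delete(tid);
	return 0;
}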
| @@ -832,6 +658,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) | |||
| 832 | 658 | ||
| 833 | return 0; | 659 | return 0; |
| 834 | } | 660 | } |
| 661 | |||
| 835 | /* | 662 | /* |
| 836 | * Get the number of overruns of a POSIX.1b interval timer. This is to | 663 | * Get the number of overruns of a POSIX.1b interval timer. This is to |
| 837 | * be the overrun of the timer last delivered. At the same time we are | 664 | * be the overrun of the timer last delivered. At the same time we are |
| @@ -841,7 +668,6 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) | |||
| 841 | * the call back to do_schedule_next_timer(). So all we need to do is | 668 | * the call back to do_schedule_next_timer(). So all we need to do is |
| 842 | * to pick up the frozen overrun. | 669 | * to pick up the frozen overrun. |
| 843 | */ | 670 | */ |
| 844 | |||
| 845 | asmlinkage long | 671 | asmlinkage long |
| 846 | sys_timer_getoverrun(timer_t timer_id) | 672 | sys_timer_getoverrun(timer_t timer_id) |
| 847 | { | 673 | { |
| @@ -858,84 +684,6 @@ sys_timer_getoverrun(timer_t timer_id) | |||
| 858 | 684 | ||
| 859 | return overrun; | 685 | return overrun; |
| 860 | } | 686 | } |
| 861 | /* | ||
| 862 | * Adjust for absolute time | ||
| 863 | * | ||
| 864 | * If absolute time is given and it is not CLOCK_MONOTONIC, we need to | ||
| 865 | * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and | ||
| 866 | * what ever clock he is using. | ||
| 867 | * | ||
| 868 | * If it is relative time, we need to add the current (CLOCK_MONOTONIC) | ||
| 869 | * time to it to get the proper time for the timer. | ||
| 870 | */ | ||
| 871 | static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, | ||
| 872 | int abs, u64 *exp, struct timespec *wall_to) | ||
| 873 | { | ||
| 874 | struct timespec now; | ||
| 875 | struct timespec oc = *tp; | ||
| 876 | u64 jiffies_64_f; | ||
| 877 | int rtn =0; | ||
| 878 | |||
| 879 | if (abs) { | ||
| 880 | /* | ||
| 881 | * The mask pick up the 4 basic clocks | ||
| 882 | */ | ||
| 883 | if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) { | ||
| 884 | jiffies_64_f = do_posix_clock_monotonic_gettime_parts( | ||
| 885 | &now, wall_to); | ||
| 886 | /* | ||
| 887 | * If we are doing a MONOTONIC clock | ||
| 888 | */ | ||
| 889 | if((clock - &posix_clocks[0]) & CLOCKS_MONO){ | ||
| 890 | now.tv_sec += wall_to->tv_sec; | ||
| 891 | now.tv_nsec += wall_to->tv_nsec; | ||
| 892 | } | ||
| 893 | } else { | ||
| 894 | /* | ||
| 895 | * Not one of the basic clocks | ||
| 896 | */ | ||
| 897 | clock->clock_get(clock - posix_clocks, &now); | ||
| 898 | jiffies_64_f = get_jiffies_64(); | ||
| 899 | } | ||
| 900 | /* | ||
| 901 | * Take away now to get delta and normalize | ||
| 902 | */ | ||
| 903 | set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec, | ||
| 904 | oc.tv_nsec - now.tv_nsec); | ||
| 905 | }else{ | ||
| 906 | jiffies_64_f = get_jiffies_64(); | ||
| 907 | } | ||
| 908 | /* | ||
| 909 | * Check if the requested time is prior to now (if so set now) | ||
| 910 | */ | ||
| 911 | if (oc.tv_sec < 0) | ||
| 912 | oc.tv_sec = oc.tv_nsec = 0; | ||
| 913 | |||
| 914 | if (oc.tv_sec | oc.tv_nsec) | ||
| 915 | set_normalized_timespec(&oc, oc.tv_sec, | ||
| 916 | oc.tv_nsec + clock->res); | ||
| 917 | tstojiffie(&oc, clock->res, exp); | ||
| 918 | |||
| 919 | /* | ||
| 920 | * Check if the requested time is more than the timer code | ||
| 921 | * can handle (if so we error out but return the value too). | ||
| 922 | */ | ||
| 923 | if (*exp > ((u64)MAX_JIFFY_OFFSET)) | ||
| 924 | /* | ||
| 925 | * This is a considered response, not exactly in | ||
| 926 | * line with the standard (in fact it is silent on | ||
| 927 | * possible overflows). We assume such a large | ||
| 928 | * value is ALMOST always a programming error and | ||
| 929 | * try not to compound it by setting a really dumb | ||
| 930 | * value. | ||
| 931 | */ | ||
| 932 | rtn = -EINVAL; | ||
| 933 | /* | ||
| 934 | * return the actual jiffies expire time, full 64 bits | ||
| 935 | */ | ||
| 936 | *exp += jiffies_64_f; | ||
| 937 | return rtn; | ||
| 938 | } | ||
| 939 | 687 | ||
| 940 | /* Set a POSIX.1b interval timer. */ | 688 | /* Set a POSIX.1b interval timer. */ |
| 941 | /* timr->it_lock is taken. */ | 689 | /* timr->it_lock is taken. */ |
| @@ -943,68 +691,48 @@ static inline int | |||
| 943 | common_timer_set(struct k_itimer *timr, int flags, | 691 | common_timer_set(struct k_itimer *timr, int flags, |
| 944 | struct itimerspec *new_setting, struct itimerspec *old_setting) | 692 | struct itimerspec *new_setting, struct itimerspec *old_setting) |
| 945 | { | 693 | { |
| 946 | struct k_clock *clock = &posix_clocks[timr->it_clock]; | 694 | struct hrtimer *timer = &timr->it.real.timer; |
| 947 | u64 expire_64; | ||
| 948 | 695 | ||
| 949 | if (old_setting) | 696 | if (old_setting) |
| 950 | common_timer_get(timr, old_setting); | 697 | common_timer_get(timr, old_setting); |
| 951 | 698 | ||
| 952 | /* disable the timer */ | 699 | /* disable the timer */ |
| 953 | timr->it.real.incr = 0; | 700 | timr->it.real.interval.tv64 = 0; |
| 954 | /* | 701 | /* |
| 955 | * careful here. If smp we could be in the "fire" routine which will | 702 | * careful here. If smp we could be in the "fire" routine which will |
| 956 | * be spinning as we hold the lock. But this is ONLY an SMP issue. | 703 | * be spinning as we hold the lock. But this is ONLY an SMP issue. |
| 957 | */ | 704 | */ |
| 958 | if (try_to_del_timer_sync(&timr->it.real.timer) < 0) { | 705 | if (hrtimer_try_to_cancel(timer) < 0) |
| 959 | #ifdef CONFIG_SMP | ||
| 960 | /* | ||
| 961 | * It can only be active if on an other cpu. Since | ||
| 962 | * we have cleared the interval stuff above, it should | ||
| 963 | * clear once we release the spin lock. Of course once | ||
| 964 | * we do that anything could happen, including the | ||
| 965 | * complete melt down of the timer. So return with | ||
| 966 | * a "retry" exit status. | ||
| 967 | */ | ||
| 968 | return TIMER_RETRY; | 706 | return TIMER_RETRY; |
| 969 | #endif | ||
| 970 | } | ||
| 971 | |||
| 972 | remove_from_abslist(timr); | ||
| 973 | 707 | ||
| 974 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & | 708 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & |
| 975 | ~REQUEUE_PENDING; | 709 | ~REQUEUE_PENDING; |
| 976 | timr->it_overrun_last = 0; | 710 | timr->it_overrun_last = 0; |
| 977 | timr->it_overrun = -1; | ||
| 978 | /* | ||
| 979 | *switch off the timer when it_value is zero | ||
| 980 | */ | ||
| 981 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) { | ||
| 982 | timr->it.real.timer.expires = 0; | ||
| 983 | return 0; | ||
| 984 | } | ||
| 985 | 711 | ||
| 986 | if (adjust_abs_time(clock, | 712 | /* switch off the timer when it_value is zero */ |
| 987 | &new_setting->it_value, flags & TIMER_ABSTIME, | 713 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) |
| 988 | &expire_64, &(timr->it.real.wall_to_prev))) { | 714 | return 0; |
| 989 | return -EINVAL; | ||
| 990 | } | ||
| 991 | timr->it.real.timer.expires = (unsigned long)expire_64; | ||
| 992 | tstojiffie(&new_setting->it_interval, clock->res, &expire_64); | ||
| 993 | timr->it.real.incr = (unsigned long)expire_64; | ||
| 994 | 715 | ||
| 995 | /* | 716 | /* Posix madness. Only absolute CLOCK_REALTIME timers |
| 996 | * We do not even queue SIGEV_NONE timers! But we do put them | 717 | * are affected by clock sets. So we must reiniatilize |
| 997 | * in the abs list so we can do that right. | 718 | * the timer. |
| 998 | */ | 719 | */ |
| 999 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)) | 720 | if (timr->it_clock == CLOCK_REALTIME && (flags & TIMER_ABSTIME)) |
| 1000 | add_timer(&timr->it.real.timer); | 721 | hrtimer_rebase(timer, CLOCK_REALTIME); |
| 1001 | 722 | else | |
| 1002 | if (flags & TIMER_ABSTIME && clock->abs_struct) { | 723 | hrtimer_rebase(timer, CLOCK_MONOTONIC); |
| 1003 | spin_lock(&clock->abs_struct->lock); | 724 | |
| 1004 | list_add_tail(&(timr->it.real.abs_timer_entry), | 725 | timer->expires = timespec_to_ktime(new_setting->it_value); |
| 1005 | &(clock->abs_struct->list)); | 726 | |
| 1006 | spin_unlock(&clock->abs_struct->lock); | 727 | /* Convert interval */ |
| 1007 | } | 728 | timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); |
| 729 | |||
| 730 | /* SIGEV_NONE timers are not queued ! See common_timer_get */ | ||
| 731 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) | ||
| 732 | return 0; | ||
| 733 | |||
| 734 | hrtimer_start(timer, timer->expires, (flags & TIMER_ABSTIME) ? | ||
| 735 | HRTIMER_ABS : HRTIMER_REL); | ||
| 1008 | return 0; | 736 | return 0; |
| 1009 | } | 737 | } |
| 1010 | 738 | ||
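The rebasing above is driven entirely by the TIMER_ABSTIME flag: only an absolute CLOCK_REALTIME timer must track clock settings, so the underlying hrtimer is put on the CLOCK_REALTIME base in that case and on CLOCK_MONOTONIC otherwise. From user space the knob looks like this hedged sketch, which arms a one-shot absolute wall-clock timer ten seconds out (illustrative only; link with -lrt on older glibc):

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	timer_t tid;
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };
	struct timespec now;
	struct itimerspec its = { .it_interval = { 0, 0 } };	/* one-shot */

	if (timer_create(CLOCK_REALTIME, &sev, &tid) ||
	    clock_gettime(CLOCK_REALTIME, &now))
		return 1;

	its.it_value.tv_sec = now.tv_sec + 10;	/* an absolute wall-clock instant */
	if (timer_settime(tid, TIMER_ABSTIME, &its, NULL))
		return 1;

	/* a clock_settime() that jumps the clock past this instant fires the
	 * timer immediately; a relative arm would not be affected */
	printf("armed for wall-clock second %ld\n", (long)its.it_value.tv_sec);
	timer_delete(tid);
	return 0;
}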
| @@ -1026,8 +754,8 @@ sys_timer_settime(timer_t timer_id, int flags, | |||
| 1026 | if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) | 754 | if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) |
| 1027 | return -EFAULT; | 755 | return -EFAULT; |
| 1028 | 756 | ||
| 1029 | if ((!good_timespec(&new_spec.it_interval)) || | 757 | if (!timespec_valid(&new_spec.it_interval) || |
| 1030 | (!good_timespec(&new_spec.it_value))) | 758 | !timespec_valid(&new_spec.it_value)) |
| 1031 | return -EINVAL; | 759 | return -EINVAL; |
| 1032 | retry: | 760 | retry: |
| 1033 | timr = lock_timer(timer_id, &flag); | 761 | timr = lock_timer(timer_id, &flag); |
| @@ -1043,8 +771,8 @@ retry: | |||
| 1043 | goto retry; | 771 | goto retry; |
| 1044 | } | 772 | } |
| 1045 | 773 | ||
| 1046 | if (old_setting && !error && copy_to_user(old_setting, | 774 | if (old_setting && !error && |
| 1047 | &old_spec, sizeof (old_spec))) | 775 | copy_to_user(old_setting, &old_spec, sizeof (old_spec))) |
| 1048 | error = -EFAULT; | 776 | error = -EFAULT; |
| 1049 | 777 | ||
| 1050 | return error; | 778 | return error; |
| @@ -1052,24 +780,10 @@ retry: | |||
| 1052 | 780 | ||
| 1053 | static inline int common_timer_del(struct k_itimer *timer) | 781 | static inline int common_timer_del(struct k_itimer *timer) |
| 1054 | { | 782 | { |
| 1055 | timer->it.real.incr = 0; | 783 | timer->it.real.interval.tv64 = 0; |
| 1056 | 784 | ||
| 1057 | if (try_to_del_timer_sync(&timer->it.real.timer) < 0) { | 785 | if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) |
| 1058 | #ifdef CONFIG_SMP | ||
| 1059 | /* | ||
| 1060 | * It can only be active if on an other cpu. Since | ||
| 1061 | * we have cleared the interval stuff above, it should | ||
| 1062 | * clear once we release the spin lock. Of course once | ||
| 1063 | * we do that anything could happen, including the | ||
| 1064 | * complete melt down of the timer. So return with | ||
| 1065 | * a "retry" exit status. | ||
| 1066 | */ | ||
| 1067 | return TIMER_RETRY; | 786 | return TIMER_RETRY; |
| 1068 | #endif | ||
| 1069 | } | ||
| 1070 | |||
| 1071 | remove_from_abslist(timer); | ||
| 1072 | |||
| 1073 | return 0; | 787 | return 0; |
| 1074 | } | 788 | } |
| 1075 | 789 | ||
| @@ -1085,24 +799,16 @@ sys_timer_delete(timer_t timer_id) | |||
| 1085 | struct k_itimer *timer; | 799 | struct k_itimer *timer; |
| 1086 | long flags; | 800 | long flags; |
| 1087 | 801 | ||
| 1088 | #ifdef CONFIG_SMP | ||
| 1089 | int error; | ||
| 1090 | retry_delete: | 802 | retry_delete: |
| 1091 | #endif | ||
| 1092 | timer = lock_timer(timer_id, &flags); | 803 | timer = lock_timer(timer_id, &flags); |
| 1093 | if (!timer) | 804 | if (!timer) |
| 1094 | return -EINVAL; | 805 | return -EINVAL; |
| 1095 | 806 | ||
| 1096 | #ifdef CONFIG_SMP | 807 | if (timer_delete_hook(timer) == TIMER_RETRY) { |
| 1097 | error = timer_delete_hook(timer); | ||
| 1098 | |||
| 1099 | if (error == TIMER_RETRY) { | ||
| 1100 | unlock_timer(timer, flags); | 808 | unlock_timer(timer, flags); |
| 1101 | goto retry_delete; | 809 | goto retry_delete; |
| 1102 | } | 810 | } |
| 1103 | #else | 811 | |
| 1104 | timer_delete_hook(timer); | ||
| 1105 | #endif | ||
| 1106 | spin_lock(¤t->sighand->siglock); | 812 | spin_lock(¤t->sighand->siglock); |
| 1107 | list_del(&timer->list); | 813 | list_del(&timer->list); |
| 1108 | spin_unlock(¤t->sighand->siglock); | 814 | spin_unlock(¤t->sighand->siglock); |
| @@ -1119,6 +825,7 @@ retry_delete: | |||
| 1119 | release_posix_timer(timer, IT_ID_SET); | 825 | release_posix_timer(timer, IT_ID_SET); |
| 1120 | return 0; | 826 | return 0; |
| 1121 | } | 827 | } |
| 828 | |||
| 1122 | /* | 829 | /* |
| 1123 | * return timer owned by the process, used by exit_itimers | 830 | * return timer owned by the process, used by exit_itimers |
| 1124 | */ | 831 | */ |
| @@ -1126,22 +833,13 @@ static inline void itimer_delete(struct k_itimer *timer) | |||
| 1126 | { | 833 | { |
| 1127 | unsigned long flags; | 834 | unsigned long flags; |
| 1128 | 835 | ||
| 1129 | #ifdef CONFIG_SMP | ||
| 1130 | int error; | ||
| 1131 | retry_delete: | 836 | retry_delete: |
| 1132 | #endif | ||
| 1133 | spin_lock_irqsave(&timer->it_lock, flags); | 837 | spin_lock_irqsave(&timer->it_lock, flags); |
| 1134 | 838 | ||
| 1135 | #ifdef CONFIG_SMP | 839 | if (timer_delete_hook(timer) == TIMER_RETRY) { |
| 1136 | error = timer_delete_hook(timer); | ||
| 1137 | |||
| 1138 | if (error == TIMER_RETRY) { | ||
| 1139 | unlock_timer(timer, flags); | 840 | unlock_timer(timer, flags); |
| 1140 | goto retry_delete; | 841 | goto retry_delete; |
| 1141 | } | 842 | } |
| 1142 | #else | ||
| 1143 | timer_delete_hook(timer); | ||
| 1144 | #endif | ||
| 1145 | list_del(&timer->list); | 843 | list_del(&timer->list); |
| 1146 | /* | 844 | /* |
| 1147 | * This keeps any tasks waiting on the spin lock from thinking | 845 | * This keeps any tasks waiting on the spin lock from thinking |
| @@ -1170,57 +868,8 @@ void exit_itimers(struct signal_struct *sig) | |||
| 1170 | } | 868 | } |
| 1171 | } | 869 | } |
| 1172 | 870 | ||
| 1173 | /* | 871 | /* Not available / possible... functions */ |
| 1174 | * And now for the "clock" calls | 872 | int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) |
| 1175 | * | ||
| 1176 | * These functions are called both from timer functions (with the timer | ||
| 1177 | * spin_lock_irq() held and from clock calls with no locking. They must | ||
| 1178 | * use the save flags versions of locks. | ||
| 1179 | */ | ||
| 1180 | |||
| 1181 | /* | ||
| 1182 | * We do ticks here to avoid the irq lock ( they take sooo long). | ||
| 1183 | * The seqlock is great here. Since we a reader, we don't really care | ||
| 1184 | * if we are interrupted since we don't take lock that will stall us or | ||
| 1185 | * any other cpu. Voila, no irq lock is needed. | ||
| 1186 | * | ||
| 1187 | */ | ||
| 1188 | |||
| 1189 | static u64 do_posix_clock_monotonic_gettime_parts( | ||
| 1190 | struct timespec *tp, struct timespec *mo) | ||
| 1191 | { | ||
| 1192 | u64 jiff; | ||
| 1193 | unsigned int seq; | ||
| 1194 | |||
| 1195 | do { | ||
| 1196 | seq = read_seqbegin(&xtime_lock); | ||
| 1197 | getnstimeofday(tp); | ||
| 1198 | *mo = wall_to_monotonic; | ||
| 1199 | jiff = jiffies_64; | ||
| 1200 | |||
| 1201 | } while(read_seqretry(&xtime_lock, seq)); | ||
| 1202 | |||
| 1203 | return jiff; | ||
| 1204 | } | ||
| 1205 | |||
| 1206 | static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp) | ||
| 1207 | { | ||
| 1208 | struct timespec wall_to_mono; | ||
| 1209 | |||
| 1210 | do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); | ||
| 1211 | |||
| 1212 | set_normalized_timespec(tp, tp->tv_sec + wall_to_mono.tv_sec, | ||
| 1213 | tp->tv_nsec + wall_to_mono.tv_nsec); | ||
| 1214 | |||
| 1215 | return 0; | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | int do_posix_clock_monotonic_gettime(struct timespec *tp) | ||
| 1219 | { | ||
| 1220 | return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp); | ||
| 1221 | } | ||
| 1222 | |||
| 1223 | int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp) | ||
| 1224 | { | 873 | { |
| 1225 | return -EINVAL; | 874 | return -EINVAL; |
| 1226 | } | 875 | } |
| @@ -1232,7 +881,8 @@ int do_posix_clock_notimer_create(struct k_itimer *timer) | |||
| 1232 | } | 881 | } |
| 1233 | EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); | 882 | EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); |
| 1234 | 883 | ||
| 1235 | int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) | 884 | int do_posix_clock_nonanosleep(const clockid_t clock, int flags, |
| 885 | struct timespec *t, struct timespec __user *r) | ||
| 1236 | { | 886 | { |
| 1237 | #ifndef ENOTSUP | 887 | #ifndef ENOTSUP |
| 1238 | return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ | 888 | return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ |
| @@ -1242,8 +892,8 @@ int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) | |||
| 1242 | } | 892 | } |
| 1243 | EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); | 893 | EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); |
| 1244 | 894 | ||
| 1245 | asmlinkage long | 895 | asmlinkage long sys_clock_settime(const clockid_t which_clock, |
| 1246 | sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) | 896 | const struct timespec __user *tp) |
| 1247 | { | 897 | { |
| 1248 | struct timespec new_tp; | 898 | struct timespec new_tp; |
| 1249 | 899 | ||
| @@ -1256,7 +906,7 @@ sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) | |||
| 1256 | } | 906 | } |
| 1257 | 907 | ||
| 1258 | asmlinkage long | 908 | asmlinkage long |
| 1259 | sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) | 909 | sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp) |
| 1260 | { | 910 | { |
| 1261 | struct timespec kernel_tp; | 911 | struct timespec kernel_tp; |
| 1262 | int error; | 912 | int error; |
| @@ -1273,7 +923,7 @@ sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) | |||
| 1273 | } | 923 | } |
| 1274 | 924 | ||
| 1275 | asmlinkage long | 925 | asmlinkage long |
| 1276 | sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) | 926 | sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) |
| 1277 | { | 927 | { |
| 1278 | struct timespec rtn_tp; | 928 | struct timespec rtn_tp; |
| 1279 | int error; | 929 | int error; |
| @@ -1292,117 +942,34 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) | |||
| 1292 | } | 942 | } |
| 1293 | 943 | ||
| 1294 | /* | 944 | /* |
| 1295 | * The standard says that an absolute nanosleep call MUST wake up at | 945 | * nanosleep for monotonic and realtime clocks |
| 1296 | * the requested time in spite of clock settings. Here is what we do: | ||
| 1297 | * For each nanosleep call that needs it (only absolute and not on | ||
| 1298 | * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure | ||
| 1299 | * into the "nanosleep_abs_list". All we need is the task_struct pointer. | ||
| 1300 | * When ever the clock is set we just wake up all those tasks. The rest | ||
| 1301 | * is done by the while loop in clock_nanosleep(). | ||
| 1302 | * | ||
| 1303 | * On locking, clock_was_set() is called from update_wall_clock which | ||
| 1304 | * holds (or has held for it) a write_lock_irq( xtime_lock) and is | ||
| 1305 | * called from the timer bh code. Thus we need the irq save locks. | ||
| 1306 | * | ||
| 1307 | * Also, on the call from update_wall_clock, that is done as part of a | ||
| 1308 | * softirq thing. We don't want to delay the system that much (possibly | ||
| 1309 | * long list of timers to fix), so we defer that work to keventd. | ||
| 1310 | */ | 946 | */ |
| 1311 | 947 | static int common_nsleep(const clockid_t which_clock, int flags, | |
| 1312 | static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue); | 948 | struct timespec *tsave, struct timespec __user *rmtp) |
| 1313 | static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL); | 949 | { |
| 1314 | 950 | int mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; | |
| 1315 | static DECLARE_MUTEX(clock_was_set_lock); | 951 | int clockid = which_clock; |
| 1316 | 952 | ||
| 1317 | void clock_was_set(void) | 953 | switch (which_clock) { |
| 1318 | { | 954 | case CLOCK_REALTIME: |
| 1319 | struct k_itimer *timr; | 955 | /* Posix madness. Only absolute timers on clock realtime |
| 1320 | struct timespec new_wall_to; | 956 | are affected by clock set. */ |
| 1321 | LIST_HEAD(cws_list); | 957 | if (mode != HRTIMER_ABS) |
| 1322 | unsigned long seq; | 958 | clockid = CLOCK_MONOTONIC; |
| 1323 | 959 | case CLOCK_MONOTONIC: | |
| 1324 | 960 | break; | |
| 1325 | if (unlikely(in_interrupt())) { | 961 | default: |
| 1326 | schedule_work(&clock_was_set_work); | 962 | return -EINVAL; |
| 1327 | return; | ||
| 1328 | } | 963 | } |
| 1329 | wake_up_all(&nanosleep_abs_wqueue); | 964 | return hrtimer_nanosleep(tsave, rmtp, mode, clockid); |
| 1330 | |||
| 1331 | /* | ||
| 1332 | * Check if there exist TIMER_ABSTIME timers to correct. | ||
| 1333 | * | ||
| 1334 | * Notes on locking: This code is run in task context with irq | ||
| 1335 | * on. We CAN be interrupted! All other usage of the abs list | ||
| 1336 | * lock is under the timer lock which holds the irq lock as | ||
| 1337 | * well. We REALLY don't want to scan the whole list with the | ||
| 1338 | * interrupt system off, AND we would like a sequence lock on | ||
| 1339 | * this code as well. Since we assume that the clock will not | ||
| 1340 | * be set often, it seems ok to take and release the irq lock | ||
| 1341 | * for each timer. In fact add_timer will do this, so this is | ||
| 1342 | * not an issue. So we know when we are done, we will move the | ||
| 1343 | * whole list to a new location. Then as we process each entry, | ||
| 1344 | * we will move it to the actual list again. This way, when our | ||
| 1345 | * copy is empty, we are done. We are not all that concerned | ||
| 1346 | * about preemption so we will use a semaphore lock to protect | ||
| 1347 | * aginst reentry. This way we will not stall another | ||
| 1348 | * processor. It is possible that this may delay some timers | ||
| 1349 | * that should have expired, given the new clock, but even this | ||
| 1350 | * will be minimal as we will always update to the current time, | ||
| 1351 | * even if it was set by a task that is waiting for entry to | ||
| 1352 | * this code. Timers that expire too early will be caught by | ||
| 1353 | * the expire code and restarted. | ||
| 1354 | |||
| 1355 | * Absolute timers that repeat are left in the abs list while | ||
| 1356 | * waiting for the task to pick up the signal. This means we | ||
| 1357 | * may find timers that are not in the "add_timer" list, but are | ||
| 1358 | * in the abs list. We do the same thing for these, save | ||
| 1359 | * putting them back in the "add_timer" list. (Note, these are | ||
| 1360 | * left in the abs list mainly to indicate that they are | ||
| 1361 | * ABSOLUTE timers, a fact that is used by the re-arm code, and | ||
| 1362 | * for which we have no other flag.) | ||
| 1363 | |||
| 1364 | */ | ||
| 1365 | |||
| 1366 | down(&clock_was_set_lock); | ||
| 1367 | spin_lock_irq(&abs_list.lock); | ||
| 1368 | list_splice_init(&abs_list.list, &cws_list); | ||
| 1369 | spin_unlock_irq(&abs_list.lock); | ||
| 1370 | do { | ||
| 1371 | do { | ||
| 1372 | seq = read_seqbegin(&xtime_lock); | ||
| 1373 | new_wall_to = wall_to_monotonic; | ||
| 1374 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 1375 | |||
| 1376 | spin_lock_irq(&abs_list.lock); | ||
| 1377 | if (list_empty(&cws_list)) { | ||
| 1378 | spin_unlock_irq(&abs_list.lock); | ||
| 1379 | break; | ||
| 1380 | } | ||
| 1381 | timr = list_entry(cws_list.next, struct k_itimer, | ||
| 1382 | it.real.abs_timer_entry); | ||
| 1383 | |||
| 1384 | list_del_init(&timr->it.real.abs_timer_entry); | ||
| 1385 | if (add_clockset_delta(timr, &new_wall_to) && | ||
| 1386 | del_timer(&timr->it.real.timer)) /* timer run yet? */ | ||
| 1387 | add_timer(&timr->it.real.timer); | ||
| 1388 | list_add(&timr->it.real.abs_timer_entry, &abs_list.list); | ||
| 1389 | spin_unlock_irq(&abs_list.lock); | ||
| 1390 | } while (1); | ||
| 1391 | |||
| 1392 | up(&clock_was_set_lock); | ||
| 1393 | } | 965 | } |
| 1394 | 966 | ||
| 1395 | long clock_nanosleep_restart(struct restart_block *restart_block); | ||
| 1396 | |||
| 1397 | asmlinkage long | 967 | asmlinkage long |
| 1398 | sys_clock_nanosleep(clockid_t which_clock, int flags, | 968 | sys_clock_nanosleep(const clockid_t which_clock, int flags, |
| 1399 | const struct timespec __user *rqtp, | 969 | const struct timespec __user *rqtp, |
| 1400 | struct timespec __user *rmtp) | 970 | struct timespec __user *rmtp) |
| 1401 | { | 971 | { |
| 1402 | struct timespec t; | 972 | struct timespec t; |
| 1403 | struct restart_block *restart_block = | ||
| 1404 | &(current_thread_info()->restart_block); | ||
| 1405 | int ret; | ||
| 1406 | 973 | ||
| 1407 | if (invalid_clockid(which_clock)) | 974 | if (invalid_clockid(which_clock)) |
| 1408 | return -EINVAL; | 975 | return -EINVAL; |
| @@ -1410,125 +977,9 @@ sys_clock_nanosleep(clockid_t which_clock, int flags, | |||
| 1410 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | 977 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) |
| 1411 | return -EFAULT; | 978 | return -EFAULT; |
| 1412 | 979 | ||
| 1413 | if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0) | 980 | if (!timespec_valid(&t)) |
| 1414 | return -EINVAL; | 981 | return -EINVAL; |
| 1415 | 982 | ||
| 1416 | /* | 983 | return CLOCK_DISPATCH(which_clock, nsleep, |
| 1417 | * Do this here as nsleep function does not have the real address. | 984 | (which_clock, flags, &t, rmtp)); |
| 1418 | */ | ||
| 1419 | restart_block->arg1 = (unsigned long)rmtp; | ||
| 1420 | |||
| 1421 | ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t)); | ||
| 1422 | |||
| 1423 | if ((ret == -ERESTART_RESTARTBLOCK) && rmtp && | ||
| 1424 | copy_to_user(rmtp, &t, sizeof (t))) | ||
| 1425 | return -EFAULT; | ||
| 1426 | return ret; | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | |||
| 1430 | static int common_nsleep(clockid_t which_clock, | ||
| 1431 | int flags, struct timespec *tsave) | ||
| 1432 | { | ||
| 1433 | struct timespec t, dum; | ||
| 1434 | DECLARE_WAITQUEUE(abs_wqueue, current); | ||
| 1435 | u64 rq_time = (u64)0; | ||
| 1436 | s64 left; | ||
| 1437 | int abs; | ||
| 1438 | struct restart_block *restart_block = | ||
| 1439 | &current_thread_info()->restart_block; | ||
| 1440 | |||
| 1441 | abs_wqueue.flags = 0; | ||
| 1442 | abs = flags & TIMER_ABSTIME; | ||
| 1443 | |||
| 1444 | if (restart_block->fn == clock_nanosleep_restart) { | ||
| 1445 | /* | ||
| 1446 | * Interrupted by a non-delivered signal, pick up remaining | ||
| 1447 | * time and continue. Remaining time is in arg2 & 3. | ||
| 1448 | */ | ||
| 1449 | restart_block->fn = do_no_restart_syscall; | ||
| 1450 | |||
| 1451 | rq_time = restart_block->arg3; | ||
| 1452 | rq_time = (rq_time << 32) + restart_block->arg2; | ||
| 1453 | if (!rq_time) | ||
| 1454 | return -EINTR; | ||
| 1455 | left = rq_time - get_jiffies_64(); | ||
| 1456 | if (left <= (s64)0) | ||
| 1457 | return 0; /* Already passed */ | ||
| 1458 | } | ||
| 1459 | |||
| 1460 | if (abs && (posix_clocks[which_clock].clock_get != | ||
| 1461 | posix_clocks[CLOCK_MONOTONIC].clock_get)) | ||
| 1462 | add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue); | ||
| 1463 | |||
| 1464 | do { | ||
| 1465 | t = *tsave; | ||
| 1466 | if (abs || !rq_time) { | ||
| 1467 | adjust_abs_time(&posix_clocks[which_clock], &t, abs, | ||
| 1468 | &rq_time, &dum); | ||
| 1469 | } | ||
| 1470 | |||
| 1471 | left = rq_time - get_jiffies_64(); | ||
| 1472 | if (left >= (s64)MAX_JIFFY_OFFSET) | ||
| 1473 | left = (s64)MAX_JIFFY_OFFSET; | ||
| 1474 | if (left < (s64)0) | ||
| 1475 | break; | ||
| 1476 | |||
| 1477 | schedule_timeout_interruptible(left); | ||
| 1478 | |||
| 1479 | left = rq_time - get_jiffies_64(); | ||
| 1480 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); | ||
| 1481 | |||
| 1482 | if (abs_wqueue.task_list.next) | ||
| 1483 | finish_wait(&nanosleep_abs_wqueue, &abs_wqueue); | ||
| 1484 | |||
| 1485 | if (left > (s64)0) { | ||
| 1486 | |||
| 1487 | /* | ||
| 1488 | * Always restart abs calls from scratch to pick up any | ||
| 1489 | * clock shifting that happened while we are away. | ||
| 1490 | */ | ||
| 1491 | if (abs) | ||
| 1492 | return -ERESTARTNOHAND; | ||
| 1493 | |||
| 1494 | left *= TICK_NSEC; | ||
| 1495 | tsave->tv_sec = div_long_long_rem(left, | ||
| 1496 | NSEC_PER_SEC, | ||
| 1497 | &tsave->tv_nsec); | ||
| 1498 | /* | ||
| 1499 | * Restart works by saving the time remaing in | ||
| 1500 | * arg2 & 3 (it is 64-bits of jiffies). The other | ||
| 1501 | * info we need is the clock_id (saved in arg0). | ||
| 1502 | * The sys_call interface needs the users | ||
| 1503 | * timespec return address which _it_ saves in arg1. | ||
| 1504 | * Since we have cast the nanosleep call to a clock_nanosleep | ||
| 1505 | * both can be restarted with the same code. | ||
| 1506 | */ | ||
| 1507 | restart_block->fn = clock_nanosleep_restart; | ||
| 1508 | restart_block->arg0 = which_clock; | ||
| 1509 | /* | ||
| 1510 | * Caller sets arg1 | ||
| 1511 | */ | ||
| 1512 | restart_block->arg2 = rq_time & 0xffffffffLL; | ||
| 1513 | restart_block->arg3 = rq_time >> 32; | ||
| 1514 | |||
| 1515 | return -ERESTART_RESTARTBLOCK; | ||
| 1516 | } | ||
| 1517 | |||
| 1518 | return 0; | ||
| 1519 | } | ||
| 1520 | /* | ||
| 1521 | * This will restart clock_nanosleep. | ||
| 1522 | */ | ||
| 1523 | long | ||
| 1524 | clock_nanosleep_restart(struct restart_block *restart_block) | ||
| 1525 | { | ||
| 1526 | struct timespec t; | ||
| 1527 | int ret = common_nsleep(restart_block->arg0, 0, &t); | ||
| 1528 | |||
| 1529 | if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 && | ||
| 1530 | copy_to_user((struct timespec __user *)(restart_block->arg1), &t, | ||
| 1531 | sizeof (t))) | ||
| 1532 | return -EFAULT; | ||
| 1533 | return ret; | ||
| 1534 | } | 985 | } |
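
Editor's note: the new common_nsleep() above folds every supported clock onto hrtimer_nanosleep(); only an absolute CLOCK_REALTIME sleep keeps tracking clock_settime(), while a relative one is quietly run on CLOCK_MONOTONIC. The standalone sketch below is not part of the patch and uses local stand-in constants; it merely restates that mapping rule.

/*
 * Illustrative sketch only (not kernel code): the clock-selection rule
 * applied by common_nsleep(), with local stand-ins for the kernel's
 * CLOCK_* and TIMER_ABSTIME values.
 */
#include <stdio.h>

enum { FAKE_CLOCK_REALTIME, FAKE_CLOCK_MONOTONIC };
enum { FAKE_TIMER_ABSTIME = 1 };

/* Return the clock an hrtimer-based sleep should actually run on. */
static int effective_clock(int clockid, int flags)
{
        /*
         * Only absolute CLOCK_REALTIME sleeps must follow clock_settime();
         * relative ones are re-based onto the monotonic clock.
         */
        if (clockid == FAKE_CLOCK_REALTIME && !(flags & FAKE_TIMER_ABSTIME))
                return FAKE_CLOCK_MONOTONIC;
        return clockid;
}

int main(void)
{
        printf("relative REALTIME sleep runs on clock %d (monotonic)\n",
               effective_clock(FAKE_CLOCK_REALTIME, 0));
        printf("absolute REALTIME sleep stays on clock %d (realtime)\n",
               effective_clock(FAKE_CLOCK_REALTIME, FAKE_TIMER_ABSTIME));
        return 0;
}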
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 027322a564f4..e24446f8d8cd 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
| @@ -24,10 +24,11 @@ | |||
| 24 | 24 | ||
| 25 | extern suspend_disk_method_t pm_disk_mode; | 25 | extern suspend_disk_method_t pm_disk_mode; |
| 26 | 26 | ||
| 27 | extern int swsusp_shrink_memory(void); | ||
| 27 | extern int swsusp_suspend(void); | 28 | extern int swsusp_suspend(void); |
| 28 | extern int swsusp_write(void); | 29 | extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); |
| 29 | extern int swsusp_check(void); | 30 | extern int swsusp_check(void); |
| 30 | extern int swsusp_read(void); | 31 | extern int swsusp_read(struct pbe **pblist_ptr); |
| 31 | extern void swsusp_close(void); | 32 | extern void swsusp_close(void); |
| 32 | extern int swsusp_resume(void); | 33 | extern int swsusp_resume(void); |
| 33 | 34 | ||
| @@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode) | |||
| 73 | static int in_suspend __nosavedata = 0; | 74 | static int in_suspend __nosavedata = 0; |
| 74 | 75 | ||
| 75 | 76 | ||
| 76 | /** | ||
| 77 | * free_some_memory - Try to free as much memory as possible | ||
| 78 | * | ||
| 79 | * ... but do not OOM-kill anyone | ||
| 80 | * | ||
| 81 | * Notice: all userland should be stopped at this point, or | ||
| 82 | * livelock is possible. | ||
| 83 | */ | ||
| 84 | |||
| 85 | static void free_some_memory(void) | ||
| 86 | { | ||
| 87 | unsigned int i = 0; | ||
| 88 | unsigned int tmp; | ||
| 89 | unsigned long pages = 0; | ||
| 90 | char *p = "-\\|/"; | ||
| 91 | |||
| 92 | printk("Freeing memory... "); | ||
| 93 | while ((tmp = shrink_all_memory(10000))) { | ||
| 94 | pages += tmp; | ||
| 95 | printk("\b%c", p[i++ % 4]); | ||
| 96 | } | ||
| 97 | printk("\bdone (%li pages freed)\n", pages); | ||
| 98 | } | ||
| 99 | |||
| 100 | |||
| 101 | static inline void platform_finish(void) | 77 | static inline void platform_finish(void) |
| 102 | { | 78 | { |
| 103 | if (pm_disk_mode == PM_DISK_PLATFORM) { | 79 | if (pm_disk_mode == PM_DISK_PLATFORM) { |
| @@ -127,8 +103,8 @@ static int prepare_processes(void) | |||
| 127 | } | 103 | } |
| 128 | 104 | ||
| 129 | /* Free memory before shutting down devices. */ | 105 | /* Free memory before shutting down devices. */ |
| 130 | free_some_memory(); | 106 | if (!(error = swsusp_shrink_memory())) |
| 131 | return 0; | 107 | return 0; |
| 132 | thaw: | 108 | thaw: |
| 133 | thaw_processes(); | 109 | thaw_processes(); |
| 134 | enable_nonboot_cpus(); | 110 | enable_nonboot_cpus(); |
| @@ -176,7 +152,7 @@ int pm_suspend_disk(void) | |||
| 176 | if (in_suspend) { | 152 | if (in_suspend) { |
| 177 | device_resume(); | 153 | device_resume(); |
| 178 | pr_debug("PM: writing image.\n"); | 154 | pr_debug("PM: writing image.\n"); |
| 179 | error = swsusp_write(); | 155 | error = swsusp_write(pagedir_nosave, nr_copy_pages); |
| 180 | if (!error) | 156 | if (!error) |
| 181 | power_down(pm_disk_mode); | 157 | power_down(pm_disk_mode); |
| 182 | else { | 158 | else { |
| @@ -247,7 +223,7 @@ static int software_resume(void) | |||
| 247 | 223 | ||
| 248 | pr_debug("PM: Reading swsusp image.\n"); | 224 | pr_debug("PM: Reading swsusp image.\n"); |
| 249 | 225 | ||
| 250 | if ((error = swsusp_read())) { | 226 | if ((error = swsusp_read(&pagedir_nosave))) { |
| 251 | swsusp_free(); | 227 | swsusp_free(); |
| 252 | goto Thaw; | 228 | goto Thaw; |
| 253 | } | 229 | } |
| @@ -363,37 +339,55 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf) | |||
| 363 | MINOR(swsusp_resume_device)); | 339 | MINOR(swsusp_resume_device)); |
| 364 | } | 340 | } |
| 365 | 341 | ||
| 366 | static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) | 342 | static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) |
| 367 | { | 343 | { |
| 368 | int len; | ||
| 369 | char *p; | ||
| 370 | unsigned int maj, min; | 344 | unsigned int maj, min; |
| 371 | int error = -EINVAL; | ||
| 372 | dev_t res; | 345 | dev_t res; |
| 346 | int ret = -EINVAL; | ||
| 373 | 347 | ||
| 374 | p = memchr(buf, '\n', n); | 348 | if (sscanf(buf, "%u:%u", &maj, &min) != 2) |
| 375 | len = p ? p - buf : n; | 349 | goto out; |
| 376 | 350 | ||
| 377 | if (sscanf(buf, "%u:%u", &maj, &min) == 2) { | 351 | res = MKDEV(maj,min); |
| 378 | res = MKDEV(maj,min); | 352 | if (maj != MAJOR(res) || min != MINOR(res)) |
| 379 | if (maj == MAJOR(res) && min == MINOR(res)) { | 353 | goto out; |
| 380 | down(&pm_sem); | ||
| 381 | swsusp_resume_device = res; | ||
| 382 | up(&pm_sem); | ||
| 383 | printk("Attempting manual resume\n"); | ||
| 384 | noresume = 0; | ||
| 385 | software_resume(); | ||
| 386 | } | ||
| 387 | } | ||
| 388 | 354 | ||
| 389 | return error >= 0 ? n : error; | 355 | down(&pm_sem); |
| 356 | swsusp_resume_device = res; | ||
| 357 | up(&pm_sem); | ||
| 358 | printk("Attempting manual resume\n"); | ||
| 359 | noresume = 0; | ||
| 360 | software_resume(); | ||
| 361 | ret = n; | ||
| 362 | out: | ||
| 363 | return ret; | ||
| 390 | } | 364 | } |
| 391 | 365 | ||
| 392 | power_attr(resume); | 366 | power_attr(resume); |
| 393 | 367 | ||
| 368 | static ssize_t image_size_show(struct subsystem * subsys, char *buf) | ||
| 369 | { | ||
| 370 | return sprintf(buf, "%u\n", image_size); | ||
| 371 | } | ||
| 372 | |||
| 373 | static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) | ||
| 374 | { | ||
| 375 | unsigned int size; | ||
| 376 | |||
| 377 | if (sscanf(buf, "%u", &size) == 1) { | ||
| 378 | image_size = size; | ||
| 379 | return n; | ||
| 380 | } | ||
| 381 | |||
| 382 | return -EINVAL; | ||
| 383 | } | ||
| 384 | |||
| 385 | power_attr(image_size); | ||
| 386 | |||
| 394 | static struct attribute * g[] = { | 387 | static struct attribute * g[] = { |
| 395 | &disk_attr.attr, | 388 | &disk_attr.attr, |
| 396 | &resume_attr.attr, | 389 | &resume_attr.attr, |
| 390 | &image_size_attr.attr, | ||
| 397 | NULL, | 391 | NULL, |
| 398 | }; | 392 | }; |
| 399 | 393 | ||
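
Editor's note: the reworked resume_store() accepts only a well-formed "major:minor" string and rejects values that do not survive the MKDEV()/MAJOR()/MINOR() round trip. The userspace sketch below is illustrative only; its macros use a simplified 8-bit device-number split, not the kernel's encoding.

/*
 * Minimal sketch (not the kernel implementation) of the "major:minor"
 * validation performed by resume_store() above.
 */
#include <stdio.h>

#define MKDEV(ma, mi)  (((ma) << 8) | (mi))     /* simplified split, for illustration */
#define MAJOR(dev)     ((unsigned int)(dev) >> 8)
#define MINOR(dev)     ((unsigned int)(dev) & 0xff)

/* Return 0 and fill *dev on success, -1 on malformed or out-of-range input. */
static int parse_resume_device(const char *buf, unsigned int *dev)
{
        unsigned int maj, min, res;

        if (sscanf(buf, "%u:%u", &maj, &min) != 2)
                return -1;
        res = MKDEV(maj, min);
        /* The round-trip check rejects values that overflow the encoding. */
        if (maj != MAJOR(res) || min != MINOR(res))
                return -1;
        *dev = res;
        return 0;
}

int main(void)
{
        unsigned int dev;

        printf("\"8:3\"   -> %d\n", parse_resume_device("8:3", &dev));   /* accepted */
        printf("\"8-3\"   -> %d\n", parse_resume_device("8-3", &dev));   /* rejected */
        printf("\"8:999\" -> %d\n", parse_resume_device("8:999", &dev)); /* rejected */
        return 0;
}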
diff --git a/kernel/power/power.h b/kernel/power/power.h index 6c042b5ee14b..7e8492fd1423 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -9,19 +9,13 @@ | |||
| 9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | 9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) |
| 10 | #endif | 10 | #endif |
| 11 | 11 | ||
| 12 | #define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ | ||
| 13 | - 4 - 3*sizeof(unsigned long) - sizeof(int) \ | ||
| 14 | - sizeof(void *)) / sizeof(swp_entry_t)) | ||
| 15 | |||
| 16 | struct swsusp_info { | 12 | struct swsusp_info { |
| 17 | struct new_utsname uts; | 13 | struct new_utsname uts; |
| 18 | u32 version_code; | 14 | u32 version_code; |
| 19 | unsigned long num_physpages; | 15 | unsigned long num_physpages; |
| 20 | int cpus; | 16 | int cpus; |
| 21 | unsigned long image_pages; | 17 | unsigned long image_pages; |
| 22 | unsigned long pagedir_pages; | 18 | unsigned long pages; |
| 23 | suspend_pagedir_t * suspend_pagedir; | ||
| 24 | swp_entry_t pagedir[MAX_PBES]; | ||
| 25 | } __attribute__((aligned(PAGE_SIZE))); | 19 | } __attribute__((aligned(PAGE_SIZE))); |
| 26 | 20 | ||
| 27 | 21 | ||
| @@ -48,25 +42,27 @@ static struct subsys_attribute _name##_attr = { \ | |||
| 48 | 42 | ||
| 49 | extern struct subsystem power_subsys; | 43 | extern struct subsystem power_subsys; |
| 50 | 44 | ||
| 51 | extern int freeze_processes(void); | ||
| 52 | extern void thaw_processes(void); | ||
| 53 | |||
| 54 | extern int pm_prepare_console(void); | 45 | extern int pm_prepare_console(void); |
| 55 | extern void pm_restore_console(void); | 46 | extern void pm_restore_console(void); |
| 56 | 47 | ||
| 57 | |||
| 58 | /* References to section boundaries */ | 48 | /* References to section boundaries */ |
| 59 | extern const void __nosave_begin, __nosave_end; | 49 | extern const void __nosave_begin, __nosave_end; |
| 60 | 50 | ||
| 61 | extern unsigned int nr_copy_pages; | 51 | extern unsigned int nr_copy_pages; |
| 62 | extern suspend_pagedir_t *pagedir_nosave; | 52 | extern struct pbe *pagedir_nosave; |
| 63 | extern suspend_pagedir_t *pagedir_save; | 53 | |
| 54 | /* Preferred image size in MB (default 500) */ | ||
| 55 | extern unsigned int image_size; | ||
| 64 | 56 | ||
| 65 | extern asmlinkage int swsusp_arch_suspend(void); | 57 | extern asmlinkage int swsusp_arch_suspend(void); |
| 66 | extern asmlinkage int swsusp_arch_resume(void); | 58 | extern asmlinkage int swsusp_arch_resume(void); |
| 67 | 59 | ||
| 60 | extern unsigned int count_data_pages(void); | ||
| 68 | extern void free_pagedir(struct pbe *pblist); | 61 | extern void free_pagedir(struct pbe *pblist); |
| 62 | extern void release_eaten_pages(void); | ||
| 69 | extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); | 63 | extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); |
| 70 | extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); | ||
| 71 | extern void swsusp_free(void); | 64 | extern void swsusp_free(void); |
| 72 | extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); | 65 | extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); |
| 66 | extern unsigned int snapshot_nr_pages(void); | ||
| 67 | extern struct pbe *snapshot_pblist(void); | ||
| 68 | extern void snapshot_pblist_set(struct pbe *pblist); | ||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4a6dbcefd378..41f66365f0d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -33,7 +33,35 @@ | |||
| 33 | 33 | ||
| 34 | #include "power.h" | 34 | #include "power.h" |
| 35 | 35 | ||
| 36 | struct pbe *pagedir_nosave; | ||
| 37 | unsigned int nr_copy_pages; | ||
| 38 | |||
| 36 | #ifdef CONFIG_HIGHMEM | 39 | #ifdef CONFIG_HIGHMEM |
| 40 | unsigned int count_highmem_pages(void) | ||
| 41 | { | ||
| 42 | struct zone *zone; | ||
| 43 | unsigned long zone_pfn; | ||
| 44 | unsigned int n = 0; | ||
| 45 | |||
| 46 | for_each_zone (zone) | ||
| 47 | if (is_highmem(zone)) { | ||
| 48 | mark_free_pages(zone); | ||
| 49 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { | ||
| 50 | struct page *page; | ||
| 51 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
| 52 | if (!pfn_valid(pfn)) | ||
| 53 | continue; | ||
| 54 | page = pfn_to_page(pfn); | ||
| 55 | if (PageReserved(page)) | ||
| 56 | continue; | ||
| 57 | if (PageNosaveFree(page)) | ||
| 58 | continue; | ||
| 59 | n++; | ||
| 60 | } | ||
| 61 | } | ||
| 62 | return n; | ||
| 63 | } | ||
| 64 | |||
| 37 | struct highmem_page { | 65 | struct highmem_page { |
| 38 | char *data; | 66 | char *data; |
| 39 | struct page *page; | 67 | struct page *page; |
| @@ -149,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) | |||
| 149 | BUG_ON(PageReserved(page) && PageNosave(page)); | 177 | BUG_ON(PageReserved(page) && PageNosave(page)); |
| 150 | if (PageNosave(page)) | 178 | if (PageNosave(page)) |
| 151 | return 0; | 179 | return 0; |
| 152 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | 180 | if (PageReserved(page) && pfn_is_nosave(pfn)) |
| 153 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
| 154 | return 0; | 181 | return 0; |
| 155 | } | ||
| 156 | if (PageNosaveFree(page)) | 182 | if (PageNosaveFree(page)) |
| 157 | return 0; | 183 | return 0; |
| 158 | 184 | ||
| 159 | return 1; | 185 | return 1; |
| 160 | } | 186 | } |
| 161 | 187 | ||
| 162 | static unsigned count_data_pages(void) | 188 | unsigned int count_data_pages(void) |
| 163 | { | 189 | { |
| 164 | struct zone *zone; | 190 | struct zone *zone; |
| 165 | unsigned long zone_pfn; | 191 | unsigned long zone_pfn; |
| @@ -244,7 +270,7 @@ static inline void fill_pb_page(struct pbe *pbpage) | |||
| 244 | * of memory pages allocated with alloc_pagedir() | 270 | * of memory pages allocated with alloc_pagedir() |
| 245 | */ | 271 | */ |
| 246 | 272 | ||
| 247 | void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) | 273 | static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) |
| 248 | { | 274 | { |
| 249 | struct pbe *pbpage, *p; | 275 | struct pbe *pbpage, *p; |
| 250 | unsigned int num = PBES_PER_PAGE; | 276 | unsigned int num = PBES_PER_PAGE; |
| @@ -261,7 +287,35 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) | |||
| 261 | p->next = p + 1; | 287 | p->next = p + 1; |
| 262 | p->next = NULL; | 288 | p->next = NULL; |
| 263 | } | 289 | } |
| 264 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | 290 | } |
| 291 | |||
| 292 | /** | ||
| 293 | * On resume it is necessary to trace and eventually free the unsafe | ||
| 294 | * pages that have been allocated, because they are needed for I/O | ||
| 295 | * (on x86-64 we likely will "eat" these pages once again while | ||
| 296 | * creating the temporary page translation tables) | ||
| 297 | */ | ||
| 298 | |||
| 299 | struct eaten_page { | ||
| 300 | struct eaten_page *next; | ||
| 301 | char padding[PAGE_SIZE - sizeof(void *)]; | ||
| 302 | }; | ||
| 303 | |||
| 304 | static struct eaten_page *eaten_pages = NULL; | ||
| 305 | |||
| 306 | void release_eaten_pages(void) | ||
| 307 | { | ||
| 308 | struct eaten_page *p, *q; | ||
| 309 | |||
| 310 | p = eaten_pages; | ||
| 311 | while (p) { | ||
| 312 | q = p->next; | ||
| 313 | /* We don't want swsusp_free() to free this page again */ | ||
| 314 | ClearPageNosave(virt_to_page(p)); | ||
| 315 | free_page((unsigned long)p); | ||
| 316 | p = q; | ||
| 317 | } | ||
| 318 | eaten_pages = NULL; | ||
| 265 | } | 319 | } |
| 266 | 320 | ||
| 267 | /** | 321 | /** |
| @@ -282,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | |||
| 282 | if (safe_needed) | 336 | if (safe_needed) |
| 283 | do { | 337 | do { |
| 284 | res = (void *)get_zeroed_page(gfp_mask); | 338 | res = (void *)get_zeroed_page(gfp_mask); |
| 285 | if (res && PageNosaveFree(virt_to_page(res))) | 339 | if (res && PageNosaveFree(virt_to_page(res))) { |
| 286 | /* This is for swsusp_free() */ | 340 | /* This is for swsusp_free() */ |
| 287 | SetPageNosave(virt_to_page(res)); | 341 | SetPageNosave(virt_to_page(res)); |
| 342 | ((struct eaten_page *)res)->next = eaten_pages; | ||
| 343 | eaten_pages = res; | ||
| 344 | } | ||
| 288 | } while (res && PageNosaveFree(virt_to_page(res))); | 345 | } while (res && PageNosaveFree(virt_to_page(res))); |
| 289 | else | 346 | else |
| 290 | res = (void *)get_zeroed_page(gfp_mask); | 347 | res = (void *)get_zeroed_page(gfp_mask); |
| @@ -332,7 +389,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed | |||
| 332 | if (!pbe) { /* get_zeroed_page() failed */ | 389 | if (!pbe) { /* get_zeroed_page() failed */ |
| 333 | free_pagedir(pblist); | 390 | free_pagedir(pblist); |
| 334 | pblist = NULL; | 391 | pblist = NULL; |
| 335 | } | 392 | } else |
| 393 | create_pbe_list(pblist, nr_pages); | ||
| 336 | return pblist; | 394 | return pblist; |
| 337 | } | 395 | } |
| 338 | 396 | ||
| @@ -370,8 +428,14 @@ void swsusp_free(void) | |||
| 370 | 428 | ||
| 371 | static int enough_free_mem(unsigned int nr_pages) | 429 | static int enough_free_mem(unsigned int nr_pages) |
| 372 | { | 430 | { |
| 373 | pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); | 431 | struct zone *zone; |
| 374 | return nr_free_pages() > (nr_pages + PAGES_FOR_IO + | 432 | unsigned int n = 0; |
| 433 | |||
| 434 | for_each_zone (zone) | ||
| 435 | if (!is_highmem(zone)) | ||
| 436 | n += zone->free_pages; | ||
| 437 | pr_debug("swsusp: available memory: %u pages\n", n); | ||
| 438 | return n > (nr_pages + PAGES_FOR_IO + | ||
| 375 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | 439 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); |
| 376 | } | 440 | } |
| 377 | 441 | ||
| @@ -395,7 +459,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages) | |||
| 395 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | 459 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); |
| 396 | return NULL; | 460 | return NULL; |
| 397 | } | 461 | } |
| 398 | create_pbe_list(pblist, nr_pages); | ||
| 399 | 462 | ||
| 400 | if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { | 463 | if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { |
| 401 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | 464 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); |
| @@ -421,10 +484,6 @@ asmlinkage int swsusp_save(void) | |||
| 421 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, | 484 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, |
| 422 | PAGES_FOR_IO, nr_free_pages()); | 485 | PAGES_FOR_IO, nr_free_pages()); |
| 423 | 486 | ||
| 424 | /* This is needed because of the fixed size of swsusp_info */ | ||
| 425 | if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) | ||
| 426 | return -ENOSPC; | ||
| 427 | |||
| 428 | if (!enough_free_mem(nr_pages)) { | 487 | if (!enough_free_mem(nr_pages)) { |
| 429 | printk(KERN_ERR "swsusp: Not enough free memory\n"); | 488 | printk(KERN_ERR "swsusp: Not enough free memory\n"); |
| 430 | return -ENOMEM; | 489 | return -ENOMEM; |
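
Editor's note: the eaten-page bookkeeping added above threads every page that had to be skipped during resume onto an intrusive list stored in the pages themselves, so release_eaten_pages() can hand them all back once the image is in place. A toy version of the pattern, with malloc()/free() standing in for get_zeroed_page()/free_page():

/*
 * Sketch of the "eaten pages" bookkeeping: pages we allocated but cannot
 * hand out are chained through a pointer kept in the page itself and
 * released in one pass later.
 */
#include <stdio.h>
#include <stdlib.h>

#define FAKE_PAGE_SIZE 4096

struct eaten_page {
        struct eaten_page *next;
        char padding[FAKE_PAGE_SIZE - sizeof(void *)];
};

static struct eaten_page *eaten_pages;

/* Record a page that must be kept out of circulation for now. */
static void eat_page(void *page)
{
        struct eaten_page *p = page;

        p->next = eaten_pages;
        eaten_pages = p;
}

static void release_eaten_pages(void)
{
        struct eaten_page *p = eaten_pages, *q;

        while (p) {
                q = p->next;
                free(p);
                p = q;
        }
        eaten_pages = NULL;
}

int main(void)
{
        int i;

        for (i = 0; i < 3; i++)
                eat_page(calloc(1, FAKE_PAGE_SIZE));
        release_eaten_pages();
        printf("3 unsafe pages tracked and released\n");
        return 0;
}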
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c05f46e7348f..55a18d26abed 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
| @@ -30,8 +30,8 @@ | |||
| 30 | * Alex Badea <vampire@go.ro>: | 30 | * Alex Badea <vampire@go.ro>: |
| 31 | * Fixed runaway init | 31 | * Fixed runaway init |
| 32 | * | 32 | * |
| 33 | * Andreas Steinmetz <ast@domdv.de>: | 33 | * Rafael J. Wysocki <rjw@sisk.pl> |
| 34 | * Added encrypted suspend option | 34 | * Added the swap map data structure and reworked the handling of swap |
| 35 | * | 35 | * |
| 36 | * More state savers are welcome. Especially for the scsi layer... | 36 | * More state savers are welcome. Especially for the scsi layer... |
| 37 | * | 37 | * |
| @@ -67,44 +67,33 @@ | |||
| 67 | #include <asm/tlbflush.h> | 67 | #include <asm/tlbflush.h> |
| 68 | #include <asm/io.h> | 68 | #include <asm/io.h> |
| 69 | 69 | ||
| 70 | #include <linux/random.h> | ||
| 71 | #include <linux/crypto.h> | ||
| 72 | #include <asm/scatterlist.h> | ||
| 73 | |||
| 74 | #include "power.h" | 70 | #include "power.h" |
| 75 | 71 | ||
| 72 | /* | ||
| 73 | * Preferred image size in MB (tunable via /sys/power/image_size). | ||
| 74 | * When it is set to N, swsusp will do its best to ensure the image | ||
| 75 | * size will not exceed N MB, but if that is impossible, it will | ||
| 76 | * try to create the smallest image possible. | ||
| 77 | */ | ||
| 78 | unsigned int image_size = 500; | ||
| 79 | |||
| 76 | #ifdef CONFIG_HIGHMEM | 80 | #ifdef CONFIG_HIGHMEM |
| 81 | unsigned int count_highmem_pages(void); | ||
| 77 | int save_highmem(void); | 82 | int save_highmem(void); |
| 78 | int restore_highmem(void); | 83 | int restore_highmem(void); |
| 79 | #else | 84 | #else |
| 80 | static int save_highmem(void) { return 0; } | 85 | static int save_highmem(void) { return 0; } |
| 81 | static int restore_highmem(void) { return 0; } | 86 | static int restore_highmem(void) { return 0; } |
| 87 | static unsigned int count_highmem_pages(void) { return 0; } | ||
| 82 | #endif | 88 | #endif |
| 83 | 89 | ||
| 84 | #define CIPHER "aes" | ||
| 85 | #define MAXKEY 32 | ||
| 86 | #define MAXIV 32 | ||
| 87 | |||
| 88 | extern char resume_file[]; | 90 | extern char resume_file[]; |
| 89 | 91 | ||
| 90 | /* Local variables that should not be affected by save */ | ||
| 91 | unsigned int nr_copy_pages __nosavedata = 0; | ||
| 92 | |||
| 93 | /* Suspend pagedir is allocated before final copy, therefore it | ||
| 94 | must be freed after resume | ||
| 95 | |||
| 96 | Warning: this is even more evil than it seems. Pagedirs this file | ||
| 97 | talks about are completely different from page directories used by | ||
| 98 | MMU hardware. | ||
| 99 | */ | ||
| 100 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; | ||
| 101 | |||
| 102 | #define SWSUSP_SIG "S1SUSPEND" | 92 | #define SWSUSP_SIG "S1SUSPEND" |
| 103 | 93 | ||
| 104 | static struct swsusp_header { | 94 | static struct swsusp_header { |
| 105 | char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; | 95 | char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; |
| 106 | u8 key_iv[MAXKEY+MAXIV]; | 96 | swp_entry_t image; |
| 107 | swp_entry_t swsusp_info; | ||
| 108 | char orig_sig[10]; | 97 | char orig_sig[10]; |
| 109 | char sig[10]; | 98 | char sig[10]; |
| 110 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; | 99 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; |
| @@ -115,140 +104,9 @@ static struct swsusp_info swsusp_info; | |||
| 115 | * Saving part... | 104 | * Saving part... |
| 116 | */ | 105 | */ |
| 117 | 106 | ||
| 118 | /* We memorize in swapfile_used what swap devices are used for suspension */ | 107 | static unsigned short root_swap = 0xffff; |
| 119 | #define SWAPFILE_UNUSED 0 | ||
| 120 | #define SWAPFILE_SUSPEND 1 /* This is the suspending device */ | ||
| 121 | #define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */ | ||
| 122 | |||
| 123 | static unsigned short swapfile_used[MAX_SWAPFILES]; | ||
| 124 | static unsigned short root_swap; | ||
| 125 | |||
| 126 | static int write_page(unsigned long addr, swp_entry_t *loc); | ||
| 127 | static int bio_read_page(pgoff_t page_off, void *page); | ||
| 128 | |||
| 129 | static u8 key_iv[MAXKEY+MAXIV]; | ||
| 130 | |||
| 131 | #ifdef CONFIG_SWSUSP_ENCRYPT | ||
| 132 | |||
| 133 | static int crypto_init(int mode, void **mem) | ||
| 134 | { | ||
| 135 | int error = 0; | ||
| 136 | int len; | ||
| 137 | char *modemsg; | ||
| 138 | struct crypto_tfm *tfm; | ||
| 139 | |||
| 140 | modemsg = mode ? "suspend not possible" : "resume not possible"; | ||
| 141 | |||
| 142 | tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); | ||
| 143 | if(!tfm) { | ||
| 144 | printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); | ||
| 145 | error = -EINVAL; | ||
| 146 | goto out; | ||
| 147 | } | ||
| 148 | |||
| 149 | if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { | ||
| 150 | printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); | ||
| 151 | error = -ENOKEY; | ||
| 152 | goto fail; | ||
| 153 | } | ||
| 154 | |||
| 155 | if (mode) | ||
| 156 | get_random_bytes(key_iv, MAXKEY+MAXIV); | ||
| 157 | |||
| 158 | len = crypto_tfm_alg_max_keysize(tfm); | ||
| 159 | if (len > MAXKEY) | ||
| 160 | len = MAXKEY; | ||
| 161 | |||
| 162 | if (crypto_cipher_setkey(tfm, key_iv, len)) { | ||
| 163 | printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); | ||
| 164 | error = -EKEYREJECTED; | ||
| 165 | goto fail; | ||
| 166 | } | ||
| 167 | |||
| 168 | len = crypto_tfm_alg_ivsize(tfm); | ||
| 169 | |||
| 170 | if (MAXIV < len) { | ||
| 171 | printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); | ||
| 172 | error = -EOVERFLOW; | ||
| 173 | goto fail; | ||
| 174 | } | ||
| 175 | |||
| 176 | crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); | ||
| 177 | |||
| 178 | *mem=(void *)tfm; | ||
| 179 | |||
| 180 | goto out; | ||
| 181 | |||
| 182 | fail: crypto_free_tfm(tfm); | ||
| 183 | out: return error; | ||
| 184 | } | ||
| 185 | |||
| 186 | static __inline__ void crypto_exit(void *mem) | ||
| 187 | { | ||
| 188 | crypto_free_tfm((struct crypto_tfm *)mem); | ||
| 189 | } | ||
| 190 | |||
| 191 | static __inline__ int crypto_write(struct pbe *p, void *mem) | ||
| 192 | { | ||
| 193 | int error = 0; | ||
| 194 | struct scatterlist src, dst; | ||
| 195 | |||
| 196 | src.page = virt_to_page(p->address); | ||
| 197 | src.offset = 0; | ||
| 198 | src.length = PAGE_SIZE; | ||
| 199 | dst.page = virt_to_page((void *)&swsusp_header); | ||
| 200 | dst.offset = 0; | ||
| 201 | dst.length = PAGE_SIZE; | ||
| 202 | |||
| 203 | error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, | ||
| 204 | PAGE_SIZE); | ||
| 205 | |||
| 206 | if (!error) | ||
| 207 | error = write_page((unsigned long)&swsusp_header, | ||
| 208 | &(p->swap_address)); | ||
| 209 | return error; | ||
| 210 | } | ||
| 211 | |||
| 212 | static __inline__ int crypto_read(struct pbe *p, void *mem) | ||
| 213 | { | ||
| 214 | int error = 0; | ||
| 215 | struct scatterlist src, dst; | ||
| 216 | |||
| 217 | error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); | ||
| 218 | if (!error) { | ||
| 219 | src.offset = 0; | ||
| 220 | src.length = PAGE_SIZE; | ||
| 221 | dst.offset = 0; | ||
| 222 | dst.length = PAGE_SIZE; | ||
| 223 | src.page = dst.page = virt_to_page((void *)p->address); | ||
| 224 | |||
| 225 | error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, | ||
| 226 | &src, PAGE_SIZE); | ||
| 227 | } | ||
| 228 | return error; | ||
| 229 | } | ||
| 230 | #else | ||
| 231 | static __inline__ int crypto_init(int mode, void *mem) | ||
| 232 | { | ||
| 233 | return 0; | ||
| 234 | } | ||
| 235 | |||
| 236 | static __inline__ void crypto_exit(void *mem) | ||
| 237 | { | ||
| 238 | } | ||
| 239 | |||
| 240 | static __inline__ int crypto_write(struct pbe *p, void *mem) | ||
| 241 | { | ||
| 242 | return write_page(p->address, &(p->swap_address)); | ||
| 243 | } | ||
| 244 | 108 | ||
| 245 | static __inline__ int crypto_read(struct pbe *p, void *mem) | 109 | static int mark_swapfiles(swp_entry_t start) |
| 246 | { | ||
| 247 | return bio_read_page(swp_offset(p->swap_address), (void *)p->address); | ||
| 248 | } | ||
| 249 | #endif | ||
| 250 | |||
| 251 | static int mark_swapfiles(swp_entry_t prev) | ||
| 252 | { | 110 | { |
| 253 | int error; | 111 | int error; |
| 254 | 112 | ||
| @@ -259,8 +117,7 @@ static int mark_swapfiles(swp_entry_t prev) | |||
| 259 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | 117 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { |
| 260 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | 118 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); |
| 261 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | 119 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); |
| 262 | memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); | 120 | swsusp_header.image = start; |
| 263 | swsusp_header.swsusp_info = prev; | ||
| 264 | error = rw_swap_page_sync(WRITE, | 121 | error = rw_swap_page_sync(WRITE, |
| 265 | swp_entry(root_swap, 0), | 122 | swp_entry(root_swap, 0), |
| 266 | virt_to_page((unsigned long) | 123 | virt_to_page((unsigned long) |
| @@ -283,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev) | |||
| 283 | * devfs, since the resume code can only recognize the form /dev/hda4, | 140 | * devfs, since the resume code can only recognize the form /dev/hda4, |
| 284 | * but the suspend code would see the long name.) | 141 | * but the suspend code would see the long name.) |
| 285 | */ | 142 | */ |
| 286 | static int is_resume_device(const struct swap_info_struct *swap_info) | 143 | static inline int is_resume_device(const struct swap_info_struct *swap_info) |
| 287 | { | 144 | { |
| 288 | struct file *file = swap_info->swap_file; | 145 | struct file *file = swap_info->swap_file; |
| 289 | struct inode *inode = file->f_dentry->d_inode; | 146 | struct inode *inode = file->f_dentry->d_inode; |
| @@ -294,54 +151,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info) | |||
| 294 | 151 | ||
| 295 | static int swsusp_swap_check(void) /* This is called before saving image */ | 152 | static int swsusp_swap_check(void) /* This is called before saving image */ |
| 296 | { | 153 | { |
| 297 | int i, len; | ||
| 298 | |||
| 299 | len=strlen(resume_file); | ||
| 300 | root_swap = 0xFFFF; | ||
| 301 | |||
| 302 | spin_lock(&swap_lock); | ||
| 303 | for (i=0; i<MAX_SWAPFILES; i++) { | ||
| 304 | if (!(swap_info[i].flags & SWP_WRITEOK)) { | ||
| 305 | swapfile_used[i]=SWAPFILE_UNUSED; | ||
| 306 | } else { | ||
| 307 | if (!len) { | ||
| 308 | printk(KERN_WARNING "resume= option should be used to set suspend device" ); | ||
| 309 | if (root_swap == 0xFFFF) { | ||
| 310 | swapfile_used[i] = SWAPFILE_SUSPEND; | ||
| 311 | root_swap = i; | ||
| 312 | } else | ||
| 313 | swapfile_used[i] = SWAPFILE_IGNORED; | ||
| 314 | } else { | ||
| 315 | /* we ignore all swap devices that are not the resume_file */ | ||
| 316 | if (is_resume_device(&swap_info[i])) { | ||
| 317 | swapfile_used[i] = SWAPFILE_SUSPEND; | ||
| 318 | root_swap = i; | ||
| 319 | } else { | ||
| 320 | swapfile_used[i] = SWAPFILE_IGNORED; | ||
| 321 | } | ||
| 322 | } | ||
| 323 | } | ||
| 324 | } | ||
| 325 | spin_unlock(&swap_lock); | ||
| 326 | return (root_swap != 0xffff) ? 0 : -ENODEV; | ||
| 327 | } | ||
| 328 | |||
| 329 | /** | ||
| 330 | * This is called after saving image so modification | ||
| 331 | * will be lost after resume... and that's what we want. | ||
| 332 | * we make the device unusable. A new call to | ||
| 333 | * lock_swapdevices can unlock the devices. | ||
| 334 | */ | ||
| 335 | static void lock_swapdevices(void) | ||
| 336 | { | ||
| 337 | int i; | 154 | int i; |
| 338 | 155 | ||
| 156 | if (!swsusp_resume_device) | ||
| 157 | return -ENODEV; | ||
| 339 | spin_lock(&swap_lock); | 158 | spin_lock(&swap_lock); |
| 340 | for (i = 0; i< MAX_SWAPFILES; i++) | 159 | for (i = 0; i < MAX_SWAPFILES; i++) { |
| 341 | if (swapfile_used[i] == SWAPFILE_IGNORED) { | 160 | if (!(swap_info[i].flags & SWP_WRITEOK)) |
| 342 | swap_info[i].flags ^= SWP_WRITEOK; | 161 | continue; |
| 162 | if (is_resume_device(swap_info + i)) { | ||
| 163 | spin_unlock(&swap_lock); | ||
| 164 | root_swap = i; | ||
| 165 | return 0; | ||
| 343 | } | 166 | } |
| 167 | } | ||
| 344 | spin_unlock(&swap_lock); | 168 | spin_unlock(&swap_lock); |
| 169 | return -ENODEV; | ||
| 345 | } | 170 | } |
| 346 | 171 | ||
| 347 | /** | 172 | /** |
| @@ -359,72 +184,217 @@ static void lock_swapdevices(void) | |||
| 359 | static int write_page(unsigned long addr, swp_entry_t *loc) | 184 | static int write_page(unsigned long addr, swp_entry_t *loc) |
| 360 | { | 185 | { |
| 361 | swp_entry_t entry; | 186 | swp_entry_t entry; |
| 362 | int error = 0; | 187 | int error = -ENOSPC; |
| 363 | 188 | ||
| 364 | entry = get_swap_page(); | 189 | entry = get_swap_page_of_type(root_swap); |
| 365 | if (swp_offset(entry) && | 190 | if (swp_offset(entry)) { |
| 366 | swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { | 191 | error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); |
| 367 | error = rw_swap_page_sync(WRITE, entry, | 192 | if (!error || error == -EIO) |
| 368 | virt_to_page(addr)); | ||
| 369 | if (error == -EIO) | ||
| 370 | error = 0; | ||
| 371 | if (!error) | ||
| 372 | *loc = entry; | 193 | *loc = entry; |
| 373 | } else | 194 | } |
| 374 | error = -ENOSPC; | ||
| 375 | return error; | 195 | return error; |
| 376 | } | 196 | } |
| 377 | 197 | ||
| 378 | /** | 198 | /** |
| 379 | * data_free - Free the swap entries used by the saved image. | 199 | * Swap map-handling functions |
| 200 | * | ||
| 201 | * The swap map is a data structure used for keeping track of each page | ||
| 202 | * written to the swap. It consists of many swap_map_page structures | ||
| 203 | * that contain each an array of MAP_PAGE_SIZE swap entries. | ||
| 204 | * These structures are linked together with the help of either the | ||
| 205 | * .next (in memory) or the .next_swap (in swap) member. | ||
| 380 | * | 206 | * |
| 381 | * Walk the list of used swap entries and free each one. | 207 | * The swap map is created during suspend. At that time we need to keep |
| 382 | * This is only used for cleanup when suspend fails. | 208 | * it in memory, because we have to free all of the allocated swap |
| 209 | * entries if an error occurs. The memory needed is preallocated | ||
| 210 | * so that we know in advance if there's enough of it. | ||
| 211 | * | ||
| 212 | * The first swap_map_page structure is filled with the swap entries that | ||
| 213 | * correspond to the first MAP_PAGE_SIZE data pages written to swap and | ||
| 214 | * so on. After the all of the data pages have been written, the order | ||
| 215 | * of the swap_map_page structures in the map is reversed so that they | ||
| 216 | * can be read from swap in the original order. This causes the data | ||
| 217 | * pages to be loaded in exactly the same order in which they have been | ||
| 218 | * saved. | ||
| 219 | * | ||
| 220 | * During resume we only need to use one swap_map_page structure | ||
| 221 | * at a time, which means that we only need to use two memory pages for | ||
| 222 | * reading the image - one for reading the swap_map_page structures | ||
| 223 | * and the second for reading the data pages from swap. | ||
| 383 | */ | 224 | */ |
| 384 | static void data_free(void) | 225 | |
| 226 | #define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ | ||
| 227 | / sizeof(swp_entry_t)) | ||
| 228 | |||
| 229 | struct swap_map_page { | ||
| 230 | swp_entry_t entries[MAP_PAGE_SIZE]; | ||
| 231 | swp_entry_t next_swap; | ||
| 232 | struct swap_map_page *next; | ||
| 233 | }; | ||
| 234 | |||
| 235 | static inline void free_swap_map(struct swap_map_page *swap_map) | ||
| 385 | { | 236 | { |
| 386 | swp_entry_t entry; | 237 | struct swap_map_page *swp; |
| 387 | struct pbe *p; | ||
| 388 | 238 | ||
| 389 | for_each_pbe (p, pagedir_nosave) { | 239 | while (swap_map) { |
| 390 | entry = p->swap_address; | 240 | swp = swap_map->next; |
| 391 | if (entry.val) | 241 | free_page((unsigned long)swap_map); |
| 392 | swap_free(entry); | 242 | swap_map = swp; |
| 393 | else | ||
| 394 | break; | ||
| 395 | } | 243 | } |
| 396 | } | 244 | } |
| 397 | 245 | ||
| 246 | static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) | ||
| 247 | { | ||
| 248 | struct swap_map_page *swap_map, *swp; | ||
| 249 | unsigned n = 0; | ||
| 250 | |||
| 251 | if (!nr_pages) | ||
| 252 | return NULL; | ||
| 253 | |||
| 254 | pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); | ||
| 255 | swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | ||
| 256 | swp = swap_map; | ||
| 257 | for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { | ||
| 258 | swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | ||
| 259 | swp = swp->next; | ||
| 260 | if (!swp) { | ||
| 261 | free_swap_map(swap_map); | ||
| 262 | return NULL; | ||
| 263 | } | ||
| 264 | } | ||
| 265 | return swap_map; | ||
| 266 | } | ||
| 267 | |||
| 398 | /** | 268 | /** |
| 399 | * data_write - Write saved image to swap. | 269 | * reverse_swap_map - reverse the order of pages in the swap map |
| 400 | * | 270 | * @swap_map |
| 401 | * Walk the list of pages in the image and sync each one to swap. | ||
| 402 | */ | 271 | */ |
| 403 | static int data_write(void) | 272 | |
| 273 | static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) | ||
| 404 | { | 274 | { |
| 405 | int error = 0, i = 0; | 275 | struct swap_map_page *prev, *next; |
| 406 | unsigned int mod = nr_copy_pages / 100; | 276 | |
| 407 | struct pbe *p; | 277 | prev = NULL; |
| 408 | void *tfm; | 278 | while (swap_map) { |
| 279 | next = swap_map->next; | ||
| 280 | swap_map->next = prev; | ||
| 281 | prev = swap_map; | ||
| 282 | swap_map = next; | ||
| 283 | } | ||
| 284 | return prev; | ||
| 285 | } | ||
| 409 | 286 | ||
| 410 | if ((error = crypto_init(1, &tfm))) | 287 | /** |
| 411 | return error; | 288 | * free_swap_map_entries - free the swap entries allocated to store |
| 289 | * the swap map @swap_map (this is only called in case of an error) | ||
| 290 | */ | ||
| 291 | static inline void free_swap_map_entries(struct swap_map_page *swap_map) | ||
| 292 | { | ||
| 293 | while (swap_map) { | ||
| 294 | if (swap_map->next_swap.val) | ||
| 295 | swap_free(swap_map->next_swap); | ||
| 296 | swap_map = swap_map->next; | ||
| 297 | } | ||
| 298 | } | ||
| 412 | 299 | ||
| 413 | if (!mod) | 300 | /** |
| 414 | mod = 1; | 301 | * save_swap_map - save the swap map used for tracing the data pages |
| 302 | * stored in the swap | ||
| 303 | */ | ||
| 415 | 304 | ||
| 416 | printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); | 305 | static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) |
| 417 | for_each_pbe (p, pagedir_nosave) { | 306 | { |
| 418 | if (!(i%mod)) | 307 | swp_entry_t entry = (swp_entry_t){0}; |
| 419 | printk( "\b\b\b\b%3d%%", i / mod ); | 308 | int error; |
| 420 | if ((error = crypto_write(p, tfm))) { | 309 | |
| 421 | crypto_exit(tfm); | 310 | while (swap_map) { |
| 311 | swap_map->next_swap = entry; | ||
| 312 | if ((error = write_page((unsigned long)swap_map, &entry))) | ||
| 422 | return error; | 313 | return error; |
| 423 | } | 314 | swap_map = swap_map->next; |
| 424 | i++; | ||
| 425 | } | 315 | } |
| 426 | printk("\b\b\b\bdone\n"); | 316 | *start = entry; |
| 427 | crypto_exit(tfm); | 317 | return 0; |
| 318 | } | ||
| 319 | |||
| 320 | /** | ||
| 321 | * free_image_entries - free the swap entries allocated to store | ||
| 322 | * the image data pages (this is only called in case of an error) | ||
| 323 | */ | ||
| 324 | |||
| 325 | static inline void free_image_entries(struct swap_map_page *swp) | ||
| 326 | { | ||
| 327 | unsigned k; | ||
| 328 | |||
| 329 | while (swp) { | ||
| 330 | for (k = 0; k < MAP_PAGE_SIZE; k++) | ||
| 331 | if (swp->entries[k].val) | ||
| 332 | swap_free(swp->entries[k]); | ||
| 333 | swp = swp->next; | ||
| 334 | } | ||
| 335 | } | ||
| 336 | |||
| 337 | /** | ||
| 338 | * The swap_map_handle structure is used for handling the swap map in | ||
| 339 | * a file-alike way | ||
| 340 | */ | ||
| 341 | |||
| 342 | struct swap_map_handle { | ||
| 343 | struct swap_map_page *cur; | ||
| 344 | unsigned int k; | ||
| 345 | }; | ||
| 346 | |||
| 347 | static inline void init_swap_map_handle(struct swap_map_handle *handle, | ||
| 348 | struct swap_map_page *map) | ||
| 349 | { | ||
| 350 | handle->cur = map; | ||
| 351 | handle->k = 0; | ||
| 352 | } | ||
| 353 | |||
| 354 | static inline int swap_map_write_page(struct swap_map_handle *handle, | ||
| 355 | unsigned long addr) | ||
| 356 | { | ||
| 357 | int error; | ||
| 358 | |||
| 359 | error = write_page(addr, handle->cur->entries + handle->k); | ||
| 360 | if (error) | ||
| 361 | return error; | ||
| 362 | if (++handle->k >= MAP_PAGE_SIZE) { | ||
| 363 | handle->cur = handle->cur->next; | ||
| 364 | handle->k = 0; | ||
| 365 | } | ||
| 366 | return 0; | ||
| 367 | } | ||
| 368 | |||
| 369 | /** | ||
| 370 | * save_image_data - save the data pages pointed to by the PBEs | ||
| 371 | * from the list @pblist using the swap map handle @handle | ||
| 372 | * (assume there are @nr_pages data pages to save) | ||
| 373 | */ | ||
| 374 | |||
| 375 | static int save_image_data(struct pbe *pblist, | ||
| 376 | struct swap_map_handle *handle, | ||
| 377 | unsigned int nr_pages) | ||
| 378 | { | ||
| 379 | unsigned int m; | ||
| 380 | struct pbe *p; | ||
| 381 | int error = 0; | ||
| 382 | |||
| 383 | printk("Saving image data pages (%u pages) ... ", nr_pages); | ||
| 384 | m = nr_pages / 100; | ||
| 385 | if (!m) | ||
| 386 | m = 1; | ||
| 387 | nr_pages = 0; | ||
| 388 | for_each_pbe (p, pblist) { | ||
| 389 | error = swap_map_write_page(handle, p->address); | ||
| 390 | if (error) | ||
| 391 | break; | ||
| 392 | if (!(nr_pages % m)) | ||
| 393 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
| 394 | nr_pages++; | ||
| 395 | } | ||
| 396 | if (!error) | ||
| 397 | printk("\b\b\b\bdone\n"); | ||
| 428 | return error; | 398 | return error; |
| 429 | } | 399 | } |
| 430 | 400 | ||
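
Editor's note: the swap map written by save_swap_map() is effectively a singly linked list living in swap. Each map page records the entry of the previously written one, and the entry handed back in *start (the last page written) is what mark_swapfiles() stores in the header. Reading the chain therefore reverses the write order, which is why reverse_swap_map() flips the in-memory list first. A toy model, with an array standing in for swap and indices for swap entries (not kernel code):

/*
 * Toy model of the in-swap chaining set up by save_swap_map(): write the
 * map pages, each pointing at the previously written one, then follow the
 * chain from the "header" entry and observe that it comes back newest
 * first - the reason the in-memory list is reversed before saving.
 */
#include <stdio.h>

#define NSLOTS 16

struct toy_map_page {
        int next_swap;          /* "swap entry" of the next map page to read */
        int payload;            /* stands in for the entries[] array */
};

static struct toy_map_page swap_area[NSLOTS];
static int next_free = 1;       /* slot 0 is reserved, like offset 0 */

/* Write one map page to "swap" and report where it landed. */
static int toy_write_page(struct toy_map_page page)
{
        int slot = next_free++;

        swap_area[slot] = page;
        return slot;
}

int main(void)
{
        int start = 0;          /* 0 plays the role of (swp_entry_t){0} */
        int i, slot;

        /* Suspend side: chain three map pages, each pointing at the last. */
        for (i = 0; i < 3; i++) {
                struct toy_map_page p = { .next_swap = start, .payload = i };
                start = toy_write_page(p);
        }

        /* Resume side: follow the chain from the header entry. */
        for (slot = start; slot; slot = swap_area[slot].next_swap)
                printf("read map page with payload %d\n",
                       swap_area[slot].payload);
        return 0;
}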
| @@ -440,70 +410,70 @@ static void dump_info(void) | |||
| 440 | pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); | 410 | pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); |
| 441 | pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); | 411 | pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); |
| 442 | pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); | 412 | pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); |
| 443 | pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); | 413 | pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); |
| 444 | } | 414 | } |
| 445 | 415 | ||
| 446 | static void init_header(void) | 416 | static void init_header(unsigned int nr_pages) |
| 447 | { | 417 | { |
| 448 | memset(&swsusp_info, 0, sizeof(swsusp_info)); | 418 | memset(&swsusp_info, 0, sizeof(swsusp_info)); |
| 449 | swsusp_info.version_code = LINUX_VERSION_CODE; | 419 | swsusp_info.version_code = LINUX_VERSION_CODE; |
| 450 | swsusp_info.num_physpages = num_physpages; | 420 | swsusp_info.num_physpages = num_physpages; |
| 451 | memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); | 421 | memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); |
| 452 | 422 | ||
| 453 | swsusp_info.suspend_pagedir = pagedir_nosave; | ||
| 454 | swsusp_info.cpus = num_online_cpus(); | 423 | swsusp_info.cpus = num_online_cpus(); |
| 455 | swsusp_info.image_pages = nr_copy_pages; | 424 | swsusp_info.image_pages = nr_pages; |
| 456 | } | 425 | swsusp_info.pages = nr_pages + |
| 457 | 426 | ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; | |
| 458 | static int close_swap(void) | ||
| 459 | { | ||
| 460 | swp_entry_t entry; | ||
| 461 | int error; | ||
| 462 | |||
| 463 | dump_info(); | ||
| 464 | error = write_page((unsigned long)&swsusp_info, &entry); | ||
| 465 | if (!error) { | ||
| 466 | printk( "S" ); | ||
| 467 | error = mark_swapfiles(entry); | ||
| 468 | printk( "|\n" ); | ||
| 469 | } | ||
| 470 | return error; | ||
| 471 | } | 427 | } |
| 472 | 428 | ||
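
Editor's note: the new .pages field counts everything the image occupies on swap apart from the map itself: the data pages, the metadata pages holding one unsigned long per data page, and one page for the header. A quick back-of-the-envelope check, assuming 4096-byte pages and 8-byte longs (512 addresses per metadata page):

/* Sanity check of the .pages computation in init_header(), assumed sizes. */
#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096, sizeof_long = 8;
        unsigned long nr_pages = 20000;
        unsigned long meta  = (nr_pages * sizeof_long + page_size - 1) / page_size;
        unsigned long total = nr_pages + meta + 1;      /* +1 for the header page */

        printf("%lu data + %lu metadata + 1 header = %lu pages\n",
               nr_pages, meta, total);
        return 0;
}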
| 473 | /** | 429 | /** |
| 474 | * free_pagedir_entries - Free pages used by the page directory. | 430 | * pack_orig_addresses - the .orig_address fields of the PBEs from the |
| 475 | * | 431 | * list starting at @pbe are stored in the array @buf[] (1 page) |
| 476 | * This is used during suspend for error recovery. | ||
| 477 | */ | 432 | */ |
| 478 | 433 | ||
| 479 | static void free_pagedir_entries(void) | 434 | static inline struct pbe *pack_orig_addresses(unsigned long *buf, |
| 435 | struct pbe *pbe) | ||
| 480 | { | 436 | { |
| 481 | int i; | 437 | int j; |
| 482 | 438 | ||
| 483 | for (i = 0; i < swsusp_info.pagedir_pages; i++) | 439 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { |
| 484 | swap_free(swsusp_info.pagedir[i]); | 440 | buf[j] = pbe->orig_address; |
| 441 | pbe = pbe->next; | ||
| 442 | } | ||
| 443 | if (!pbe) | ||
| 444 | for (; j < PAGE_SIZE / sizeof(long); j++) | ||
| 445 | buf[j] = 0; | ||
| 446 | return pbe; | ||
| 485 | } | 447 | } |
| 486 | 448 | ||
| 487 | |||
| 488 | /** | 449 | /** |
| 489 | * write_pagedir - Write the array of pages holding the page directory. | 450 | * save_image_metadata - save the .orig_address fields of the PBEs |
| 490 | * @last: Last swap entry we write (needed for header). | 451 | * from the list @pblist using the swap map handle @handle |
| 491 | */ | 452 | */ |
| 492 | 453 | ||
| 493 | static int write_pagedir(void) | 454 | static int save_image_metadata(struct pbe *pblist, |
| 455 | struct swap_map_handle *handle) | ||
| 494 | { | 456 | { |
| 495 | int error = 0; | 457 | unsigned long *buf; |
| 496 | unsigned int n = 0; | 458 | unsigned int n = 0; |
| 497 | struct pbe *pbe; | 459 | struct pbe *p; |
| 460 | int error = 0; | ||
| 498 | 461 | ||
| 499 | printk( "Writing pagedir..."); | 462 | printk("Saving image metadata ... "); |
| 500 | for_each_pb_page (pbe, pagedir_nosave) { | 463 | buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); |
| 501 | if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) | 464 | if (!buf) |
| 502 | return error; | 465 | return -ENOMEM; |
| 466 | p = pblist; | ||
| 467 | while (p) { | ||
| 468 | p = pack_orig_addresses(buf, p); | ||
| 469 | error = swap_map_write_page(handle, (unsigned long)buf); | ||
| 470 | if (error) | ||
| 471 | break; | ||
| 472 | n++; | ||
| 503 | } | 473 | } |
| 504 | 474 | free_page((unsigned long)buf); | |
| 505 | swsusp_info.pagedir_pages = n; | 475 | if (!error) |
| 506 | printk("done (%u pages)\n", n); | 476 | printk("done (%u pages saved)\n", n); |
| 507 | return error; | 477 | return error; |
| 508 | } | 478 | } |
| 509 | 479 | ||
| @@ -511,75 +481,125 @@ static int write_pagedir(void) | |||
| 511 | * enough_swap - Make sure we have enough swap to save the image. | 481 | * enough_swap - Make sure we have enough swap to save the image. |
| 512 | * | 482 | * |
| 513 | * Returns TRUE or FALSE after checking the total amount of swap | 483 | * Returns TRUE or FALSE after checking the total amount of swap |
| 514 | * space available. | 484 | * space available from the resume partition. |
| 515 | * | ||
| 516 | * FIXME: si_swapinfo(&i) returns all swap devices information. | ||
| 517 | * We should only consider resume_device. | ||
| 518 | */ | 485 | */ |
| 519 | 486 | ||
| 520 | static int enough_swap(unsigned int nr_pages) | 487 | static int enough_swap(unsigned int nr_pages) |
| 521 | { | 488 | { |
| 522 | struct sysinfo i; | 489 | unsigned int free_swap = swap_info[root_swap].pages - |
| 490 | swap_info[root_swap].inuse_pages; | ||
| 523 | 491 | ||
| 524 | si_swapinfo(&i); | 492 | pr_debug("swsusp: free swap pages: %u\n", free_swap); |
| 525 | pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); | 493 | return free_swap > (nr_pages + PAGES_FOR_IO + |
| 526 | return i.freeswap > (nr_pages + PAGES_FOR_IO + | ||
| 527 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | 494 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); |
| 528 | } | 495 | } |
| 529 | 496 | ||
| 530 | /** | 497 | /** |
| 531 | * write_suspend_image - Write entire image and metadata. | 498 | * swsusp_write - Write entire image and metadata. |
| 532 | * | 499 | * |
| 500 | * It is important _NOT_ to umount filesystems at this point. We want | ||
| 501 | * them synced (in case something goes wrong) but we DO not want to mark | ||
| 502 | * filesystem clean: it is not. (And it does not matter, if we resume | ||
| 503 | * correctly, we'll mark system clean, anyway.) | ||
| 533 | */ | 504 | */ |
| 534 | static int write_suspend_image(void) | 505 | |
| 506 | int swsusp_write(struct pbe *pblist, unsigned int nr_pages) | ||
| 535 | { | 507 | { |
| 508 | struct swap_map_page *swap_map; | ||
| 509 | struct swap_map_handle handle; | ||
| 510 | swp_entry_t start; | ||
| 536 | int error; | 511 | int error; |
| 537 | 512 | ||
| 538 | if (!enough_swap(nr_copy_pages)) { | 513 | if ((error = swsusp_swap_check())) { |
| 514 | printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); | ||
| 515 | return error; | ||
| 516 | } | ||
| 517 | if (!enough_swap(nr_pages)) { | ||
| 539 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | 518 | printk(KERN_ERR "swsusp: Not enough free swap\n"); |
| 540 | return -ENOSPC; | 519 | return -ENOSPC; |
| 541 | } | 520 | } |
| 542 | 521 | ||
| 543 | init_header(); | 522 | init_header(nr_pages); |
| 544 | if ((error = data_write())) | 523 | swap_map = alloc_swap_map(swsusp_info.pages); |
| 545 | goto FreeData; | 524 | if (!swap_map) |
| 525 | return -ENOMEM; | ||
| 526 | init_swap_map_handle(&handle, swap_map); | ||
| 527 | |||
| 528 | error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); | ||
| 529 | if (!error) | ||
| 530 | error = save_image_metadata(pblist, &handle); | ||
| 531 | if (!error) | ||
| 532 | error = save_image_data(pblist, &handle, nr_pages); | ||
| 533 | if (error) | ||
| 534 | goto Free_image_entries; | ||
| 546 | 535 | ||
| 547 | if ((error = write_pagedir())) | 536 | swap_map = reverse_swap_map(swap_map); |
| 548 | goto FreePagedir; | 537 | error = save_swap_map(swap_map, &start); |
| 538 | if (error) | ||
| 539 | goto Free_map_entries; | ||
| 549 | 540 | ||
| 550 | if ((error = close_swap())) | 541 | dump_info(); |
| 551 | goto FreePagedir; | 542 | printk( "S" ); |
| 552 | Done: | 543 | error = mark_swapfiles(start); |
| 553 | memset(key_iv, 0, MAXKEY+MAXIV); | 544 | printk( "|\n" ); |
| 545 | if (error) | ||
| 546 | goto Free_map_entries; | ||
| 547 | |||
| 548 | Free_swap_map: | ||
| 549 | free_swap_map(swap_map); | ||
| 554 | return error; | 550 | return error; |
| 555 | FreePagedir: | 551 | |
| 556 | free_pagedir_entries(); | 552 | Free_map_entries: |
| 557 | FreeData: | 553 | free_swap_map_entries(swap_map); |
| 558 | data_free(); | 554 | Free_image_entries: |
| 559 | goto Done; | 555 | free_image_entries(swap_map); |
| 556 | goto Free_swap_map; | ||
| 560 | } | 557 | } |
| 561 | 558 | ||
| 562 | /* It is important _NOT_ to umount filesystems at this point. We want | 559 | /** |
| 563 | * them synced (in case something goes wrong) but we DO not want to mark | 560 | * swsusp_shrink_memory - Try to free as much memory as needed |
| 564 | * filesystem clean: it is not. (And it does not matter, if we resume | 561 | * |
| 565 | * correctly, we'll mark system clean, anyway.) | 562 | * ... but do not OOM-kill anyone |
| 563 | * | ||
| 564 | * Notice: all userland should be stopped before it is called, or | ||
| 565 | * livelock is possible. | ||
| 566 | */ | 566 | */ |
| 567 | int swsusp_write(void) | ||
| 568 | { | ||
| 569 | int error; | ||
| 570 | 567 | ||
| 571 | if ((error = swsusp_swap_check())) { | 568 | #define SHRINK_BITE 10000 |
| 572 | printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); | ||
| 573 | return error; | ||
| 574 | } | ||
| 575 | lock_swapdevices(); | ||
| 576 | error = write_suspend_image(); | ||
| 577 | /* This will unlock ignored swap devices since writing is finished */ | ||
| 578 | lock_swapdevices(); | ||
| 579 | return error; | ||
| 580 | } | ||
| 581 | 569 | ||
| 570 | int swsusp_shrink_memory(void) | ||
| 571 | { | ||
| 572 | long size, tmp; | ||
| 573 | struct zone *zone; | ||
| 574 | unsigned long pages = 0; | ||
| 575 | unsigned int i = 0; | ||
| 576 | char *p = "-\\|/"; | ||
| 577 | |||
| 578 | printk("Shrinking memory... "); | ||
| 579 | do { | ||
| 580 | size = 2 * count_highmem_pages(); | ||
| 581 | size += size / 50 + count_data_pages(); | ||
| 582 | size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + | ||
| 583 | PAGES_FOR_IO; | ||
| 584 | tmp = size; | ||
| 585 | for_each_zone (zone) | ||
| 586 | if (!is_highmem(zone)) | ||
| 587 | tmp -= zone->free_pages; | ||
| 588 | if (tmp > 0) { | ||
| 589 | tmp = shrink_all_memory(SHRINK_BITE); | ||
| 590 | if (!tmp) | ||
| 591 | return -ENOMEM; | ||
| 592 | pages += tmp; | ||
| 593 | } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) { | ||
| 594 | tmp = shrink_all_memory(SHRINK_BITE); | ||
| 595 | pages += tmp; | ||
| 596 | } | ||
| 597 | printk("\b%c", p[i++%4]); | ||
| 598 | } while (tmp > 0); | ||
| 599 | printk("\bdone (%lu pages freed)\n", pages); | ||
| 582 | 600 | ||
| 601 | return 0; | ||
| 602 | } | ||
| 583 | 603 | ||
| 584 | int swsusp_suspend(void) | 604 | int swsusp_suspend(void) |
| 585 | { | 605 | { |
| @@ -677,7 +697,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src) | |||
| 677 | /* We assume both lists contain the same number of elements */ | 697 | /* We assume both lists contain the same number of elements */ |
| 678 | while (src) { | 698 | while (src) { |
| 679 | dst->orig_address = src->orig_address; | 699 | dst->orig_address = src->orig_address; |
| 680 | dst->swap_address = src->swap_address; | ||
| 681 | dst = dst->next; | 700 | dst = dst->next; |
| 682 | src = src->next; | 701 | src = src->next; |
| 683 | } | 702 | } |
| @@ -757,198 +776,224 @@ static int bio_write_page(pgoff_t page_off, void *page) | |||
| 757 | return submit(WRITE, page_off, page); | 776 | return submit(WRITE, page_off, page); |
| 758 | } | 777 | } |
| 759 | 778 | ||
| 760 | /* | 779 | /** |
| 761 | * Sanity check if this image makes sense with this kernel/swap context | 780 | * The following functions allow us to read data using a swap map |
| 762 | * I really don't think that it's foolproof but more than nothing.. | 781 | * in a file-alike way |
| 763 | */ | 782 | */ |
| 764 | 783 | ||
| 765 | static const char *sanity_check(void) | 784 | static inline void release_swap_map_reader(struct swap_map_handle *handle) |
| 766 | { | 785 | { |
| 767 | dump_info(); | 786 | if (handle->cur) |
| 768 | if (swsusp_info.version_code != LINUX_VERSION_CODE) | 787 | free_page((unsigned long)handle->cur); |
| 769 | return "kernel version"; | 788 | handle->cur = NULL; |
| 770 | if (swsusp_info.num_physpages != num_physpages) | ||
| 771 | return "memory size"; | ||
| 772 | if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) | ||
| 773 | return "system type"; | ||
| 774 | if (strcmp(swsusp_info.uts.release,system_utsname.release)) | ||
| 775 | return "kernel release"; | ||
| 776 | if (strcmp(swsusp_info.uts.version,system_utsname.version)) | ||
| 777 | return "version"; | ||
| 778 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) | ||
| 779 | return "machine"; | ||
| 780 | #if 0 | ||
| 781 | /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */ | ||
| 782 | if (swsusp_info.cpus != num_possible_cpus()) | ||
| 783 | return "number of cpus"; | ||
| 784 | #endif | ||
| 785 | return NULL; | ||
| 786 | } | 789 | } |
| 787 | 790 | ||
| 788 | 791 | static inline int get_swap_map_reader(struct swap_map_handle *handle, | |
| 789 | static int check_header(void) | 792 | swp_entry_t start) |
| 790 | { | 793 | { |
| 791 | const char *reason = NULL; | ||
| 792 | int error; | 794 | int error; |
| 793 | 795 | ||
| 794 | if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) | 796 | if (!swp_offset(start)) |
| 797 | return -EINVAL; | ||
| 798 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | ||
| 799 | if (!handle->cur) | ||
| 800 | return -ENOMEM; | ||
| 801 | error = bio_read_page(swp_offset(start), handle->cur); | ||
| 802 | if (error) { | ||
| 803 | release_swap_map_reader(handle); | ||
| 795 | return error; | 804 | return error; |
| 796 | |||
| 797 | /* Is this same machine? */ | ||
| 798 | if ((reason = sanity_check())) { | ||
| 799 | printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason); | ||
| 800 | return -EPERM; | ||
| 801 | } | 805 | } |
| 802 | nr_copy_pages = swsusp_info.image_pages; | 806 | handle->k = 0; |
| 803 | return error; | 807 | return 0; |
| 804 | } | 808 | } |
| 805 | 809 | ||
| 806 | static int check_sig(void) | 810 | static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) |
| 807 | { | 811 | { |
| 812 | unsigned long offset; | ||
| 808 | int error; | 813 | int error; |
| 809 | 814 | ||
| 810 | memset(&swsusp_header, 0, sizeof(swsusp_header)); | 815 | if (!handle->cur) |
| 811 | if ((error = bio_read_page(0, &swsusp_header))) | 816 | return -EINVAL; |
| 812 | return error; | 817 | offset = swp_offset(handle->cur->entries[handle->k]); |
| 813 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | 818 | if (!offset) |
| 814 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | ||
| 815 | memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); | ||
| 816 | memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); | ||
| 817 | |||
| 818 | /* | ||
| 819 | * Reset swap signature now. | ||
| 820 | */ | ||
| 821 | error = bio_write_page(0, &swsusp_header); | ||
| 822 | } else { | ||
| 823 | return -EINVAL; | 819 | return -EINVAL; |
| 820 | error = bio_read_page(offset, buf); | ||
| 821 | if (error) | ||
| 822 | return error; | ||
| 823 | if (++handle->k >= MAP_PAGE_SIZE) { | ||
| 824 | handle->k = 0; | ||
| 825 | offset = swp_offset(handle->cur->next_swap); | ||
| 826 | if (!offset) | ||
| 827 | release_swap_map_reader(handle); | ||
| 828 | else | ||
| 829 | error = bio_read_page(offset, handle->cur); | ||
| 824 | } | 830 | } |
| 825 | if (!error) | ||
| 826 | pr_debug("swsusp: Signature found, resuming\n"); | ||
| 827 | return error; | 831 | return error; |
| 828 | } | 832 | } |
| 829 | 833 | ||
| 830 | /** | 834 | static int check_header(void) |
| 831 | * data_read - Read image pages from swap. | ||
| 832 | * | ||
| 833 | * You do not need to check for overlaps, check_pagedir() | ||
| 834 | * already did that. | ||
| 835 | */ | ||
| 836 | |||
| 837 | static int data_read(struct pbe *pblist) | ||
| 838 | { | 835 | { |
| 839 | struct pbe *p; | 836 | char *reason = NULL; |
| 840 | int error = 0; | ||
| 841 | int i = 0; | ||
| 842 | int mod = swsusp_info.image_pages / 100; | ||
| 843 | void *tfm; | ||
| 844 | |||
| 845 | if ((error = crypto_init(0, &tfm))) | ||
| 846 | return error; | ||
| 847 | |||
| 848 | if (!mod) | ||
| 849 | mod = 1; | ||
| 850 | |||
| 851 | printk("swsusp: Reading image data (%lu pages): ", | ||
| 852 | swsusp_info.image_pages); | ||
| 853 | |||
| 854 | for_each_pbe (p, pblist) { | ||
| 855 | if (!(i % mod)) | ||
| 856 | printk("\b\b\b\b%3d%%", i / mod); | ||
| 857 | 837 | ||
| 858 | if ((error = crypto_read(p, tfm))) { | 838 | dump_info(); |
| 859 | crypto_exit(tfm); | 839 | if (swsusp_info.version_code != LINUX_VERSION_CODE) |
| 860 | return error; | 840 | reason = "kernel version"; |
| 861 | } | 841 | if (swsusp_info.num_physpages != num_physpages) |
| 862 | 842 | reason = "memory size"; | |
| 863 | i++; | 843 | if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) |
| 844 | reason = "system type"; | ||
| 845 | if (strcmp(swsusp_info.uts.release,system_utsname.release)) | ||
| 846 | reason = "kernel release"; | ||
| 847 | if (strcmp(swsusp_info.uts.version,system_utsname.version)) | ||
| 848 | reason = "version"; | ||
| 849 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) | ||
| 850 | reason = "machine"; | ||
| 851 | if (reason) { | ||
| 852 | printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); | ||
| 853 | return -EPERM; | ||
| 864 | } | 854 | } |
| 865 | printk("\b\b\b\bdone\n"); | 855 | return 0; |
| 866 | crypto_exit(tfm); | ||
| 867 | return error; | ||
| 868 | } | 856 | } |
| 869 | 857 | ||
| 870 | /** | 858 | /** |
| 871 | * read_pagedir - Read page backup list pages from swap | 859 | * load_image_data - load the image data using the swap map handle |
| 860 | * @handle and store them using the page backup list @pblist | ||
| 861 | * (assume there are @nr_pages pages to load) | ||
| 872 | */ | 862 | */ |
| 873 | 863 | ||
| 874 | static int read_pagedir(struct pbe *pblist) | 864 | static int load_image_data(struct pbe *pblist, |
| 865 | struct swap_map_handle *handle, | ||
| 866 | unsigned int nr_pages) | ||
| 875 | { | 867 | { |
| 876 | struct pbe *pbpage, *p; | ||
| 877 | unsigned int i = 0; | ||
| 878 | int error; | 868 | int error; |
| 869 | unsigned int m; | ||
| 870 | struct pbe *p; | ||
| 879 | 871 | ||
| 880 | if (!pblist) | 872 | if (!pblist) |
| 881 | return -EFAULT; | 873 | return -EINVAL; |
| 882 | 874 | printk("Loading image data pages (%u pages) ... ", nr_pages); | |
| 883 | printk("swsusp: Reading pagedir (%lu pages)\n", | 875 | m = nr_pages / 100; |
| 884 | swsusp_info.pagedir_pages); | 876 | if (!m) |
| 885 | 877 | m = 1; | |
| 886 | for_each_pb_page (pbpage, pblist) { | 878 | nr_pages = 0; |
| 887 | unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); | 879 | p = pblist; |
| 888 | 880 | while (p) { | |
| 889 | error = -EFAULT; | 881 | error = swap_map_read_page(handle, (void *)p->address); |
| 890 | if (offset) { | ||
| 891 | p = (pbpage + PB_PAGE_SKIP)->next; | ||
| 892 | error = bio_read_page(offset, (void *)pbpage); | ||
| 893 | (pbpage + PB_PAGE_SKIP)->next = p; | ||
| 894 | } | ||
| 895 | if (error) | 882 | if (error) |
| 896 | break; | 883 | break; |
| 884 | p = p->next; | ||
| 885 | if (!(nr_pages % m)) | ||
| 886 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
| 887 | nr_pages++; | ||
| 897 | } | 888 | } |
| 898 | |||
| 899 | if (!error) | 889 | if (!error) |
| 900 | BUG_ON(i != swsusp_info.pagedir_pages); | 890 | printk("\b\b\b\bdone\n"); |
| 901 | |||
| 902 | return error; | 891 | return error; |
| 903 | } | 892 | } |
| 904 | 893 | ||
| 894 | /** | ||
| 895 | * unpack_orig_addresses - copy the elements of @buf[] (1 page) to | ||
| 896 | * the PBEs in the list starting at @pbe | ||
| 897 | */ | ||
| 905 | 898 | ||
| 906 | static int check_suspend_image(void) | 899 | static inline struct pbe *unpack_orig_addresses(unsigned long *buf, |
| 900 | struct pbe *pbe) | ||
| 907 | { | 901 | { |
| 908 | int error = 0; | 902 | int j; |
| 909 | 903 | ||
| 910 | if ((error = check_sig())) | 904 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { |
| 911 | return error; | 905 | pbe->orig_address = buf[j]; |
| 912 | 906 | pbe = pbe->next; | |
| 913 | if ((error = check_header())) | 907 | } |
| 914 | return error; | 908 | return pbe; |
| 915 | |||
| 916 | return 0; | ||
| 917 | } | 909 | } |
| 918 | 910 | ||
| 919 | static int read_suspend_image(void) | 911 | /** |
| 912 | * load_image_metadata - load the image metadata using the swap map | ||
| 913 | * handle @handle and put them into the PBEs in the list @pblist | ||
| 914 | */ | ||
| 915 | |||
| 916 | static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) | ||
| 920 | { | 917 | { |
| 921 | int error = 0; | ||
| 922 | struct pbe *p; | 918 | struct pbe *p; |
| 919 | unsigned long *buf; | ||
| 920 | unsigned int n = 0; | ||
| 921 | int error = 0; | ||
| 923 | 922 | ||
| 924 | if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) | 923 | printk("Loading image metadata ... "); |
| 924 | buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); | ||
| 925 | if (!buf) | ||
| 925 | return -ENOMEM; | 926 | return -ENOMEM; |
| 926 | 927 | p = pblist; | |
| 927 | if ((error = read_pagedir(p))) | 928 | while (p) { |
| 928 | return error; | 929 | error = swap_map_read_page(handle, buf); |
| 929 | create_pbe_list(p, nr_copy_pages); | 930 | if (error) |
| 930 | mark_unsafe_pages(p); | 931 | break; |
| 931 | pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); | 932 | p = unpack_orig_addresses(buf, p); |
| 932 | if (pagedir_nosave) { | 933 | n++; |
| 933 | create_pbe_list(pagedir_nosave, nr_copy_pages); | ||
| 934 | copy_page_backup_list(pagedir_nosave, p); | ||
| 935 | } | 934 | } |
| 936 | free_pagedir(p); | 935 | free_page((unsigned long)buf); |
| 937 | if (!pagedir_nosave) | 936 | if (!error) |
| 938 | return -ENOMEM; | 937 | printk("done (%u pages loaded)\n", n); |
| 938 | return error; | ||
| 939 | } | ||
| 939 | 940 | ||
| 940 | /* Allocate memory for the image and read the data from swap */ | 941 | int swsusp_read(struct pbe **pblist_ptr) |
| 942 | { | ||
| 943 | int error; | ||
| 944 | struct pbe *p, *pblist; | ||
| 945 | struct swap_map_handle handle; | ||
| 946 | unsigned int nr_pages; | ||
| 941 | 947 | ||
| 942 | error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); | 948 | if (IS_ERR(resume_bdev)) { |
| 949 | pr_debug("swsusp: block device not initialised\n"); | ||
| 950 | return PTR_ERR(resume_bdev); | ||
| 951 | } | ||
| 943 | 952 | ||
| 953 | error = get_swap_map_reader(&handle, swsusp_header.image); | ||
| 944 | if (!error) | 954 | if (!error) |
| 945 | error = data_read(pagedir_nosave); | 955 | error = swap_map_read_page(&handle, &swsusp_info); |
| 956 | if (!error) | ||
| 957 | error = check_header(); | ||
| 958 | if (error) | ||
| 959 | return error; | ||
| 960 | nr_pages = swsusp_info.image_pages; | ||
| 961 | p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); | ||
| 962 | if (!p) | ||
| 963 | return -ENOMEM; | ||
| 964 | error = load_image_metadata(p, &handle); | ||
| 965 | if (!error) { | ||
| 966 | mark_unsafe_pages(p); | ||
| 967 | pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); | ||
| 968 | if (pblist) | ||
| 969 | copy_page_backup_list(pblist, p); | ||
| 970 | free_pagedir(p); | ||
| 971 | if (!pblist) | ||
| 972 | error = -ENOMEM; | ||
| 973 | |||
| 974 | /* Allocate memory for the image and read the data from swap */ | ||
| 975 | if (!error) | ||
| 976 | error = alloc_data_pages(pblist, GFP_ATOMIC, 1); | ||
| 977 | if (!error) { | ||
| 978 | release_eaten_pages(); | ||
| 979 | error = load_image_data(pblist, &handle, nr_pages); | ||
| 980 | } | ||
| 981 | if (!error) | ||
| 982 | *pblist_ptr = pblist; | ||
| 983 | } | ||
| 984 | release_swap_map_reader(&handle); | ||
| 946 | 985 | ||
| 986 | blkdev_put(resume_bdev); | ||
| 987 | |||
| 988 | if (!error) | ||
| 989 | pr_debug("swsusp: Reading resume file was successful\n"); | ||
| 990 | else | ||
| 991 | pr_debug("swsusp: Error %d resuming\n", error); | ||
| 947 | return error; | 992 | return error; |
| 948 | } | 993 | } |
| 949 | 994 | ||
| 950 | /** | 995 | /** |
| 951 | * swsusp_check - Check for saved image in swap | 996 | * swsusp_check - Check for swsusp signature in the resume device |
| 952 | */ | 997 | */ |
| 953 | 998 | ||
| 954 | int swsusp_check(void) | 999 | int swsusp_check(void) |
| @@ -958,40 +1003,27 @@ int swsusp_check(void) | |||
| 958 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | 1003 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); |
| 959 | if (!IS_ERR(resume_bdev)) { | 1004 | if (!IS_ERR(resume_bdev)) { |
| 960 | set_blocksize(resume_bdev, PAGE_SIZE); | 1005 | set_blocksize(resume_bdev, PAGE_SIZE); |
| 961 | error = check_suspend_image(); | 1006 | memset(&swsusp_header, 0, sizeof(swsusp_header)); |
| 1007 | if ((error = bio_read_page(0, &swsusp_header))) | ||
| 1008 | return error; | ||
| 1009 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | ||
| 1010 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | ||
| 1011 | /* Reset swap signature now */ | ||
| 1012 | error = bio_write_page(0, &swsusp_header); | ||
| 1013 | } else { | ||
| 1014 | return -EINVAL; | ||
| 1015 | } | ||
| 962 | if (error) | 1016 | if (error) |
| 963 | blkdev_put(resume_bdev); | 1017 | blkdev_put(resume_bdev); |
| 964 | } else | 1018 | else |
| 1019 | pr_debug("swsusp: Signature found, resuming\n"); | ||
| 1020 | } else { | ||
| 965 | error = PTR_ERR(resume_bdev); | 1021 | error = PTR_ERR(resume_bdev); |
| 966 | |||
| 967 | if (!error) | ||
| 968 | pr_debug("swsusp: resume file found\n"); | ||
| 969 | else | ||
| 970 | pr_debug("swsusp: Error %d check for resume file\n", error); | ||
| 971 | return error; | ||
| 972 | } | ||
| 973 | |||
| 974 | /** | ||
| 975 | * swsusp_read - Read saved image from swap. | ||
| 976 | */ | ||
| 977 | |||
| 978 | int swsusp_read(void) | ||
| 979 | { | ||
| 980 | int error; | ||
| 981 | |||
| 982 | if (IS_ERR(resume_bdev)) { | ||
| 983 | pr_debug("swsusp: block device not initialised\n"); | ||
| 984 | return PTR_ERR(resume_bdev); | ||
| 985 | } | 1022 | } |
| 986 | 1023 | ||
| 987 | error = read_suspend_image(); | 1024 | if (error) |
| 988 | blkdev_put(resume_bdev); | 1025 | pr_debug("swsusp: Error %d check for resume file\n", error); |
| 989 | memset(key_iv, 0, MAXKEY+MAXIV); | ||
| 990 | 1026 | ||
| 991 | if (!error) | ||
| 992 | pr_debug("swsusp: Reading resume file was successful\n"); | ||
| 993 | else | ||
| 994 | pr_debug("swsusp: Error %d resuming\n", error); | ||
| 995 | return error; | 1027 | return error; |
| 996 | } | 1028 | } |
| 997 | 1029 | ||
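A note on the image layout that swsusp_write() above commits to swap: init_header() sizes the image as the data pages themselves, plus the metadata pages needed to pack one `orig_address` long per data page, plus a single header page. The standalone C program below simply replays that arithmetic in userspace, assuming a 4 KiB page and 8-byte longs as on a typical 64-bit build; it is an illustration of the formula, not kernel code.

```c
/* Standalone sketch of the image-size arithmetic used by init_header()
 * in the patch above.  Assumes a 4 KiB page and 8-byte longs (typical
 * 64-bit build); plain userspace code, not part of the kernel. */
#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

static unsigned long image_pages(unsigned long nr_data_pages)
{
	/* metadata: one long (orig_address) per data page, rounded up
	 * to whole pages, plus one page for the swsusp_info header */
	unsigned long meta = (nr_data_pages * sizeof(long) + PAGE_SIZE - 1)
				>> PAGE_SHIFT;

	return nr_data_pages + meta + 1;
}

int main(void)
{
	unsigned long n;

	for (n = 1; n <= 100000; n *= 10)
		printf("%8lu data pages -> %8lu pages on swap\n",
		       n, image_pages(n));
	return 0;
}
```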
diff --git a/kernel/printk.c b/kernel/printk.c index 5287be83e3e7..2251be80cd22 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -569,7 +569,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 569 | p[1] <= '7' && p[2] == '>') { | 569 | p[1] <= '7' && p[2] == '>') { |
| 570 | loglev_char = p[1]; | 570 | loglev_char = p[1]; |
| 571 | p += 3; | 571 | p += 3; |
| 572 | printed_len += 3; | 572 | printed_len -= 3; |
| 573 | } else { | 573 | } else { |
| 574 | loglev_char = default_message_loglevel | 574 | loglev_char = default_message_loglevel |
| 575 | + '0'; | 575 | + '0'; |
| @@ -584,7 +584,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 584 | 584 | ||
| 585 | for (tp = tbuf; tp < tbuf + tlen; tp++) | 585 | for (tp = tbuf; tp < tbuf + tlen; tp++) |
| 586 | emit_log_char(*tp); | 586 | emit_log_char(*tp); |
| 587 | printed_len += tlen - 3; | 587 | printed_len += tlen; |
| 588 | } else { | 588 | } else { |
| 589 | if (p[0] != '<' || p[1] < '0' || | 589 | if (p[0] != '<' || p[1] < '0' || |
| 590 | p[1] > '7' || p[2] != '>') { | 590 | p[1] > '7' || p[2] != '>') { |
| @@ -592,8 +592,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
| 592 | emit_log_char(default_message_loglevel | 592 | emit_log_char(default_message_loglevel |
| 593 | + '0'); | 593 | + '0'); |
| 594 | emit_log_char('>'); | 594 | emit_log_char('>'); |
| 595 | printed_len += 3; | ||
| 595 | } | 596 | } |
| 596 | printed_len += 3; | ||
| 597 | } | 597 | } |
| 598 | log_level_unknown = 0; | 598 | log_level_unknown = 0; |
| 599 | if (!*p) | 599 | if (!*p) |
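The printk.c hunks above straighten out the `printed_len` bookkeeping so that vprintk()'s return value matches what is actually pushed into the log buffer: the three characters of a caller-supplied `<N>` prefix are subtracted when that prefix is stripped, the emitted `<c>[timestamp] ` string is added in full, and a default prefix is only counted in the branch that really emits one. The userspace sketch below illustrates the same invariant (returned length equals emitted characters) in a much simplified form; it ignores the timestamp path entirely, and `emit()`/`log_line()` are invented names rather than kernel APIs.

```c
/* Simplified userspace sketch of the printed_len bookkeeping fixed above:
 * the goal is that the returned length equals the characters actually
 * emitted.  Not the kernel's control flow; no timestamp handling here. */
#include <stdio.h>
#include <string.h>

static int emitted;

static void emit(char c)
{
	putchar(c);
	emitted++;
}

/* Consume an optional "<N>" prefix from msg, emit a prefix, then the body. */
static int log_line(const char *msg, int default_level)
{
	int printed_len = strlen(msg);
	char level;

	if (msg[0] == '<' && msg[1] >= '0' && msg[1] <= '7' && msg[2] == '>') {
		level = msg[1];
		msg += 3;
		printed_len -= 3;	/* the stripped prefix is not emitted as-is */
	} else {
		level = '0' + default_level;
	}
	emit('<'); emit(level); emit('>');
	printed_len += 3;		/* ...but the prefix we do emit is counted */

	while (*msg)
		emit(*msg++);
	emit('\n');
	return printed_len + 1;		/* count the newline too */
}

int main(void)
{
	const char *tests[] = { "<3>disk error", "plain message" };
	int i;

	for (i = 0; i < 2; i++) {
		int before = emitted;
		int ret = log_line(tests[i], 4);

		fprintf(stderr, "returned %d, actually emitted %d\n",
			ret, emitted - before);
	}
	return 0;
}
```

For both test strings the returned length and the emitted count agree, which is the property the kernel fix restores.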
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 656476eedb1b..5f33cdb6fff5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -7,6 +7,7 @@ | |||
| 7 | * to continually duplicate across every architecture. | 7 | * to continually duplicate across every architecture. |
| 8 | */ | 8 | */ |
| 9 | 9 | ||
| 10 | #include <linux/capability.h> | ||
| 10 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 11 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
| 12 | #include <linux/errno.h> | 13 | #include <linux/errno.h> |
| @@ -408,54 +409,62 @@ int ptrace_request(struct task_struct *child, long request, | |||
| 408 | return ret; | 409 | return ret; |
| 409 | } | 410 | } |
| 410 | 411 | ||
| 411 | #ifndef __ARCH_SYS_PTRACE | 412 | /** |
| 412 | static int ptrace_get_task_struct(long request, long pid, | 413 | * ptrace_traceme -- helper for PTRACE_TRACEME |
| 413 | struct task_struct **childp) | 414 | * |
| 415 | * Performs checks and sets PT_PTRACED. | ||
| 416 | * Should be used by all ptrace implementations for PTRACE_TRACEME. | ||
| 417 | */ | ||
| 418 | int ptrace_traceme(void) | ||
| 414 | { | 419 | { |
| 415 | struct task_struct *child; | ||
| 416 | int ret; | 420 | int ret; |
| 417 | 421 | ||
| 418 | /* | 422 | /* |
| 419 | * Callers use child == NULL as an indication to exit early even | 423 | * Are we already being traced? |
| 420 | * when the return value is 0, so make sure it is non-NULL here. | 424 | */ |
| 425 | if (current->ptrace & PT_PTRACED) | ||
| 426 | return -EPERM; | ||
| 427 | ret = security_ptrace(current->parent, current); | ||
| 428 | if (ret) | ||
| 429 | return -EPERM; | ||
| 430 | /* | ||
| 431 | * Set the ptrace bit in the process ptrace flags. | ||
| 421 | */ | 432 | */ |
| 422 | *childp = NULL; | 433 | current->ptrace |= PT_PTRACED; |
| 434 | return 0; | ||
| 435 | } | ||
| 423 | 436 | ||
| 424 | if (request == PTRACE_TRACEME) { | 437 | /** |
| 425 | /* | 438 | * ptrace_get_task_struct -- grab a task struct reference for ptrace |
| 426 | * Are we already being traced? | 439 | * @pid: process id to grab a task_struct reference of |
| 427 | */ | 440 | * |
| 428 | if (current->ptrace & PT_PTRACED) | 441 | * This function is a helper for ptrace implementations. It checks |
| 429 | return -EPERM; | 442 | * permissions and then grabs a task struct for use of the actual |
| 430 | ret = security_ptrace(current->parent, current); | 443 | * ptrace implementation. |
| 431 | if (ret) | 444 | * |
| 432 | return -EPERM; | 445 | * Returns the task_struct for @pid or an ERR_PTR() on failure. |
| 433 | /* | 446 | */ |
| 434 | * Set the ptrace bit in the process ptrace flags. | 447 | struct task_struct *ptrace_get_task_struct(pid_t pid) |
| 435 | */ | 448 | { |
| 436 | current->ptrace |= PT_PTRACED; | 449 | struct task_struct *child; |
| 437 | return 0; | ||
| 438 | } | ||
| 439 | 450 | ||
| 440 | /* | 451 | /* |
| 441 | * You may not mess with init | 452 | * Tracing init is not allowed. |
| 442 | */ | 453 | */ |
| 443 | if (pid == 1) | 454 | if (pid == 1) |
| 444 | return -EPERM; | 455 | return ERR_PTR(-EPERM); |
| 445 | 456 | ||
| 446 | ret = -ESRCH; | ||
| 447 | read_lock(&tasklist_lock); | 457 | read_lock(&tasklist_lock); |
| 448 | child = find_task_by_pid(pid); | 458 | child = find_task_by_pid(pid); |
| 449 | if (child) | 459 | if (child) |
| 450 | get_task_struct(child); | 460 | get_task_struct(child); |
| 451 | read_unlock(&tasklist_lock); | 461 | read_unlock(&tasklist_lock); |
| 452 | if (!child) | 462 | if (!child) |
| 453 | return -ESRCH; | 463 | return ERR_PTR(-ESRCH); |
| 454 | 464 | return child; | |
| 455 | *childp = child; | ||
| 456 | return 0; | ||
| 457 | } | 465 | } |
| 458 | 466 | ||
| 467 | #ifndef __ARCH_SYS_PTRACE | ||
| 459 | asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | 468 | asmlinkage long sys_ptrace(long request, long pid, long addr, long data) |
| 460 | { | 469 | { |
| 461 | struct task_struct *child; | 470 | struct task_struct *child; |
| @@ -465,9 +474,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | |||
| 465 | * This lock_kernel fixes a subtle race with suid exec | 474 | * This lock_kernel fixes a subtle race with suid exec |
| 466 | */ | 475 | */ |
| 467 | lock_kernel(); | 476 | lock_kernel(); |
| 468 | ret = ptrace_get_task_struct(request, pid, &child); | 477 | if (request == PTRACE_TRACEME) { |
| 469 | if (!child) | 478 | ret = ptrace_traceme(); |
| 470 | goto out; | 479 | goto out; |
| 480 | } | ||
| 481 | |||
| 482 | child = ptrace_get_task_struct(pid); | ||
| 483 | if (IS_ERR(child)) { | ||
| 484 | ret = PTR_ERR(child); | ||
| 485 | goto out; | ||
| 486 | } | ||
| 471 | 487 | ||
| 472 | if (request == PTRACE_ATTACH) { | 488 | if (request == PTRACE_ATTACH) { |
| 473 | ret = ptrace_attach(child); | 489 | ret = ptrace_attach(child); |
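The ptrace.c refactoring above has ptrace_get_task_struct() report failure through the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention instead of an output parameter, which is what lets sys_ptrace() test the returned pointer directly. As a reminder of how that convention works, here is a minimal userspace rendition; the real helpers live in <linux/err.h>, and `find_thing()` is a made-up stand-in for the pid lookup, not anything in the patch.

```c
/* Minimal userspace rendition of the ERR_PTR/IS_ERR/PTR_ERR convention
 * that the new ptrace_get_task_struct() relies on.  find_thing() is a
 * hypothetical lookup used only for illustration. */
#include <stdio.h>
#include <errno.h>
#include <stdint.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)
{
	/* errno values live in the top few kilobytes of the address space */
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

struct thing { int id; };
static struct thing table[] = { { 2 }, { 5 } };

static struct thing *find_thing(int id)
{
	unsigned int i;

	if (id == 1)			/* mirror the "tracing init is not allowed" rule */
		return ERR_PTR(-EPERM);
	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].id == id)
			return &table[i];
	return ERR_PTR(-ESRCH);		/* no such entry */
}

int main(void)
{
	int ids[] = { 1, 2, 3 };
	unsigned int i;

	for (i = 0; i < 3; i++) {
		struct thing *t = find_thing(ids[i]);

		if (IS_ERR(t))
			printf("id %d: error %ld\n", ids[i], PTR_ERR(t));
		else
			printf("id %d: found entry %p\n", ids[i], (void *)t);
	}
	return 0;
}
```

Because error codes and valid pointers share one return value, the caller needs exactly the IS_ERR()/PTR_ERR() pattern that sys_ptrace() now uses.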
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48d3bce465b8..0cf8146bd585 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
| 36 | #include <linux/spinlock.h> | 36 | #include <linux/spinlock.h> |
| 37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
| 38 | #include <linux/rcupdate.h> | ||
| 38 | #include <linux/interrupt.h> | 39 | #include <linux/interrupt.h> |
| 39 | #include <linux/sched.h> | 40 | #include <linux/sched.h> |
| 40 | #include <asm/atomic.h> | 41 | #include <asm/atomic.h> |
| @@ -45,26 +46,21 @@ | |||
| 45 | #include <linux/percpu.h> | 46 | #include <linux/percpu.h> |
| 46 | #include <linux/notifier.h> | 47 | #include <linux/notifier.h> |
| 47 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
| 48 | #include <linux/rcuref.h> | ||
| 49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
| 50 | 50 | ||
| 51 | /* Definition for rcupdate control block. */ | 51 | /* Definition for rcupdate control block. */ |
| 52 | struct rcu_ctrlblk rcu_ctrlblk = | 52 | struct rcu_ctrlblk rcu_ctrlblk = { |
| 53 | { .cur = -300, .completed = -300 }; | 53 | .cur = -300, |
| 54 | struct rcu_ctrlblk rcu_bh_ctrlblk = | 54 | .completed = -300, |
| 55 | { .cur = -300, .completed = -300 }; | 55 | .lock = SPIN_LOCK_UNLOCKED, |
| 56 | 56 | .cpumask = CPU_MASK_NONE, | |
| 57 | /* Bookkeeping of the progress of the grace period */ | 57 | }; |
| 58 | struct rcu_state { | 58 | struct rcu_ctrlblk rcu_bh_ctrlblk = { |
| 59 | spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */ | 59 | .cur = -300, |
| 60 | cpumask_t cpumask; /* CPUs that need to switch in order */ | 60 | .completed = -300, |
| 61 | /* for current batch to proceed. */ | 61 | .lock = SPIN_LOCK_UNLOCKED, |
| 62 | .cpumask = CPU_MASK_NONE, | ||
| 62 | }; | 63 | }; |
| 63 | |||
| 64 | static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = | ||
| 65 | {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; | ||
| 66 | static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = | ||
| 67 | {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; | ||
| 68 | 64 | ||
| 69 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | 65 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; |
| 70 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | 66 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; |
| @@ -73,19 +69,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | |||
| 73 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | 69 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; |
| 74 | static int maxbatch = 10000; | 70 | static int maxbatch = 10000; |
| 75 | 71 | ||
| 76 | #ifndef __HAVE_ARCH_CMPXCHG | ||
| 77 | /* | ||
| 78 | * We use an array of spinlocks for the rcurefs -- similar to ones in sparc | ||
| 79 | * 32 bit atomic_t implementations, and a hash function similar to that | ||
| 80 | * for our refcounting needs. | ||
| 81 | * Can't help multiprocessors which donot have cmpxchg :( | ||
| 82 | */ | ||
| 83 | |||
| 84 | spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { | ||
| 85 | [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED | ||
| 86 | }; | ||
| 87 | #endif | ||
| 88 | |||
| 89 | /** | 72 | /** |
| 90 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 73 | * call_rcu - Queue an RCU callback for invocation after a grace period. |
| 91 | * @head: structure to be used for queueing the RCU updates. | 74 | * @head: structure to be used for queueing the RCU updates. |
| @@ -233,13 +216,13 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
| 233 | * This is done by rcu_start_batch. The start is not broadcasted to | 216 | * This is done by rcu_start_batch. The start is not broadcasted to |
| 234 | * all cpus, they must pick this up by comparing rcp->cur with | 217 | * all cpus, they must pick this up by comparing rcp->cur with |
| 235 | * rdp->quiescbatch. All cpus are recorded in the | 218 | * rdp->quiescbatch. All cpus are recorded in the |
| 236 | * rcu_state.cpumask bitmap. | 219 | * rcu_ctrlblk.cpumask bitmap. |
| 237 | * - All cpus must go through a quiescent state. | 220 | * - All cpus must go through a quiescent state. |
| 238 | * Since the start of the grace period is not broadcasted, at least two | 221 | * Since the start of the grace period is not broadcasted, at least two |
| 239 | * calls to rcu_check_quiescent_state are required: | 222 | * calls to rcu_check_quiescent_state are required: |
| 240 | * The first call just notices that a new grace period is running. The | 223 | * The first call just notices that a new grace period is running. The |
| 241 | * following calls check if there was a quiescent state since the beginning | 224 | * following calls check if there was a quiescent state since the beginning |
| 242 | * of the grace period. If so, it updates rcu_state.cpumask. If | 225 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If |
| 243 | * the bitmap is empty, then the grace period is completed. | 226 | * the bitmap is empty, then the grace period is completed. |
| 244 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace | 227 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace |
| 245 | * period (if necessary). | 228 | * period (if necessary). |
| @@ -247,14 +230,10 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
| 247 | /* | 230 | /* |
| 248 | * Register a new batch of callbacks, and start it up if there is currently no | 231 | * Register a new batch of callbacks, and start it up if there is currently no |
| 249 | * active batch and the batch to be registered has not already occurred. | 232 | * active batch and the batch to be registered has not already occurred. |
| 250 | * Caller must hold rcu_state.lock. | 233 | * Caller must hold rcu_ctrlblk.lock. |
| 251 | */ | 234 | */ |
| 252 | static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, | 235 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) |
| 253 | int next_pending) | ||
| 254 | { | 236 | { |
| 255 | if (next_pending) | ||
| 256 | rcp->next_pending = 1; | ||
| 257 | |||
| 258 | if (rcp->next_pending && | 237 | if (rcp->next_pending && |
| 259 | rcp->completed == rcp->cur) { | 238 | rcp->completed == rcp->cur) { |
| 260 | rcp->next_pending = 0; | 239 | rcp->next_pending = 0; |
| @@ -268,11 +247,11 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, | |||
| 268 | /* | 247 | /* |
| 269 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | 248 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a |
| 270 | * Barrier Otherwise it can cause tickless idle CPUs to be | 249 | * Barrier Otherwise it can cause tickless idle CPUs to be |
| 271 | * included in rsp->cpumask, which will extend graceperiods | 250 | * included in rcp->cpumask, which will extend graceperiods |
| 272 | * unnecessarily. | 251 | * unnecessarily. |
| 273 | */ | 252 | */ |
| 274 | smp_mb(); | 253 | smp_mb(); |
| 275 | cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); | 254 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); |
| 276 | 255 | ||
| 277 | } | 256 | } |
| 278 | } | 257 | } |
| @@ -282,13 +261,13 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, | |||
| 282 | * Clear it from the cpu mask and complete the grace period if it was the last | 261 | * Clear it from the cpu mask and complete the grace period if it was the last |
| 283 | * cpu. Start another grace period if someone has further entries pending | 262 | * cpu. Start another grace period if someone has further entries pending |
| 284 | */ | 263 | */ |
| 285 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) | 264 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) |
| 286 | { | 265 | { |
| 287 | cpu_clear(cpu, rsp->cpumask); | 266 | cpu_clear(cpu, rcp->cpumask); |
| 288 | if (cpus_empty(rsp->cpumask)) { | 267 | if (cpus_empty(rcp->cpumask)) { |
| 289 | /* batch completed ! */ | 268 | /* batch completed ! */ |
| 290 | rcp->completed = rcp->cur; | 269 | rcp->completed = rcp->cur; |
| 291 | rcu_start_batch(rcp, rsp, 0); | 270 | rcu_start_batch(rcp); |
| 292 | } | 271 | } |
| 293 | } | 272 | } |
| 294 | 273 | ||
| @@ -298,7 +277,7 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) | |||
| 298 | * quiescent cycle, then indicate that it has done so. | 277 | * quiescent cycle, then indicate that it has done so. |
| 299 | */ | 278 | */ |
| 300 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | 279 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, |
| 301 | struct rcu_state *rsp, struct rcu_data *rdp) | 280 | struct rcu_data *rdp) |
| 302 | { | 281 | { |
| 303 | if (rdp->quiescbatch != rcp->cur) { | 282 | if (rdp->quiescbatch != rcp->cur) { |
| 304 | /* start new grace period: */ | 283 | /* start new grace period: */ |
| @@ -323,15 +302,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | |||
| 323 | return; | 302 | return; |
| 324 | rdp->qs_pending = 0; | 303 | rdp->qs_pending = 0; |
| 325 | 304 | ||
| 326 | spin_lock(&rsp->lock); | 305 | spin_lock(&rcp->lock); |
| 327 | /* | 306 | /* |
| 328 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | 307 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync |
| 329 | * during cpu startup. Ignore the quiescent state. | 308 | * during cpu startup. Ignore the quiescent state. |
| 330 | */ | 309 | */ |
| 331 | if (likely(rdp->quiescbatch == rcp->cur)) | 310 | if (likely(rdp->quiescbatch == rcp->cur)) |
| 332 | cpu_quiet(rdp->cpu, rcp, rsp); | 311 | cpu_quiet(rdp->cpu, rcp); |
| 333 | 312 | ||
| 334 | spin_unlock(&rsp->lock); | 313 | spin_unlock(&rcp->lock); |
| 335 | } | 314 | } |
| 336 | 315 | ||
| 337 | 316 | ||
| @@ -352,28 +331,29 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | |||
| 352 | } | 331 | } |
| 353 | 332 | ||
| 354 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | 333 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, |
| 355 | struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp) | 334 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) |
| 356 | { | 335 | { |
| 357 | /* if the cpu going offline owns the grace period | 336 | /* if the cpu going offline owns the grace period |
| 358 | * we can block indefinitely waiting for it, so flush | 337 | * we can block indefinitely waiting for it, so flush |
| 359 | * it here | 338 | * it here |
| 360 | */ | 339 | */ |
| 361 | spin_lock_bh(&rsp->lock); | 340 | spin_lock_bh(&rcp->lock); |
| 362 | if (rcp->cur != rcp->completed) | 341 | if (rcp->cur != rcp->completed) |
| 363 | cpu_quiet(rdp->cpu, rcp, rsp); | 342 | cpu_quiet(rdp->cpu, rcp); |
| 364 | spin_unlock_bh(&rsp->lock); | 343 | spin_unlock_bh(&rcp->lock); |
| 365 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | 344 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); |
| 366 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | 345 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); |
| 367 | 346 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | |
| 368 | } | 347 | } |
| 348 | |||
| 369 | static void rcu_offline_cpu(int cpu) | 349 | static void rcu_offline_cpu(int cpu) |
| 370 | { | 350 | { |
| 371 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | 351 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); |
| 372 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | 352 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); |
| 373 | 353 | ||
| 374 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state, | 354 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, |
| 375 | &per_cpu(rcu_data, cpu)); | 355 | &per_cpu(rcu_data, cpu)); |
| 376 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state, | 356 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, |
| 377 | &per_cpu(rcu_bh_data, cpu)); | 357 | &per_cpu(rcu_bh_data, cpu)); |
| 378 | put_cpu_var(rcu_data); | 358 | put_cpu_var(rcu_data); |
| 379 | put_cpu_var(rcu_bh_data); | 359 | put_cpu_var(rcu_bh_data); |
| @@ -392,7 +372,7 @@ static void rcu_offline_cpu(int cpu) | |||
| 392 | * This does the RCU processing work from tasklet context. | 372 | * This does the RCU processing work from tasklet context. |
| 393 | */ | 373 | */ |
| 394 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | 374 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, |
| 395 | struct rcu_state *rsp, struct rcu_data *rdp) | 375 | struct rcu_data *rdp) |
| 396 | { | 376 | { |
| 397 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | 377 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { |
| 398 | *rdp->donetail = rdp->curlist; | 378 | *rdp->donetail = rdp->curlist; |
| @@ -422,24 +402,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | |||
| 422 | 402 | ||
| 423 | if (!rcp->next_pending) { | 403 | if (!rcp->next_pending) { |
| 424 | /* and start it/schedule start if it's a new batch */ | 404 | /* and start it/schedule start if it's a new batch */ |
| 425 | spin_lock(&rsp->lock); | 405 | spin_lock(&rcp->lock); |
| 426 | rcu_start_batch(rcp, rsp, 1); | 406 | rcp->next_pending = 1; |
| 427 | spin_unlock(&rsp->lock); | 407 | rcu_start_batch(rcp); |
| 408 | spin_unlock(&rcp->lock); | ||
| 428 | } | 409 | } |
| 429 | } else { | 410 | } else { |
| 430 | local_irq_enable(); | 411 | local_irq_enable(); |
| 431 | } | 412 | } |
| 432 | rcu_check_quiescent_state(rcp, rsp, rdp); | 413 | rcu_check_quiescent_state(rcp, rdp); |
| 433 | if (rdp->donelist) | 414 | if (rdp->donelist) |
| 434 | rcu_do_batch(rdp); | 415 | rcu_do_batch(rdp); |
| 435 | } | 416 | } |
| 436 | 417 | ||
| 437 | static void rcu_process_callbacks(unsigned long unused) | 418 | static void rcu_process_callbacks(unsigned long unused) |
| 438 | { | 419 | { |
| 439 | __rcu_process_callbacks(&rcu_ctrlblk, &rcu_state, | 420 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); |
| 440 | &__get_cpu_var(rcu_data)); | 421 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); |
| 441 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state, | 422 | } |
| 442 | &__get_cpu_var(rcu_bh_data)); | 423 | |
| 424 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
| 425 | { | ||
| 426 | /* This cpu has pending rcu entries and the grace period | ||
| 427 | * for them has completed. | ||
| 428 | */ | ||
| 429 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
| 430 | return 1; | ||
| 431 | |||
| 432 | /* This cpu has no pending entries, but there are new entries */ | ||
| 433 | if (!rdp->curlist && rdp->nxtlist) | ||
| 434 | return 1; | ||
| 435 | |||
| 436 | /* This cpu has finished callbacks to invoke */ | ||
| 437 | if (rdp->donelist) | ||
| 438 | return 1; | ||
| 439 | |||
| 440 | /* The rcu core waits for a quiescent state from the cpu */ | ||
| 441 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
| 442 | return 1; | ||
| 443 | |||
| 444 | /* nothing to do */ | ||
| 445 | return 0; | ||
| 446 | } | ||
| 447 | |||
| 448 | int rcu_pending(int cpu) | ||
| 449 | { | ||
| 450 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
| 451 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
| 443 | } | 452 | } |
| 444 | 453 | ||
| 445 | void rcu_check_callbacks(int cpu, int user) | 454 | void rcu_check_callbacks(int cpu, int user) |
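With struct rcu_state folded into struct rcu_ctrlblk above, grace-period completion is still tracked the same way: rcu_start_batch() records every online CPU (minus tickless idle ones) in `cpumask`, each CPU clears its own bit via cpu_quiet() once it has passed a quiescent state, and the batch completes when the mask goes empty. The toy, single-threaded C sketch below illustrates only that bookkeeping; the struct and helper names are inventions for illustration, and the real code does this per CPU under rcu_ctrlblk.lock.

```c
/* Toy, single-threaded sketch of the grace-period bookkeeping described
 * above: a batch starts by recording the online CPUs in a mask, each CPU
 * clears its bit at a quiescent state, and the batch completes when the
 * mask is empty.  Names are invented; this is not kernel code. */
#include <stdio.h>

struct toy_ctrlblk {
	long cur;		/* batch currently running */
	long completed;		/* last completed batch */
	unsigned long cpumask;	/* CPUs that still owe a quiescent state */
};

static void start_batch(struct toy_ctrlblk *rcp, unsigned long online_mask)
{
	rcp->cur++;
	rcp->cpumask = online_mask;
	printf("batch %ld started, waiting on mask 0x%lx\n",
	       rcp->cur, rcp->cpumask);
}

static void cpu_quiet(struct toy_ctrlblk *rcp, int cpu)
{
	rcp->cpumask &= ~(1UL << cpu);
	printf("cpu %d quiescent, mask now 0x%lx\n", cpu, rcp->cpumask);
	if (rcp->cpumask == 0) {
		rcp->completed = rcp->cur;	/* grace period over */
		printf("batch %ld completed\n", rcp->completed);
	}
}

int main(void)
{
	struct toy_ctrlblk rcp = { .cur = -300, .completed = -300 };
	int cpu;

	start_batch(&rcp, 0xf);			/* pretend four online CPUs */
	for (cpu = 0; cpu < 4; cpu++)
		cpu_quiet(&rcp, cpu);
	return 0;
}
```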
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 49fbbeff201c..773219907dd8 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -39,7 +39,6 @@ | |||
| 39 | #include <linux/moduleparam.h> | 39 | #include <linux/moduleparam.h> |
| 40 | #include <linux/percpu.h> | 40 | #include <linux/percpu.h> |
| 41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
| 42 | #include <linux/rcuref.h> | ||
| 43 | #include <linux/cpu.h> | 42 | #include <linux/cpu.h> |
| 44 | #include <linux/random.h> | 43 | #include <linux/random.h> |
| 45 | #include <linux/delay.h> | 44 | #include <linux/delay.h> |
| @@ -49,9 +48,11 @@ | |||
| 49 | MODULE_LICENSE("GPL"); | 48 | MODULE_LICENSE("GPL"); |
| 50 | 49 | ||
| 51 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ | 50 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ |
| 52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ | 51 | static int stat_interval; /* Interval between stats, in seconds. */ |
| 53 | /* Defaults to "only at end of test". */ | 52 | /* Defaults to "only at end of test". */ |
| 54 | static int verbose = 0; /* Print more debug info. */ | 53 | static int verbose; /* Print more debug info. */ |
| 54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | ||
| 55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ | ||
| 55 | 56 | ||
| 56 | MODULE_PARM(nreaders, "i"); | 57 | MODULE_PARM(nreaders, "i"); |
| 57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 58 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
| @@ -59,6 +60,10 @@ MODULE_PARM(stat_interval, "i"); | |||
| 59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | 60 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); |
| 60 | MODULE_PARM(verbose, "i"); | 61 | MODULE_PARM(verbose, "i"); |
| 61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | 62 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); |
| 63 | MODULE_PARM(test_no_idle_hz, "i"); | ||
| 64 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
| 65 | MODULE_PARM(shuffle_interval, "i"); | ||
| 66 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
| 62 | #define TORTURE_FLAG "rcutorture: " | 67 | #define TORTURE_FLAG "rcutorture: " |
| 63 | #define PRINTK_STRING(s) \ | 68 | #define PRINTK_STRING(s) \ |
| 64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 69 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) |
| @@ -73,6 +78,7 @@ static int nrealreaders; | |||
| 73 | static struct task_struct *writer_task; | 78 | static struct task_struct *writer_task; |
| 74 | static struct task_struct **reader_tasks; | 79 | static struct task_struct **reader_tasks; |
| 75 | static struct task_struct *stats_task; | 80 | static struct task_struct *stats_task; |
| 81 | static struct task_struct *shuffler_task; | ||
| 76 | 82 | ||
| 77 | #define RCU_TORTURE_PIPE_LEN 10 | 83 | #define RCU_TORTURE_PIPE_LEN 10 |
| 78 | 84 | ||
| @@ -103,7 +109,7 @@ atomic_t n_rcu_torture_error; | |||
| 103 | /* | 109 | /* |
| 104 | * Allocate an element from the rcu_tortures pool. | 110 | * Allocate an element from the rcu_tortures pool. |
| 105 | */ | 111 | */ |
| 106 | struct rcu_torture * | 112 | static struct rcu_torture * |
| 107 | rcu_torture_alloc(void) | 113 | rcu_torture_alloc(void) |
| 108 | { | 114 | { |
| 109 | struct list_head *p; | 115 | struct list_head *p; |
| @@ -376,12 +382,77 @@ rcu_torture_stats(void *arg) | |||
| 376 | return 0; | 382 | return 0; |
| 377 | } | 383 | } |
| 378 | 384 | ||
| 385 | static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ | ||
| 386 | |||
| 387 | /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case | ||
| 388 | * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. | ||
| 389 | */ | ||
| 390 | void rcu_torture_shuffle_tasks(void) | ||
| 391 | { | ||
| 392 | cpumask_t tmp_mask = CPU_MASK_ALL; | ||
| 393 | int i; | ||
| 394 | |||
| 395 | lock_cpu_hotplug(); | ||
| 396 | |||
| 397 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | ||
| 398 | if (num_online_cpus() == 1) { | ||
| 399 | unlock_cpu_hotplug(); | ||
| 400 | return; | ||
| 401 | } | ||
| 402 | |||
| 403 | if (rcu_idle_cpu != -1) | ||
| 404 | cpu_clear(rcu_idle_cpu, tmp_mask); | ||
| 405 | |||
| 406 | set_cpus_allowed(current, tmp_mask); | ||
| 407 | |||
| 408 | if (reader_tasks != NULL) { | ||
| 409 | for (i = 0; i < nrealreaders; i++) | ||
| 410 | if (reader_tasks[i]) | ||
| 411 | set_cpus_allowed(reader_tasks[i], tmp_mask); | ||
| 412 | } | ||
| 413 | |||
| 414 | if (writer_task) | ||
| 415 | set_cpus_allowed(writer_task, tmp_mask); | ||
| 416 | |||
| 417 | if (stats_task) | ||
| 418 | set_cpus_allowed(stats_task, tmp_mask); | ||
| 419 | |||
| 420 | if (rcu_idle_cpu == -1) | ||
| 421 | rcu_idle_cpu = num_online_cpus() - 1; | ||
| 422 | else | ||
| 423 | rcu_idle_cpu--; | ||
| 424 | |||
| 425 | unlock_cpu_hotplug(); | ||
| 426 | } | ||
| 427 | |||
| 428 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the | ||
| 429 | * system to become idle at a time and cut off its timer ticks. This is meant | ||
| 430 | * to test the support for such tickless idle CPU in RCU. | ||
| 431 | */ | ||
| 432 | static int | ||
| 433 | rcu_torture_shuffle(void *arg) | ||
| 434 | { | ||
| 435 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); | ||
| 436 | do { | ||
| 437 | schedule_timeout_interruptible(shuffle_interval * HZ); | ||
| 438 | rcu_torture_shuffle_tasks(); | ||
| 439 | } while (!kthread_should_stop()); | ||
| 440 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); | ||
| 441 | return 0; | ||
| 442 | } | ||
| 443 | |||
| 379 | static void | 444 | static void |
| 380 | rcu_torture_cleanup(void) | 445 | rcu_torture_cleanup(void) |
| 381 | { | 446 | { |
| 382 | int i; | 447 | int i; |
| 383 | 448 | ||
| 384 | fullstop = 1; | 449 | fullstop = 1; |
| 450 | if (shuffler_task != NULL) { | ||
| 451 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); | ||
| 452 | kthread_stop(shuffler_task); | ||
| 453 | } | ||
| 454 | shuffler_task = NULL; | ||
| 455 | |||
| 385 | if (writer_task != NULL) { | 456 | if (writer_task != NULL) { |
| 386 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | 457 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); |
| 387 | kthread_stop(writer_task); | 458 | kthread_stop(writer_task); |
| @@ -430,9 +501,11 @@ rcu_torture_init(void) | |||
| 430 | nrealreaders = nreaders; | 501 | nrealreaders = nreaders; |
| 431 | else | 502 | else |
| 432 | nrealreaders = 2 * num_online_cpus(); | 503 | nrealreaders = 2 * num_online_cpus(); |
| 433 | printk(KERN_ALERT TORTURE_FLAG | 504 | printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " |
| 434 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", | 505 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
| 435 | nrealreaders, stat_interval, verbose); | 506 | "shuffle_interval = %d\n", |
| 507 | nrealreaders, stat_interval, verbose, test_no_idle_hz, | ||
| 508 | shuffle_interval); | ||
| 436 | fullstop = 0; | 509 | fullstop = 0; |
| 437 | 510 | ||
| 438 | /* Set up the freelist. */ | 511 | /* Set up the freelist. */ |
| @@ -502,6 +575,18 @@ rcu_torture_init(void) | |||
| 502 | goto unwind; | 575 | goto unwind; |
| 503 | } | 576 | } |
| 504 | } | 577 | } |
| 578 | if (test_no_idle_hz) { | ||
| 579 | rcu_idle_cpu = num_online_cpus() - 1; | ||
| 580 | /* Create the shuffler thread */ | ||
| 581 | shuffler_task = kthread_run(rcu_torture_shuffle, NULL, | ||
| 582 | "rcu_torture_shuffle"); | ||
| 583 | if (IS_ERR(shuffler_task)) { | ||
| 584 | firsterr = PTR_ERR(shuffler_task); | ||
| 585 | VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); | ||
| 586 | shuffler_task = NULL; | ||
| 587 | goto unwind; | ||
| 588 | } | ||
| 589 | } | ||
| 505 | return 0; | 590 | return 0; |
| 506 | 591 | ||
| 507 | unwind: | 592 | unwind: |
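The rcutorture shuffler added above rotates one "forced idle" CPU through the machine: every shuffle_interval seconds it builds an affinity mask of all CPUs except rcu_idle_cpu, pins the torture tasks to that mask, and then steps rcu_idle_cpu down by one, passing through the -1 "no CPU excluded" state before wrapping back to the highest CPU. The userspace sketch below reproduces just the mask rotation, with plain bitmasks standing in for cpumask_t and no tasks actually re-pinned; it is an illustration of the rotation, not the kernel code.

```c
/* Userspace sketch of the rotation performed by rcu_torture_shuffle_tasks():
 * exclude one CPU per interval and rotate which CPU that is.  Plain
 * bitmasks stand in for cpumask_t; nothing is really re-pinned here. */
#include <stdio.h>

static int num_online = 4;	/* pretend four CPUs are online */
static int rcu_idle_cpu;	/* CPU currently nudged towards idle */

static unsigned long shuffle_once(void)
{
	unsigned long mask = (1UL << num_online) - 1;	/* all online CPUs */

	if (rcu_idle_cpu != -1)
		mask &= ~(1UL << rcu_idle_cpu);	/* let this one go idle */

	/* rotate: step down one CPU, wrapping through the -1 "none" case */
	if (rcu_idle_cpu == -1)
		rcu_idle_cpu = num_online - 1;
	else
		rcu_idle_cpu--;

	return mask;
}

int main(void)
{
	int i;

	rcu_idle_cpu = num_online - 1;	/* seeded as rcu_torture_init() does */
	for (i = 0; i < 8; i++) {
		unsigned long mask = shuffle_once();

		printf("interval %d: allowed mask 0x%lx, next idle cpu %d\n",
		       i, mask, rcu_idle_cpu);
	}
	return 0;
}
```

Run for a few intervals, each CPU takes a turn at being excluded, with one interval per cycle in which no CPU is excluded at all.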
diff --git a/kernel/resource.c b/kernel/resource.c index 92285d822de6..e3080fcc66a3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -464,7 +464,7 @@ struct resource * __request_region(struct resource *parent, unsigned long start, | |||
| 464 | 464 | ||
| 465 | EXPORT_SYMBOL(__request_region); | 465 | EXPORT_SYMBOL(__request_region); |
| 466 | 466 | ||
| 467 | int __deprecated __check_region(struct resource *parent, unsigned long start, unsigned long n) | 467 | int __check_region(struct resource *parent, unsigned long start, unsigned long n) |
| 468 | { | 468 | { |
| 469 | struct resource * res; | 469 | struct resource * res; |
| 470 | 470 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 6f46c94cc29e..c9dec2aa1976 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -27,12 +27,14 @@ | |||
| 27 | #include <linux/smp_lock.h> | 27 | #include <linux/smp_lock.h> |
| 28 | #include <asm/mmu_context.h> | 28 | #include <asm/mmu_context.h> |
| 29 | #include <linux/interrupt.h> | 29 | #include <linux/interrupt.h> |
| 30 | #include <linux/capability.h> | ||
| 30 | #include <linux/completion.h> | 31 | #include <linux/completion.h> |
| 31 | #include <linux/kernel_stat.h> | 32 | #include <linux/kernel_stat.h> |
| 32 | #include <linux/security.h> | 33 | #include <linux/security.h> |
| 33 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
| 34 | #include <linux/profile.h> | 35 | #include <linux/profile.h> |
| 35 | #include <linux/suspend.h> | 36 | #include <linux/suspend.h> |
| 37 | #include <linux/vmalloc.h> | ||
| 36 | #include <linux/blkdev.h> | 38 | #include <linux/blkdev.h> |
| 37 | #include <linux/delay.h> | 39 | #include <linux/delay.h> |
| 38 | #include <linux/smp.h> | 40 | #include <linux/smp.h> |
| @@ -176,6 +178,13 @@ static unsigned int task_timeslice(task_t *p) | |||
| 176 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | 178 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ |
| 177 | < (long long) (sd)->cache_hot_time) | 179 | < (long long) (sd)->cache_hot_time) |
| 178 | 180 | ||
| 181 | void __put_task_struct_cb(struct rcu_head *rhp) | ||
| 182 | { | ||
| 183 | __put_task_struct(container_of(rhp, struct task_struct, rcu)); | ||
| 184 | } | ||
| 185 | |||
| 186 | EXPORT_SYMBOL_GPL(__put_task_struct_cb); | ||
| 187 | |||
| 179 | /* | 188 | /* |
| 180 | * These are the runqueue data structures: | 189 | * These are the runqueue data structures: |
| 181 | */ | 190 | */ |
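
The __put_task_struct_cb() addition above is the callback half of freeing a task_struct through RCU: call_rcu() hands the embedded rcu_head to the callback, and container_of() recovers the enclosing object. A generic sketch of that pattern follows; struct foo and its functions are invented for illustration, only the rcu_head/container_of/call_rcu usage mirrors the patch.

    #include <linux/kernel.h>
    #include <linux/slab.h>
    #include <linux/rcupdate.h>

    struct foo {
        int data;
        struct rcu_head rcu;            /* embedded callback handle */
    };

    static void foo_free_rcu(struct rcu_head *rhp)
    {
        /* Recover the enclosing struct foo from its rcu_head member. */
        kfree(container_of(rhp, struct foo, rcu));
    }

    static void foo_release(struct foo *f)
    {
        /* Defer the kfree() until all current RCU readers have finished. */
        call_rcu(&f->rcu, foo_free_rcu);
    }
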
| @@ -1281,6 +1290,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
| 1281 | } | 1290 | } |
| 1282 | } | 1291 | } |
| 1283 | 1292 | ||
| 1293 | if (p->last_waker_cpu != this_cpu) | ||
| 1294 | goto out_set_cpu; | ||
| 1295 | |||
| 1284 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | 1296 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
| 1285 | goto out_set_cpu; | 1297 | goto out_set_cpu; |
| 1286 | 1298 | ||
| @@ -1351,6 +1363,8 @@ out_set_cpu: | |||
| 1351 | cpu = task_cpu(p); | 1363 | cpu = task_cpu(p); |
| 1352 | } | 1364 | } |
| 1353 | 1365 | ||
| 1366 | p->last_waker_cpu = this_cpu; | ||
| 1367 | |||
| 1354 | out_activate: | 1368 | out_activate: |
| 1355 | #endif /* CONFIG_SMP */ | 1369 | #endif /* CONFIG_SMP */ |
| 1356 | if (old_state == TASK_UNINTERRUPTIBLE) { | 1370 | if (old_state == TASK_UNINTERRUPTIBLE) { |
| @@ -1432,9 +1446,12 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
| 1432 | #ifdef CONFIG_SCHEDSTATS | 1446 | #ifdef CONFIG_SCHEDSTATS |
| 1433 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1447 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
| 1434 | #endif | 1448 | #endif |
| 1435 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1449 | #if defined(CONFIG_SMP) |
| 1450 | p->last_waker_cpu = cpu; | ||
| 1451 | #if defined(__ARCH_WANT_UNLOCKED_CTXSW) | ||
| 1436 | p->oncpu = 0; | 1452 | p->oncpu = 0; |
| 1437 | #endif | 1453 | #endif |
| 1454 | #endif | ||
| 1438 | #ifdef CONFIG_PREEMPT | 1455 | #ifdef CONFIG_PREEMPT |
| 1439 | /* Want to start with kernel preemption disabled. */ | 1456 | /* Want to start with kernel preemption disabled. */ |
| 1440 | task_thread_info(p)->preempt_count = 1; | 1457 | task_thread_info(p)->preempt_count = 1; |
| @@ -3972,12 +3989,12 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
| 3972 | * method, such as ACPI for e.g. | 3989 | * method, such as ACPI for e.g. |
| 3973 | */ | 3990 | */ |
| 3974 | 3991 | ||
| 3975 | cpumask_t cpu_present_map; | 3992 | cpumask_t cpu_present_map __read_mostly; |
| 3976 | EXPORT_SYMBOL(cpu_present_map); | 3993 | EXPORT_SYMBOL(cpu_present_map); |
| 3977 | 3994 | ||
| 3978 | #ifndef CONFIG_SMP | 3995 | #ifndef CONFIG_SMP |
| 3979 | cpumask_t cpu_online_map = CPU_MASK_ALL; | 3996 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; |
| 3980 | cpumask_t cpu_possible_map = CPU_MASK_ALL; | 3997 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; |
| 3981 | #endif | 3998 | #endif |
| 3982 | 3999 | ||
| 3983 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 4000 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
| @@ -4379,6 +4396,7 @@ void show_state(void) | |||
| 4379 | } while_each_thread(g, p); | 4396 | } while_each_thread(g, p); |
| 4380 | 4397 | ||
| 4381 | read_unlock(&tasklist_lock); | 4398 | read_unlock(&tasklist_lock); |
| 4399 | mutex_debug_show_all_locks(); | ||
| 4382 | } | 4400 | } |
| 4383 | 4401 | ||
| 4384 | /** | 4402 | /** |
| @@ -5073,7 +5091,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, | |||
| 5073 | 5091 | ||
| 5074 | #define SD_NODES_PER_DOMAIN 16 | 5092 | #define SD_NODES_PER_DOMAIN 16 |
| 5075 | 5093 | ||
| 5094 | /* | ||
| 5095 | * Self-tuning task migration cost measurement between source and target CPUs. | ||
| 5096 | * | ||
| 5097 | * This is done by measuring the cost of manipulating buffers of varying | ||
| 5098 | * sizes. For a given buffer-size here are the steps that are taken: | ||
| 5099 | * | ||
| 5100 | * 1) the source CPU reads+dirties a shared buffer | ||
| 5101 | * 2) the target CPU reads+dirties the same shared buffer | ||
| 5102 | * | ||
| 5103 | * We measure how long they take, in the following 4 scenarios: | ||
| 5104 | * | ||
| 5105 | * - source: CPU1, target: CPU2 | cost1 | ||
| 5106 | * - source: CPU2, target: CPU1 | cost2 | ||
| 5107 | * - source: CPU1, target: CPU1 | cost3 | ||
| 5108 | * - source: CPU2, target: CPU2 | cost4 | ||
| 5109 | * | ||
| 5110 | * We then calculate the cost1+cost2-cost3-cost4 difference - this is | ||
| 5111 | * the cost of migration. | ||
| 5112 | * | ||
| 5113 | * We then start off from a small buffer-size and iterate up to larger | ||
| 5114 | * buffer sizes, in 5% steps - measuring each buffer-size separately, and | ||
| 5115 | * doing a maximum search for the cost. (The maximum cost for a migration | ||
| 5116 | * normally occurs when the working set size is around the effective cache | ||
| 5117 | * size.) | ||
| 5118 | */ | ||
| 5119 | #define SEARCH_SCOPE 2 | ||
| 5120 | #define MIN_CACHE_SIZE (64*1024U) | ||
| 5121 | #define DEFAULT_CACHE_SIZE (5*1024*1024U) | ||
| 5122 | #define ITERATIONS 2 | ||
| 5123 | #define SIZE_THRESH 130 | ||
| 5124 | #define COST_THRESH 130 | ||
| 5125 | |||
| 5126 | /* | ||
| 5127 | * The migration cost is a function of 'domain distance'. Domain | ||
| 5128 | * distance is the number of steps a CPU has to iterate down its | ||
| 5129 | * domain tree to share a domain with the other CPU. The farther | ||
| 5130 | * two CPUs are from each other, the larger the distance gets. | ||
| 5131 | * | ||
| 5132 | * Note that we use the distance only to cache measurement results, | ||
| 5133 | * the distance value is not used numerically otherwise. When two | ||
| 5134 | * CPUs have the same distance it is assumed that the migration | ||
| 5135 | * cost is the same. (this is a simplification but quite practical) | ||
| 5136 | */ | ||
| 5137 | #define MAX_DOMAIN_DISTANCE 32 | ||
| 5138 | |||
| 5139 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = | ||
| 5140 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; | ||
| 5141 | |||
| 5142 | /* | ||
| 5143 | * Allow override of migration cost - in units of microseconds. | ||
| 5144 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost | ||
| 5145 | * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: | ||
| 5146 | */ | ||
| 5147 | static int __init migration_cost_setup(char *str) | ||
| 5148 | { | ||
| 5149 | int ints[MAX_DOMAIN_DISTANCE+1], i; | ||
| 5150 | |||
| 5151 | str = get_options(str, ARRAY_SIZE(ints), ints); | ||
| 5152 | |||
| 5153 | printk("#ints: %d\n", ints[0]); | ||
| 5154 | for (i = 1; i <= ints[0]; i++) { | ||
| 5155 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; | ||
| 5156 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); | ||
| 5157 | } | ||
| 5158 | return 1; | ||
| 5159 | } | ||
| 5160 | |||
| 5161 | __setup ("migration_cost=", migration_cost_setup); | ||
| 5162 | |||
| 5163 | /* | ||
| 5164 | * Global multiplier (divisor) for migration-cutoff values, | ||
| 5165 | * in percentiles. E.g. use a value of 150 to get 1.5 times | ||
| 5166 | * longer cache-hot cutoff times. | ||
| 5167 | * | ||
| 5168 | * (We scale it from 100 to 128 to make long long handling easier.) | ||
| 5169 | */ | ||
| 5170 | |||
| 5171 | #define MIGRATION_FACTOR_SCALE 128 | ||
| 5172 | |||
| 5173 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; | ||
| 5174 | |||
| 5175 | static int __init setup_migration_factor(char *str) | ||
| 5176 | { | ||
| 5177 | get_option(&str, &migration_factor); | ||
| 5178 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; | ||
| 5179 | return 1; | ||
| 5180 | } | ||
| 5181 | |||
| 5182 | __setup("migration_factor=", setup_migration_factor); | ||
| 5183 | |||
| 5184 | /* | ||
| 5185 | * Estimated distance of two CPUs, measured via the number of domains | ||
| 5186 | * we have to pass for the two CPUs to be in the same span: | ||
| 5187 | */ | ||
| 5188 | static unsigned long domain_distance(int cpu1, int cpu2) | ||
| 5189 | { | ||
| 5190 | unsigned long distance = 0; | ||
| 5191 | struct sched_domain *sd; | ||
| 5192 | |||
| 5193 | for_each_domain(cpu1, sd) { | ||
| 5194 | WARN_ON(!cpu_isset(cpu1, sd->span)); | ||
| 5195 | if (cpu_isset(cpu2, sd->span)) | ||
| 5196 | return distance; | ||
| 5197 | distance++; | ||
| 5198 | } | ||
| 5199 | if (distance >= MAX_DOMAIN_DISTANCE) { | ||
| 5200 | WARN_ON(1); | ||
| 5201 | distance = MAX_DOMAIN_DISTANCE-1; | ||
| 5202 | } | ||
| 5203 | |||
| 5204 | return distance; | ||
| 5205 | } | ||
| 5206 | |||
| 5207 | static unsigned int migration_debug; | ||
| 5208 | |||
| 5209 | static int __init setup_migration_debug(char *str) | ||
| 5210 | { | ||
| 5211 | get_option(&str, &migration_debug); | ||
| 5212 | return 1; | ||
| 5213 | } | ||
| 5214 | |||
| 5215 | __setup("migration_debug=", setup_migration_debug); | ||
| 5216 | |||
| 5217 | /* | ||
| 5218 | * Maximum cache-size that the scheduler should try to measure. | ||
| 5219 | * Architectures with larger caches should tune this up during | ||
| 5220 | * bootup. Gets used in the domain-setup code (i.e. during SMP | ||
| 5221 | * bootup). | ||
| 5222 | */ | ||
| 5223 | unsigned int max_cache_size; | ||
| 5224 | |||
| 5225 | static int __init setup_max_cache_size(char *str) | ||
| 5226 | { | ||
| 5227 | get_option(&str, &max_cache_size); | ||
| 5228 | return 1; | ||
| 5229 | } | ||
| 5230 | |||
| 5231 | __setup("max_cache_size=", setup_max_cache_size); | ||
| 5232 | |||
| 5233 | /* | ||
| 5234 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This | ||
| 5235 | * is the operation that is timed, so we try to generate unpredictable | ||
| 5236 | * cachemisses that still end up filling the L2 cache: | ||
| 5237 | */ | ||
| 5238 | static void touch_cache(void *__cache, unsigned long __size) | ||
| 5239 | { | ||
| 5240 | unsigned long size = __size/sizeof(long), chunk1 = size/3, | ||
| 5241 | chunk2 = 2*size/3; | ||
| 5242 | unsigned long *cache = __cache; | ||
| 5243 | int i; | ||
| 5244 | |||
| 5245 | for (i = 0; i < size/6; i += 8) { | ||
| 5246 | switch (i % 6) { | ||
| 5247 | case 0: cache[i]++; | ||
| 5248 | case 1: cache[size-1-i]++; | ||
| 5249 | case 2: cache[chunk1-i]++; | ||
| 5250 | case 3: cache[chunk1+i]++; | ||
| 5251 | case 4: cache[chunk2-i]++; | ||
| 5252 | case 5: cache[chunk2+i]++; | ||
| 5253 | } | ||
| 5254 | } | ||
| 5255 | } | ||
| 5256 | |||
| 5257 | /* | ||
| 5258 | * Measure the cache-cost of one task migration. Returns in units of nsec. | ||
| 5259 | */ | ||
| 5260 | static unsigned long long measure_one(void *cache, unsigned long size, | ||
| 5261 | int source, int target) | ||
| 5262 | { | ||
| 5263 | cpumask_t mask, saved_mask; | ||
| 5264 | unsigned long long t0, t1, t2, t3, cost; | ||
| 5265 | |||
| 5266 | saved_mask = current->cpus_allowed; | ||
| 5267 | |||
| 5268 | /* | ||
| 5269 | * Flush source caches to RAM and invalidate them: | ||
| 5270 | */ | ||
| 5271 | sched_cacheflush(); | ||
| 5272 | |||
| 5273 | /* | ||
| 5274 | * Migrate to the source CPU: | ||
| 5275 | */ | ||
| 5276 | mask = cpumask_of_cpu(source); | ||
| 5277 | set_cpus_allowed(current, mask); | ||
| 5278 | WARN_ON(smp_processor_id() != source); | ||
| 5279 | |||
| 5280 | /* | ||
| 5281 | * Dirty the working set: | ||
| 5282 | */ | ||
| 5283 | t0 = sched_clock(); | ||
| 5284 | touch_cache(cache, size); | ||
| 5285 | t1 = sched_clock(); | ||
| 5286 | |||
| 5287 | /* | ||
| 5288 | * Migrate to the target CPU, dirty the L2 cache and access | ||
| 5289 | * the shared buffer. (which represents the working set | ||
| 5290 | * of a migrated task.) | ||
| 5291 | */ | ||
| 5292 | mask = cpumask_of_cpu(target); | ||
| 5293 | set_cpus_allowed(current, mask); | ||
| 5294 | WARN_ON(smp_processor_id() != target); | ||
| 5295 | |||
| 5296 | t2 = sched_clock(); | ||
| 5297 | touch_cache(cache, size); | ||
| 5298 | t3 = sched_clock(); | ||
| 5299 | |||
| 5300 | cost = t1-t0 + t3-t2; | ||
| 5301 | |||
| 5302 | if (migration_debug >= 2) | ||
| 5303 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", | ||
| 5304 | source, target, t1-t0, t1-t0, t3-t2, cost); | ||
| 5305 | /* | ||
| 5306 | * Flush target caches to RAM and invalidate them: | ||
| 5307 | */ | ||
| 5308 | sched_cacheflush(); | ||
| 5309 | |||
| 5310 | set_cpus_allowed(current, saved_mask); | ||
| 5311 | |||
| 5312 | return cost; | ||
| 5313 | } | ||
| 5314 | |||
| 5315 | /* | ||
| 5316 | * Measure a series of task migrations and return the average | ||
| 5317 | * result. Since this code runs early during bootup the system | ||
| 5318 | * is 'undisturbed' and the average latency makes sense. | ||
| 5319 | * | ||
| 5320 | * The algorithm in essence auto-detects the relevant cache-size, | ||
| 5321 | * so it will properly detect different cachesizes for different | ||
| 5322 | * cache-hierarchies, depending on how the CPUs are connected. | ||
| 5323 | * | ||
| 5324 | * Architectures can prime the upper limit of the search range via | ||
| 5325 | * max_cache_size, otherwise the search range defaults to 20MB...64K. | ||
| 5326 | */ | ||
| 5327 | static unsigned long long | ||
| 5328 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | ||
| 5329 | { | ||
| 5330 | unsigned long long cost1, cost2; | ||
| 5331 | int i; | ||
| 5332 | |||
| 5333 | /* | ||
| 5334 | * Measure the migration cost of 'size' bytes, over an | ||
| 5335 | * average of 10 runs: | ||
| 5336 | * | ||
| 5337 | * (We perturb the cache size by a small (0..4k) | ||
| 5338 | * value to compensate size/alignment related artifacts. | ||
| 5339 | * We also subtract the cost of the operation done on | ||
| 5340 | * the same CPU.) | ||
| 5341 | */ | ||
| 5342 | cost1 = 0; | ||
| 5343 | |||
| 5344 | /* | ||
| 5345 | * dry run, to make sure we start off cache-cold on cpu1, | ||
| 5346 | * and to get any vmalloc pagefaults in advance: | ||
| 5347 | */ | ||
| 5348 | measure_one(cache, size, cpu1, cpu2); | ||
| 5349 | for (i = 0; i < ITERATIONS; i++) | ||
| 5350 | cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); | ||
| 5351 | |||
| 5352 | measure_one(cache, size, cpu2, cpu1); | ||
| 5353 | for (i = 0; i < ITERATIONS; i++) | ||
| 5354 | cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); | ||
| 5355 | |||
| 5356 | /* | ||
| 5357 | * (We measure the non-migrating [cached] cost on both | ||
| 5358 | * cpu1 and cpu2, to handle CPUs with different speeds) | ||
| 5359 | */ | ||
| 5360 | cost2 = 0; | ||
| 5361 | |||
| 5362 | measure_one(cache, size, cpu1, cpu1); | ||
| 5363 | for (i = 0; i < ITERATIONS; i++) | ||
| 5364 | cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); | ||
| 5365 | |||
| 5366 | measure_one(cache, size, cpu2, cpu2); | ||
| 5367 | for (i = 0; i < ITERATIONS; i++) | ||
| 5368 | cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); | ||
| 5369 | |||
| 5370 | /* | ||
| 5371 | * Get the per-iteration migration cost: | ||
| 5372 | */ | ||
| 5373 | do_div(cost1, 2*ITERATIONS); | ||
| 5374 | do_div(cost2, 2*ITERATIONS); | ||
| 5375 | |||
| 5376 | return cost1 - cost2; | ||
| 5377 | } | ||
| 5378 | |||
| 5379 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) | ||
| 5380 | { | ||
| 5381 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; | ||
| 5382 | unsigned int max_size, size, size_found = 0; | ||
| 5383 | long long cost = 0, prev_cost; | ||
| 5384 | void *cache; | ||
| 5385 | |||
| 5386 | /* | ||
| 5387 | * Search from max_cache_size*5 down to 64K - the real relevant | ||
| 5388 | * cachesize has to lie somewhere inbetween. | ||
| 5389 | */ | ||
| 5390 | if (max_cache_size) { | ||
| 5391 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
| 5392 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
| 5393 | } else { | ||
| 5394 | /* | ||
| 5395 | * Since we have no estimate of the relevant | ||
| 5396 | * search range, fall back to the defaults: | ||
| 5397 | */ | ||
| 5398 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; | ||
| 5399 | size = MIN_CACHE_SIZE; | ||
| 5400 | } | ||
| 5401 | |||
| 5402 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { | ||
| 5403 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); | ||
| 5404 | return 0; | ||
| 5405 | } | ||
| 5406 | |||
| 5407 | /* | ||
| 5408 | * Allocate the working set: | ||
| 5409 | */ | ||
| 5410 | cache = vmalloc(max_size); | ||
| 5411 | if (!cache) { | ||
| 5412 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | ||
| 5413 | return 1000000; // return 1 msec on very small boxen | ||
| 5414 | } | ||
| 5415 | |||
| 5416 | while (size <= max_size) { | ||
| 5417 | prev_cost = cost; | ||
| 5418 | cost = measure_cost(cpu1, cpu2, cache, size); | ||
| 5419 | |||
| 5420 | /* | ||
| 5421 | * Update the max: | ||
| 5422 | */ | ||
| 5423 | if (cost > 0) { | ||
| 5424 | if (max_cost < cost) { | ||
| 5425 | max_cost = cost; | ||
| 5426 | size_found = size; | ||
| 5427 | } | ||
| 5428 | } | ||
| 5429 | /* | ||
| 5430 | * Calculate average fluctuation, we use this to prevent | ||
| 5431 | * noise from triggering an early break out of the loop: | ||
| 5432 | */ | ||
| 5433 | fluct = abs(cost - prev_cost); | ||
| 5434 | avg_fluct = (avg_fluct + fluct)/2; | ||
| 5435 | |||
| 5436 | if (migration_debug) | ||
| 5437 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", | ||
| 5438 | cpu1, cpu2, size, | ||
| 5439 | (long)cost / 1000000, | ||
| 5440 | ((long)cost / 100000) % 10, | ||
| 5441 | (long)max_cost / 1000000, | ||
| 5442 | ((long)max_cost / 100000) % 10, | ||
| 5443 | domain_distance(cpu1, cpu2), | ||
| 5444 | cost, avg_fluct); | ||
| 5445 | |||
| 5446 | /* | ||
| 5447 | * If we iterated at least 20% past the previous maximum, | ||
| 5448 | * and the cost has dropped by more than 20% already, | ||
| 5449 | * (taking fluctuations into account) then we assume to | ||
| 5450 | * have found the maximum and break out of the loop early: | ||
| 5451 | */ | ||
| 5452 | if (size_found && (size*100 > size_found*SIZE_THRESH)) | ||
| 5453 | if (cost+avg_fluct <= 0 || | ||
| 5454 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { | ||
| 5455 | |||
| 5456 | if (migration_debug) | ||
| 5457 | printk("-> found max.\n"); | ||
| 5458 | break; | ||
| 5459 | } | ||
| 5460 | /* | ||
| 5461 | * Increase the cachesize in 5% steps: | ||
| 5462 | */ | ||
| 5463 | size = size * 20 / 19; | ||
| 5464 | } | ||
| 5465 | |||
| 5466 | if (migration_debug) | ||
| 5467 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", | ||
| 5468 | cpu1, cpu2, size_found, max_cost); | ||
| 5469 | |||
| 5470 | vfree(cache); | ||
| 5471 | |||
| 5472 | /* | ||
| 5473 | * A task is considered 'cache cold' if at least 2 times | ||
| 5474 | * the worst-case cost of migration has passed. | ||
| 5475 | * | ||
| 5476 | * (this limit is only listened to if the load-balancing | ||
| 5477 | * situation is 'nice' - if there is a large imbalance we | ||
| 5478 | * ignore it for the sake of CPU utilization and | ||
| 5479 | * processing fairness.) | ||
| 5480 | */ | ||
| 5481 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; | ||
| 5482 | } | ||
| 5483 | |||
| 5484 | static void calibrate_migration_costs(const cpumask_t *cpu_map) | ||
| 5485 | { | ||
| 5486 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); | ||
| 5487 | unsigned long j0, j1, distance, max_distance = 0; | ||
| 5488 | struct sched_domain *sd; | ||
| 5489 | |||
| 5490 | j0 = jiffies; | ||
| 5491 | |||
| 5492 | /* | ||
| 5493 | * First pass - calculate the cacheflush times: | ||
| 5494 | */ | ||
| 5495 | for_each_cpu_mask(cpu1, *cpu_map) { | ||
| 5496 | for_each_cpu_mask(cpu2, *cpu_map) { | ||
| 5497 | if (cpu1 == cpu2) | ||
| 5498 | continue; | ||
| 5499 | distance = domain_distance(cpu1, cpu2); | ||
| 5500 | max_distance = max(max_distance, distance); | ||
| 5501 | /* | ||
| 5502 | * No result cached yet? | ||
| 5503 | */ | ||
| 5504 | if (migration_cost[distance] == -1LL) | ||
| 5505 | migration_cost[distance] = | ||
| 5506 | measure_migration_cost(cpu1, cpu2); | ||
| 5507 | } | ||
| 5508 | } | ||
| 5509 | /* | ||
| 5510 | * Second pass - update the sched domain hierarchy with | ||
| 5511 | * the new cache-hot-time estimations: | ||
| 5512 | */ | ||
| 5513 | for_each_cpu_mask(cpu, *cpu_map) { | ||
| 5514 | distance = 0; | ||
| 5515 | for_each_domain(cpu, sd) { | ||
| 5516 | sd->cache_hot_time = migration_cost[distance]; | ||
| 5517 | distance++; | ||
| 5518 | } | ||
| 5519 | } | ||
| 5520 | /* | ||
| 5521 | * Print the matrix: | ||
| 5522 | */ | ||
| 5523 | if (migration_debug) | ||
| 5524 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", | ||
| 5525 | max_cache_size, | ||
| 5526 | #ifdef CONFIG_X86 | ||
| 5527 | cpu_khz/1000 | ||
| 5528 | #else | ||
| 5529 | -1 | ||
| 5530 | #endif | ||
| 5531 | ); | ||
| 5532 | printk("migration_cost="); | ||
| 5533 | for (distance = 0; distance <= max_distance; distance++) { | ||
| 5534 | if (distance) | ||
| 5535 | printk(","); | ||
| 5536 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
| 5537 | } | ||
| 5538 | printk("\n"); | ||
| 5539 | j1 = jiffies; | ||
| 5540 | if (migration_debug) | ||
| 5541 | printk("migration: %ld seconds\n", (j1-j0)/HZ); | ||
| 5542 | |||
| 5543 | /* | ||
| 5544 | * Move back to the original CPU. NUMA-Q gets confused | ||
| 5545 | * if we migrate to another quad during bootup. | ||
| 5546 | */ | ||
| 5547 | if (raw_smp_processor_id() != orig_cpu) { | ||
| 5548 | cpumask_t mask = cpumask_of_cpu(orig_cpu), | ||
| 5549 | saved_mask = current->cpus_allowed; | ||
| 5550 | |||
| 5551 | set_cpus_allowed(current, mask); | ||
| 5552 | set_cpus_allowed(current, saved_mask); | ||
| 5553 | } | ||
| 5554 | } | ||
| 5555 | |||
| 5076 | #ifdef CONFIG_NUMA | 5556 | #ifdef CONFIG_NUMA |
| 5557 | |||
| 5077 | /** | 5558 | /** |
| 5078 | * find_next_best_node - find the next node to include in a sched_domain | 5559 | * find_next_best_node - find the next node to include in a sched_domain |
| 5079 | * @node: node whose sched_domain we're building | 5560 | * @node: node whose sched_domain we're building |
| @@ -5439,6 +5920,10 @@ next_sg: | |||
| 5439 | #endif | 5920 | #endif |
| 5440 | cpu_attach_domain(sd, i); | 5921 | cpu_attach_domain(sd, i); |
| 5441 | } | 5922 | } |
| 5923 | /* | ||
| 5924 | * Tune cache-hot values: | ||
| 5925 | */ | ||
| 5926 | calibrate_migration_costs(cpu_map); | ||
| 5442 | } | 5927 | } |
| 5443 | /* | 5928 | /* |
| 5444 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 5929 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
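
Putting numbers to the measurement added above: measure_cost() averages the cross-CPU timings into cost1 and the same-CPU timings into cost2 and returns their difference, and measure_migration_cost() turns the worst case into a cache-hot cutoff of twice the maximum cost, scaled by migration_factor. The standalone program below replays that arithmetic for a single buffer size; the nanosecond values are invented, not measurements.

    #include <stdio.h>

    #define ITERATIONS              2
    #define MIGRATION_FACTOR_SCALE  128

    int main(void)
    {
        /* Invented per-run timings in nanoseconds, 2*ITERATIONS runs each. */
        unsigned long long cross[2*ITERATIONS] = { 8200000, 8100000, 8300000, 8150000 };
        unsigned long long same[2*ITERATIONS]  = { 3100000, 3050000, 3000000, 3080000 };
        unsigned long long cost1 = 0, cost2 = 0, cost, cutoff;
        unsigned int migration_factor = MIGRATION_FACTOR_SCALE;  /* default: x1.0 */
        int i;

        for (i = 0; i < 2*ITERATIONS; i++) {
            cost1 += cross[i];          /* source != target runs */
            cost2 += same[i];           /* source == target runs */
        }
        cost1 /= 2*ITERATIONS;          /* per-iteration averages (do_div() in the kernel) */
        cost2 /= 2*ITERATIONS;

        cost = cost1 - cost2;           /* what measure_cost() returns */
        cutoff = 2 * cost * migration_factor / MIGRATION_FACTOR_SCALE;

        printf("migration cost %llu ns, cache-hot cutoff %llu ns\n", cost, cutoff);
        return 0;   /* prints 5130000 ns and 10260000 ns with the numbers above */
    }
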
diff --git a/kernel/signal.c b/kernel/signal.c index d7611f189ef7..1da2e74beb97 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/posix-timers.h> | 25 | #include <linux/posix-timers.h> |
| 26 | #include <linux/signal.h> | 26 | #include <linux/signal.h> |
| 27 | #include <linux/audit.h> | 27 | #include <linux/audit.h> |
| 28 | #include <linux/capability.h> | ||
| 28 | #include <asm/param.h> | 29 | #include <asm/param.h> |
| 29 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
| 30 | #include <asm/unistd.h> | 31 | #include <asm/unistd.h> |
| @@ -329,13 +330,20 @@ void __exit_sighand(struct task_struct *tsk) | |||
| 329 | /* Ok, we're done with the signal handlers */ | 330 | /* Ok, we're done with the signal handlers */ |
| 330 | tsk->sighand = NULL; | 331 | tsk->sighand = NULL; |
| 331 | if (atomic_dec_and_test(&sighand->count)) | 332 | if (atomic_dec_and_test(&sighand->count)) |
| 332 | kmem_cache_free(sighand_cachep, sighand); | 333 | sighand_free(sighand); |
| 333 | } | 334 | } |
| 334 | 335 | ||
| 335 | void exit_sighand(struct task_struct *tsk) | 336 | void exit_sighand(struct task_struct *tsk) |
| 336 | { | 337 | { |
| 337 | write_lock_irq(&tasklist_lock); | 338 | write_lock_irq(&tasklist_lock); |
| 338 | __exit_sighand(tsk); | 339 | rcu_read_lock(); |
| 340 | if (tsk->sighand != NULL) { | ||
| 341 | struct sighand_struct *sighand = rcu_dereference(tsk->sighand); | ||
| 342 | spin_lock(&sighand->siglock); | ||
| 343 | __exit_sighand(tsk); | ||
| 344 | spin_unlock(&sighand->siglock); | ||
| 345 | } | ||
| 346 | rcu_read_unlock(); | ||
| 339 | write_unlock_irq(&tasklist_lock); | 347 | write_unlock_irq(&tasklist_lock); |
| 340 | } | 348 | } |
| 341 | 349 | ||
| @@ -345,19 +353,20 @@ void exit_sighand(struct task_struct *tsk) | |||
| 345 | void __exit_signal(struct task_struct *tsk) | 353 | void __exit_signal(struct task_struct *tsk) |
| 346 | { | 354 | { |
| 347 | struct signal_struct * sig = tsk->signal; | 355 | struct signal_struct * sig = tsk->signal; |
| 348 | struct sighand_struct * sighand = tsk->sighand; | 356 | struct sighand_struct * sighand; |
| 349 | 357 | ||
| 350 | if (!sig) | 358 | if (!sig) |
| 351 | BUG(); | 359 | BUG(); |
| 352 | if (!atomic_read(&sig->count)) | 360 | if (!atomic_read(&sig->count)) |
| 353 | BUG(); | 361 | BUG(); |
| 362 | rcu_read_lock(); | ||
| 363 | sighand = rcu_dereference(tsk->sighand); | ||
| 354 | spin_lock(&sighand->siglock); | 364 | spin_lock(&sighand->siglock); |
| 355 | posix_cpu_timers_exit(tsk); | 365 | posix_cpu_timers_exit(tsk); |
| 356 | if (atomic_dec_and_test(&sig->count)) { | 366 | if (atomic_dec_and_test(&sig->count)) { |
| 357 | posix_cpu_timers_exit_group(tsk); | 367 | posix_cpu_timers_exit_group(tsk); |
| 358 | if (tsk == sig->curr_target) | ||
| 359 | sig->curr_target = next_thread(tsk); | ||
| 360 | tsk->signal = NULL; | 368 | tsk->signal = NULL; |
| 369 | __exit_sighand(tsk); | ||
| 361 | spin_unlock(&sighand->siglock); | 370 | spin_unlock(&sighand->siglock); |
| 362 | flush_sigqueue(&sig->shared_pending); | 371 | flush_sigqueue(&sig->shared_pending); |
| 363 | } else { | 372 | } else { |
| @@ -389,9 +398,11 @@ void __exit_signal(struct task_struct *tsk) | |||
| 389 | sig->nvcsw += tsk->nvcsw; | 398 | sig->nvcsw += tsk->nvcsw; |
| 390 | sig->nivcsw += tsk->nivcsw; | 399 | sig->nivcsw += tsk->nivcsw; |
| 391 | sig->sched_time += tsk->sched_time; | 400 | sig->sched_time += tsk->sched_time; |
| 401 | __exit_sighand(tsk); | ||
| 392 | spin_unlock(&sighand->siglock); | 402 | spin_unlock(&sighand->siglock); |
| 393 | sig = NULL; /* Marker for below. */ | 403 | sig = NULL; /* Marker for below. */ |
| 394 | } | 404 | } |
| 405 | rcu_read_unlock(); | ||
| 395 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); | 406 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); |
| 396 | flush_sigqueue(&tsk->pending); | 407 | flush_sigqueue(&tsk->pending); |
| 397 | if (sig) { | 408 | if (sig) { |
| @@ -613,6 +624,33 @@ void signal_wake_up(struct task_struct *t, int resume) | |||
| 613 | * Returns 1 if any signals were found. | 624 | * Returns 1 if any signals were found. |
| 614 | * | 625 | * |
| 615 | * All callers must be holding the siglock. | 626 | * All callers must be holding the siglock. |
| 627 | * | ||
| 628 | * This version takes a sigset mask and looks at all signals, | ||
| 629 | * not just those in the first mask word. | ||
| 630 | */ | ||
| 631 | static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) | ||
| 632 | { | ||
| 633 | struct sigqueue *q, *n; | ||
| 634 | sigset_t m; | ||
| 635 | |||
| 636 | sigandsets(&m, mask, &s->signal); | ||
| 637 | if (sigisemptyset(&m)) | ||
| 638 | return 0; | ||
| 639 | |||
| 640 | signandsets(&s->signal, &s->signal, mask); | ||
| 641 | list_for_each_entry_safe(q, n, &s->list, list) { | ||
| 642 | if (sigismember(mask, q->info.si_signo)) { | ||
| 643 | list_del_init(&q->list); | ||
| 644 | __sigqueue_free(q); | ||
| 645 | } | ||
| 646 | } | ||
| 647 | return 1; | ||
| 648 | } | ||
| 649 | /* | ||
| 650 | * Remove signals in mask from the pending set and queue. | ||
| 651 | * Returns 1 if any signals were found. | ||
| 652 | * | ||
| 653 | * All callers must be holding the siglock. | ||
| 616 | */ | 654 | */ |
| 617 | static int rm_from_queue(unsigned long mask, struct sigpending *s) | 655 | static int rm_from_queue(unsigned long mask, struct sigpending *s) |
| 618 | { | 656 | { |
| @@ -1080,18 +1118,29 @@ void zap_other_threads(struct task_struct *p) | |||
| 1080 | } | 1118 | } |
| 1081 | 1119 | ||
| 1082 | /* | 1120 | /* |
| 1083 | * Must be called with the tasklist_lock held for reading! | 1121 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. |
| 1084 | */ | 1122 | */ |
| 1085 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | 1123 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) |
| 1086 | { | 1124 | { |
| 1087 | unsigned long flags; | 1125 | unsigned long flags; |
| 1126 | struct sighand_struct *sp; | ||
| 1088 | int ret; | 1127 | int ret; |
| 1089 | 1128 | ||
| 1129 | retry: | ||
| 1090 | ret = check_kill_permission(sig, info, p); | 1130 | ret = check_kill_permission(sig, info, p); |
| 1091 | if (!ret && sig && p->sighand) { | 1131 | if (!ret && sig && (sp = rcu_dereference(p->sighand))) { |
| 1092 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1132 | spin_lock_irqsave(&sp->siglock, flags); |
| 1133 | if (p->sighand != sp) { | ||
| 1134 | spin_unlock_irqrestore(&sp->siglock, flags); | ||
| 1135 | goto retry; | ||
| 1136 | } | ||
| 1137 | if ((atomic_read(&sp->count) == 0) || | ||
| 1138 | (atomic_read(&p->usage) == 0)) { | ||
| 1139 | spin_unlock_irqrestore(&sp->siglock, flags); | ||
| 1140 | return -ESRCH; | ||
| 1141 | } | ||
| 1093 | ret = __group_send_sig_info(sig, info, p); | 1142 | ret = __group_send_sig_info(sig, info, p); |
| 1094 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1143 | spin_unlock_irqrestore(&sp->siglock, flags); |
| 1095 | } | 1144 | } |
| 1096 | 1145 | ||
| 1097 | return ret; | 1146 | return ret; |
| @@ -1136,14 +1185,21 @@ int | |||
| 1136 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) | 1185 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) |
| 1137 | { | 1186 | { |
| 1138 | int error; | 1187 | int error; |
| 1188 | int acquired_tasklist_lock = 0; | ||
| 1139 | struct task_struct *p; | 1189 | struct task_struct *p; |
| 1140 | 1190 | ||
| 1141 | read_lock(&tasklist_lock); | 1191 | rcu_read_lock(); |
| 1192 | if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { | ||
| 1193 | read_lock(&tasklist_lock); | ||
| 1194 | acquired_tasklist_lock = 1; | ||
| 1195 | } | ||
| 1142 | p = find_task_by_pid(pid); | 1196 | p = find_task_by_pid(pid); |
| 1143 | error = -ESRCH; | 1197 | error = -ESRCH; |
| 1144 | if (p) | 1198 | if (p) |
| 1145 | error = group_send_sig_info(sig, info, p); | 1199 | error = group_send_sig_info(sig, info, p); |
| 1146 | read_unlock(&tasklist_lock); | 1200 | if (unlikely(acquired_tasklist_lock)) |
| 1201 | read_unlock(&tasklist_lock); | ||
| 1202 | rcu_read_unlock(); | ||
| 1147 | return error; | 1203 | return error; |
| 1148 | } | 1204 | } |
| 1149 | 1205 | ||
| @@ -1163,8 +1219,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | |||
| 1163 | ret = -ESRCH; | 1219 | ret = -ESRCH; |
| 1164 | goto out_unlock; | 1220 | goto out_unlock; |
| 1165 | } | 1221 | } |
| 1166 | if ((!info || ((unsigned long)info != 1 && | 1222 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
| 1167 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
| 1168 | && (euid != p->suid) && (euid != p->uid) | 1223 | && (euid != p->suid) && (euid != p->uid) |
| 1169 | && (uid != p->suid) && (uid != p->uid)) { | 1224 | && (uid != p->suid) && (uid != p->uid)) { |
| 1170 | ret = -EPERM; | 1225 | ret = -EPERM; |
| @@ -1355,16 +1410,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
| 1355 | { | 1410 | { |
| 1356 | unsigned long flags; | 1411 | unsigned long flags; |
| 1357 | int ret = 0; | 1412 | int ret = 0; |
| 1413 | struct sighand_struct *sh; | ||
| 1358 | 1414 | ||
| 1359 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1415 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
| 1360 | read_lock(&tasklist_lock); | 1416 | |
| 1417 | /* | ||
| 1418 | * The rcu based delayed sighand destroy makes it possible to | ||
| 1419 | * run this without tasklist lock held. The task struct itself | ||
| 1420 | * cannot go away as create_timer did get_task_struct(). | ||
| 1421 | * | ||
| 1422 | * We return -1, when the task is marked exiting, so | ||
| 1423 | * posix_timer_event can redirect it to the group leader | ||
| 1424 | */ | ||
| 1425 | rcu_read_lock(); | ||
| 1361 | 1426 | ||
| 1362 | if (unlikely(p->flags & PF_EXITING)) { | 1427 | if (unlikely(p->flags & PF_EXITING)) { |
| 1363 | ret = -1; | 1428 | ret = -1; |
| 1364 | goto out_err; | 1429 | goto out_err; |
| 1365 | } | 1430 | } |
| 1366 | 1431 | ||
| 1367 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1432 | retry: |
| 1433 | sh = rcu_dereference(p->sighand); | ||
| 1434 | |||
| 1435 | spin_lock_irqsave(&sh->siglock, flags); | ||
| 1436 | if (p->sighand != sh) { | ||
| 1437 | /* We raced with exec() in a multithreaded process... */ | ||
| 1438 | spin_unlock_irqrestore(&sh->siglock, flags); | ||
| 1439 | goto retry; | ||
| 1440 | } | ||
| 1441 | |||
| 1442 | /* | ||
| 1443 | * We do the check here again to handle the following scenario: | ||
| 1444 | * | ||
| 1445 | * CPU 0 CPU 1 | ||
| 1446 | * send_sigqueue | ||
| 1447 | * check PF_EXITING | ||
| 1448 | * interrupt exit code running | ||
| 1449 | * __exit_signal | ||
| 1450 | * lock sighand->siglock | ||
| 1451 | * unlock sighand->siglock | ||
| 1452 | * lock sh->siglock | ||
| 1453 | * add(tsk->pending) flush_sigqueue(tsk->pending) | ||
| 1454 | * | ||
| 1455 | */ | ||
| 1456 | |||
| 1457 | if (unlikely(p->flags & PF_EXITING)) { | ||
| 1458 | ret = -1; | ||
| 1459 | goto out; | ||
| 1460 | } | ||
| 1368 | 1461 | ||
| 1369 | if (unlikely(!list_empty(&q->list))) { | 1462 | if (unlikely(!list_empty(&q->list))) { |
| 1370 | /* | 1463 | /* |
| @@ -1388,9 +1481,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
| 1388 | signal_wake_up(p, sig == SIGKILL); | 1481 | signal_wake_up(p, sig == SIGKILL); |
| 1389 | 1482 | ||
| 1390 | out: | 1483 | out: |
| 1391 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1484 | spin_unlock_irqrestore(&sh->siglock, flags); |
| 1392 | out_err: | 1485 | out_err: |
| 1393 | read_unlock(&tasklist_lock); | 1486 | rcu_read_unlock(); |
| 1394 | 1487 | ||
| 1395 | return ret; | 1488 | return ret; |
| 1396 | } | 1489 | } |
| @@ -1402,7 +1495,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
| 1402 | int ret = 0; | 1495 | int ret = 0; |
| 1403 | 1496 | ||
| 1404 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1497 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
| 1498 | |||
| 1405 | read_lock(&tasklist_lock); | 1499 | read_lock(&tasklist_lock); |
| 1500 | /* Since it_lock is held, p->sighand cannot be NULL. */ | ||
| 1406 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1501 | spin_lock_irqsave(&p->sighand->siglock, flags); |
| 1407 | handle_stop_signal(sig, p); | 1502 | handle_stop_signal(sig, p); |
| 1408 | 1503 | ||
| @@ -1436,7 +1531,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
| 1436 | out: | 1531 | out: |
| 1437 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1532 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
| 1438 | read_unlock(&tasklist_lock); | 1533 | read_unlock(&tasklist_lock); |
| 1439 | return(ret); | 1534 | return ret; |
| 1440 | } | 1535 | } |
| 1441 | 1536 | ||
| 1442 | /* | 1537 | /* |
| @@ -2338,6 +2433,7 @@ int | |||
| 2338 | do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | 2433 | do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) |
| 2339 | { | 2434 | { |
| 2340 | struct k_sigaction *k; | 2435 | struct k_sigaction *k; |
| 2436 | sigset_t mask; | ||
| 2341 | 2437 | ||
| 2342 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) | 2438 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) |
| 2343 | return -EINVAL; | 2439 | return -EINVAL; |
| @@ -2385,9 +2481,11 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | |||
| 2385 | *k = *act; | 2481 | *k = *act; |
| 2386 | sigdelsetmask(&k->sa.sa_mask, | 2482 | sigdelsetmask(&k->sa.sa_mask, |
| 2387 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | 2483 | sigmask(SIGKILL) | sigmask(SIGSTOP)); |
| 2388 | rm_from_queue(sigmask(sig), &t->signal->shared_pending); | 2484 | sigemptyset(&mask); |
| 2485 | sigaddset(&mask, sig); | ||
| 2486 | rm_from_queue_full(&mask, &t->signal->shared_pending); | ||
| 2389 | do { | 2487 | do { |
| 2390 | rm_from_queue(sigmask(sig), &t->pending); | 2488 | rm_from_queue_full(&mask, &t->pending); |
| 2391 | recalc_sigpending_tsk(t); | 2489 | recalc_sigpending_tsk(t); |
| 2392 | t = next_thread(t); | 2490 | t = next_thread(t); |
| 2393 | } while (t != current); | 2491 | } while (t != current); |
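
The signal.c hunks above all rely on the same lock-free lookup pattern: dereference tsk->sighand under rcu_read_lock(), take its siglock, then re-check that the pointer has not been swapped by a concurrent exec(), retrying if it has. Condensed into one fragment, with the actual signal work elided:

        struct sighand_struct *sighand;
        unsigned long flags;

        rcu_read_lock();
    retry:
        sighand = rcu_dereference(tsk->sighand);    /* freed only after an RCU grace period */
        spin_lock_irqsave(&sighand->siglock, flags);
        if (tsk->sighand != sighand) {
            /* raced with exec() replacing the sighand - drop the lock and retry */
            spin_unlock_irqrestore(&sighand->siglock, flags);
            goto retry;
        }
        /* ... queue or flush signals under sighand->siglock ... */
        spin_unlock_irqrestore(&sighand->siglock, flags);
        rcu_read_unlock();
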
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b3d4dc858e35..dcfb5d731466 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -87,13 +87,9 @@ static int stop_machine(void) | |||
| 87 | { | 87 | { |
| 88 | int i, ret = 0; | 88 | int i, ret = 0; |
| 89 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 89 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
| 90 | mm_segment_t old_fs = get_fs(); | ||
| 91 | 90 | ||
| 92 | /* One high-prio thread per cpu. We'll do this one. */ | 91 | /* One high-prio thread per cpu. We'll do this one. */ |
| 93 | set_fs(KERNEL_DS); | 92 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
| 94 | sys_sched_setscheduler(current->pid, SCHED_FIFO, | ||
| 95 | (struct sched_param __user *)¶m); | ||
| 96 | set_fs(old_fs); | ||
| 97 | 93 | ||
| 98 | atomic_set(&stopmachine_thread_ack, 0); | 94 | atomic_set(&stopmachine_thread_ack, 0); |
| 99 | stopmachine_num_threads = 0; | 95 | stopmachine_num_threads = 0; |
diff --git a/kernel/sys.c b/kernel/sys.c index eecf84526afe..d09cac23fdfd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
| 20 | #include <linux/kexec.h> | 20 | #include <linux/kexec.h> |
| 21 | #include <linux/workqueue.h> | 21 | #include <linux/workqueue.h> |
| 22 | #include <linux/capability.h> | ||
| 22 | #include <linux/device.h> | 23 | #include <linux/device.h> |
| 23 | #include <linux/key.h> | 24 | #include <linux/key.h> |
| 24 | #include <linux/times.h> | 25 | #include <linux/times.h> |
| @@ -223,6 +224,18 @@ int unregister_reboot_notifier(struct notifier_block * nb) | |||
| 223 | 224 | ||
| 224 | EXPORT_SYMBOL(unregister_reboot_notifier); | 225 | EXPORT_SYMBOL(unregister_reboot_notifier); |
| 225 | 226 | ||
| 227 | #ifndef CONFIG_SECURITY | ||
| 228 | int capable(int cap) | ||
| 229 | { | ||
| 230 | if (cap_raised(current->cap_effective, cap)) { | ||
| 231 | current->flags |= PF_SUPERPRIV; | ||
| 232 | return 1; | ||
| 233 | } | ||
| 234 | return 0; | ||
| 235 | } | ||
| 236 | EXPORT_SYMBOL(capable); | ||
| 237 | #endif | ||
| 238 | |||
| 226 | static int set_one_prio(struct task_struct *p, int niceval, int error) | 239 | static int set_one_prio(struct task_struct *p, int niceval, int error) |
| 227 | { | 240 | { |
| 228 | int no_nice; | 241 | int no_nice; |
| @@ -489,6 +502,12 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
| 489 | magic2 != LINUX_REBOOT_MAGIC2C)) | 502 | magic2 != LINUX_REBOOT_MAGIC2C)) |
| 490 | return -EINVAL; | 503 | return -EINVAL; |
| 491 | 504 | ||
| 505 | /* Instead of trying to make the power_off code look like | ||
| 506 | * halt when pm_power_off is not set do it the easy way. | ||
| 507 | */ | ||
| 508 | if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) | ||
| 509 | cmd = LINUX_REBOOT_CMD_HALT; | ||
| 510 | |||
| 492 | lock_kernel(); | 511 | lock_kernel(); |
| 493 | switch (cmd) { | 512 | switch (cmd) { |
| 494 | case LINUX_REBOOT_CMD_RESTART: | 513 | case LINUX_REBOOT_CMD_RESTART: |
| @@ -1084,10 +1103,11 @@ asmlinkage long sys_times(struct tms __user * tbuf) | |||
| 1084 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | 1103 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) |
| 1085 | { | 1104 | { |
| 1086 | struct task_struct *p; | 1105 | struct task_struct *p; |
| 1106 | struct task_struct *group_leader = current->group_leader; | ||
| 1087 | int err = -EINVAL; | 1107 | int err = -EINVAL; |
| 1088 | 1108 | ||
| 1089 | if (!pid) | 1109 | if (!pid) |
| 1090 | pid = current->pid; | 1110 | pid = group_leader->pid; |
| 1091 | if (!pgid) | 1111 | if (!pgid) |
| 1092 | pgid = pid; | 1112 | pgid = pid; |
| 1093 | if (pgid < 0) | 1113 | if (pgid < 0) |
| @@ -1107,16 +1127,16 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
| 1107 | if (!thread_group_leader(p)) | 1127 | if (!thread_group_leader(p)) |
| 1108 | goto out; | 1128 | goto out; |
| 1109 | 1129 | ||
| 1110 | if (p->parent == current || p->real_parent == current) { | 1130 | if (p->real_parent == group_leader) { |
| 1111 | err = -EPERM; | 1131 | err = -EPERM; |
| 1112 | if (p->signal->session != current->signal->session) | 1132 | if (p->signal->session != group_leader->signal->session) |
| 1113 | goto out; | 1133 | goto out; |
| 1114 | err = -EACCES; | 1134 | err = -EACCES; |
| 1115 | if (p->did_exec) | 1135 | if (p->did_exec) |
| 1116 | goto out; | 1136 | goto out; |
| 1117 | } else { | 1137 | } else { |
| 1118 | err = -ESRCH; | 1138 | err = -ESRCH; |
| 1119 | if (p != current) | 1139 | if (p != group_leader) |
| 1120 | goto out; | 1140 | goto out; |
| 1121 | } | 1141 | } |
| 1122 | 1142 | ||
| @@ -1128,7 +1148,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
| 1128 | struct task_struct *p; | 1148 | struct task_struct *p; |
| 1129 | 1149 | ||
| 1130 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { | 1150 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { |
| 1131 | if (p->signal->session == current->signal->session) | 1151 | if (p->signal->session == group_leader->signal->session) |
| 1132 | goto ok_pgid; | 1152 | goto ok_pgid; |
| 1133 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); | 1153 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); |
| 1134 | goto out; | 1154 | goto out; |
| @@ -1208,24 +1228,22 @@ asmlinkage long sys_getsid(pid_t pid) | |||
| 1208 | 1228 | ||
| 1209 | asmlinkage long sys_setsid(void) | 1229 | asmlinkage long sys_setsid(void) |
| 1210 | { | 1230 | { |
| 1231 | struct task_struct *group_leader = current->group_leader; | ||
| 1211 | struct pid *pid; | 1232 | struct pid *pid; |
| 1212 | int err = -EPERM; | 1233 | int err = -EPERM; |
| 1213 | 1234 | ||
| 1214 | if (!thread_group_leader(current)) | ||
| 1215 | return -EINVAL; | ||
| 1216 | |||
| 1217 | down(&tty_sem); | 1235 | down(&tty_sem); |
| 1218 | write_lock_irq(&tasklist_lock); | 1236 | write_lock_irq(&tasklist_lock); |
| 1219 | 1237 | ||
| 1220 | pid = find_pid(PIDTYPE_PGID, current->pid); | 1238 | pid = find_pid(PIDTYPE_PGID, group_leader->pid); |
| 1221 | if (pid) | 1239 | if (pid) |
| 1222 | goto out; | 1240 | goto out; |
| 1223 | 1241 | ||
| 1224 | current->signal->leader = 1; | 1242 | group_leader->signal->leader = 1; |
| 1225 | __set_special_pids(current->pid, current->pid); | 1243 | __set_special_pids(group_leader->pid, group_leader->pid); |
| 1226 | current->signal->tty = NULL; | 1244 | group_leader->signal->tty = NULL; |
| 1227 | current->signal->tty_old_pgrp = 0; | 1245 | group_leader->signal->tty_old_pgrp = 0; |
| 1228 | err = process_group(current); | 1246 | err = process_group(group_leader); |
| 1229 | out: | 1247 | out: |
| 1230 | write_unlock_irq(&tasklist_lock); | 1248 | write_unlock_irq(&tasklist_lock); |
| 1231 | up(&tty_sem); | 1249 | up(&tty_sem); |
| @@ -1687,7 +1705,10 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1687 | if (unlikely(!p->signal)) | 1705 | if (unlikely(!p->signal)) |
| 1688 | return; | 1706 | return; |
| 1689 | 1707 | ||
| 1708 | utime = stime = cputime_zero; | ||
| 1709 | |||
| 1690 | switch (who) { | 1710 | switch (who) { |
| 1711 | case RUSAGE_BOTH: | ||
| 1691 | case RUSAGE_CHILDREN: | 1712 | case RUSAGE_CHILDREN: |
| 1692 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1713 | spin_lock_irqsave(&p->sighand->siglock, flags); |
| 1693 | utime = p->signal->cutime; | 1714 | utime = p->signal->cutime; |
| @@ -1697,22 +1718,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1697 | r->ru_minflt = p->signal->cmin_flt; | 1718 | r->ru_minflt = p->signal->cmin_flt; |
| 1698 | r->ru_majflt = p->signal->cmaj_flt; | 1719 | r->ru_majflt = p->signal->cmaj_flt; |
| 1699 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1720 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
| 1700 | cputime_to_timeval(utime, &r->ru_utime); | 1721 | |
| 1701 | cputime_to_timeval(stime, &r->ru_stime); | 1722 | if (who == RUSAGE_CHILDREN) |
| 1702 | break; | 1723 | break; |
| 1724 | |||
| 1703 | case RUSAGE_SELF: | 1725 | case RUSAGE_SELF: |
| 1704 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
| 1705 | utime = stime = cputime_zero; | ||
| 1706 | goto sum_group; | ||
| 1707 | case RUSAGE_BOTH: | ||
| 1708 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
| 1709 | utime = p->signal->cutime; | ||
| 1710 | stime = p->signal->cstime; | ||
| 1711 | r->ru_nvcsw = p->signal->cnvcsw; | ||
| 1712 | r->ru_nivcsw = p->signal->cnivcsw; | ||
| 1713 | r->ru_minflt = p->signal->cmin_flt; | ||
| 1714 | r->ru_majflt = p->signal->cmaj_flt; | ||
| 1715 | sum_group: | ||
| 1716 | utime = cputime_add(utime, p->signal->utime); | 1726 | utime = cputime_add(utime, p->signal->utime); |
| 1717 | stime = cputime_add(stime, p->signal->stime); | 1727 | stime = cputime_add(stime, p->signal->stime); |
| 1718 | r->ru_nvcsw += p->signal->nvcsw; | 1728 | r->ru_nvcsw += p->signal->nvcsw; |
| @@ -1729,13 +1739,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
| 1729 | r->ru_majflt += t->maj_flt; | 1739 | r->ru_majflt += t->maj_flt; |
| 1730 | t = next_thread(t); | 1740 | t = next_thread(t); |
| 1731 | } while (t != p); | 1741 | } while (t != p); |
| 1732 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
| 1733 | cputime_to_timeval(utime, &r->ru_utime); | ||
| 1734 | cputime_to_timeval(stime, &r->ru_stime); | ||
| 1735 | break; | 1742 | break; |
| 1743 | |||
| 1736 | default: | 1744 | default: |
| 1737 | BUG(); | 1745 | BUG(); |
| 1738 | } | 1746 | } |
| 1747 | |||
| 1748 | cputime_to_timeval(utime, &r->ru_utime); | ||
| 1749 | cputime_to_timeval(stime, &r->ru_stime); | ||
| 1739 | } | 1750 | } |
| 1740 | 1751 | ||
| 1741 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | 1752 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) |
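
The k_getrusage() rewrite above folds the three cases into one switch: RUSAGE_BOTH falls through into the RUSAGE_CHILDREN accounting, which then falls through into the RUSAGE_SELF accounting unless who == RUSAGE_CHILDREN, and the cputime_to_timeval() conversions run once at the end. The toy program below shows the same control flow; the enum values and the numbers are invented (they are not the real RUSAGE_* constants or any real accounting fields).

    #include <stdio.h>

    enum { RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_BOTH };   /* toy values only */

    static long usage(int who)
    {
        long t = 0;                 /* stands in for the utime/stime accumulation */

        switch (who) {
        case RUSAGE_BOTH:
        case RUSAGE_CHILDREN:
            t += 10;                /* reaped children's totals (signal->cutime etc.) */
            if (who == RUSAGE_CHILDREN)
                break;
            /* fall through */
        case RUSAGE_SELF:
            t += 100;               /* this thread group's own totals */
            break;
        default:
            return -1;
        }
        return t;                   /* single conversion point, like cputime_to_timeval() */
    }

    int main(void)
    {
        printf("self=%ld children=%ld both=%ld\n",
               usage(RUSAGE_SELF), usage(RUSAGE_CHILDREN), usage(RUSAGE_BOTH));
        return 0;   /* prints self=100 children=10 both=110 */
    }
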
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1ab2370e2efa..17313b99e53d 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -82,6 +82,28 @@ cond_syscall(compat_sys_socketcall); | |||
| 82 | cond_syscall(sys_inotify_init); | 82 | cond_syscall(sys_inotify_init); |
| 83 | cond_syscall(sys_inotify_add_watch); | 83 | cond_syscall(sys_inotify_add_watch); |
| 84 | cond_syscall(sys_inotify_rm_watch); | 84 | cond_syscall(sys_inotify_rm_watch); |
| 85 | cond_syscall(sys_migrate_pages); | ||
| 86 | cond_syscall(sys_chown16); | ||
| 87 | cond_syscall(sys_fchown16); | ||
| 88 | cond_syscall(sys_getegid16); | ||
| 89 | cond_syscall(sys_geteuid16); | ||
| 90 | cond_syscall(sys_getgid16); | ||
| 91 | cond_syscall(sys_getgroups16); | ||
| 92 | cond_syscall(sys_getresgid16); | ||
| 93 | cond_syscall(sys_getresuid16); | ||
| 94 | cond_syscall(sys_getuid16); | ||
| 95 | cond_syscall(sys_lchown16); | ||
| 96 | cond_syscall(sys_setfsgid16); | ||
| 97 | cond_syscall(sys_setfsuid16); | ||
| 98 | cond_syscall(sys_setgid16); | ||
| 99 | cond_syscall(sys_setgroups16); | ||
| 100 | cond_syscall(sys_setregid16); | ||
| 101 | cond_syscall(sys_setresgid16); | ||
| 102 | cond_syscall(sys_setresuid16); | ||
| 103 | cond_syscall(sys_setreuid16); | ||
| 104 | cond_syscall(sys_setuid16); | ||
| 105 | cond_syscall(sys_vm86old); | ||
| 106 | cond_syscall(sys_vm86); | ||
| 85 | 107 | ||
| 86 | /* arch-specific weak syscall entries */ | 108 | /* arch-specific weak syscall entries */ |
| 87 | cond_syscall(sys_pciconfig_read); | 109 | cond_syscall(sys_pciconfig_read); |
| @@ -90,3 +112,5 @@ cond_syscall(sys_pciconfig_iobase); | |||
| 90 | cond_syscall(sys32_ipc); | 112 | cond_syscall(sys32_ipc); |
| 91 | cond_syscall(sys32_sysctl); | 113 | cond_syscall(sys32_sysctl); |
| 92 | cond_syscall(ppc_rtas); | 114 | cond_syscall(ppc_rtas); |
| 115 | cond_syscall(sys_spu_run); | ||
| 116 | cond_syscall(sys_spu_create); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e5102ea6e104..62d4d9566876 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -25,12 +25,14 @@ | |||
| 25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
| 26 | #include <linux/sysctl.h> | 26 | #include <linux/sysctl.h> |
| 27 | #include <linux/proc_fs.h> | 27 | #include <linux/proc_fs.h> |
| 28 | #include <linux/capability.h> | ||
| 28 | #include <linux/ctype.h> | 29 | #include <linux/ctype.h> |
| 29 | #include <linux/utsname.h> | 30 | #include <linux/utsname.h> |
| 30 | #include <linux/capability.h> | 31 | #include <linux/capability.h> |
| 31 | #include <linux/smp_lock.h> | 32 | #include <linux/smp_lock.h> |
| 32 | #include <linux/init.h> | 33 | #include <linux/init.h> |
| 33 | #include <linux/kernel.h> | 34 | #include <linux/kernel.h> |
| 35 | #include <linux/kobject.h> | ||
| 34 | #include <linux/net.h> | 36 | #include <linux/net.h> |
| 35 | #include <linux/sysrq.h> | 37 | #include <linux/sysrq.h> |
| 36 | #include <linux/highuid.h> | 38 | #include <linux/highuid.h> |
| @@ -67,6 +69,8 @@ extern int min_free_kbytes; | |||
| 67 | extern int printk_ratelimit_jiffies; | 69 | extern int printk_ratelimit_jiffies; |
| 68 | extern int printk_ratelimit_burst; | 70 | extern int printk_ratelimit_burst; |
| 69 | extern int pid_max_min, pid_max_max; | 71 | extern int pid_max_min, pid_max_max; |
| 72 | extern int sysctl_drop_caches; | ||
| 73 | extern int percpu_pagelist_fraction; | ||
| 70 | 74 | ||
| 71 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 75 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
| 72 | int unknown_nmi_panic; | 76 | int unknown_nmi_panic; |
| @@ -77,15 +81,13 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, | |||
| 77 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 81 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
| 78 | static int maxolduid = 65535; | 82 | static int maxolduid = 65535; |
| 79 | static int minolduid; | 83 | static int minolduid; |
| 84 | static int min_percpu_pagelist_fract = 8; | ||
| 80 | 85 | ||
| 81 | static int ngroups_max = NGROUPS_MAX; | 86 | static int ngroups_max = NGROUPS_MAX; |
| 82 | 87 | ||
| 83 | #ifdef CONFIG_KMOD | 88 | #ifdef CONFIG_KMOD |
| 84 | extern char modprobe_path[]; | 89 | extern char modprobe_path[]; |
| 85 | #endif | 90 | #endif |
| 86 | #ifdef CONFIG_HOTPLUG | ||
| 87 | extern char hotplug_path[]; | ||
| 88 | #endif | ||
| 89 | #ifdef CONFIG_CHR_DEV_SG | 91 | #ifdef CONFIG_CHR_DEV_SG |
| 90 | extern int sg_big_buff; | 92 | extern int sg_big_buff; |
| 91 | #endif | 93 | #endif |
| @@ -110,7 +112,7 @@ extern int pwrsw_enabled; | |||
| 110 | extern int unaligned_enabled; | 112 | extern int unaligned_enabled; |
| 111 | #endif | 113 | #endif |
| 112 | 114 | ||
| 113 | #ifdef CONFIG_ARCH_S390 | 115 | #ifdef CONFIG_S390 |
| 114 | #ifdef CONFIG_MATHEMU | 116 | #ifdef CONFIG_MATHEMU |
| 115 | extern int sysctl_ieee_emulation_warnings; | 117 | extern int sysctl_ieee_emulation_warnings; |
| 116 | #endif | 118 | #endif |
| @@ -397,8 +399,8 @@ static ctl_table kern_table[] = { | |||
| 397 | { | 399 | { |
| 398 | .ctl_name = KERN_HOTPLUG, | 400 | .ctl_name = KERN_HOTPLUG, |
| 399 | .procname = "hotplug", | 401 | .procname = "hotplug", |
| 400 | .data = &hotplug_path, | 402 | .data = &uevent_helper, |
| 401 | .maxlen = HOTPLUG_PATH_LEN, | 403 | .maxlen = UEVENT_HELPER_PATH_LEN, |
| 402 | .mode = 0644, | 404 | .mode = 0644, |
| 403 | .proc_handler = &proc_dostring, | 405 | .proc_handler = &proc_dostring, |
| 404 | .strategy = &sysctl_string, | 406 | .strategy = &sysctl_string, |
| @@ -544,7 +546,7 @@ static ctl_table kern_table[] = { | |||
| 544 | .extra1 = &minolduid, | 546 | .extra1 = &minolduid, |
| 545 | .extra2 = &maxolduid, | 547 | .extra2 = &maxolduid, |
| 546 | }, | 548 | }, |
| 547 | #ifdef CONFIG_ARCH_S390 | 549 | #ifdef CONFIG_S390 |
| 548 | #ifdef CONFIG_MATHEMU | 550 | #ifdef CONFIG_MATHEMU |
| 549 | { | 551 | { |
| 550 | .ctl_name = KERN_IEEE_EMULATION_WARNINGS, | 552 | .ctl_name = KERN_IEEE_EMULATION_WARNINGS, |
| @@ -646,7 +648,7 @@ static ctl_table kern_table[] = { | |||
| 646 | .mode = 0644, | 648 | .mode = 0644, |
| 647 | .proc_handler = &proc_dointvec, | 649 | .proc_handler = &proc_dointvec, |
| 648 | }, | 650 | }, |
| 649 | #if defined(CONFIG_ARCH_S390) | 651 | #if defined(CONFIG_S390) |
| 650 | { | 652 | { |
| 651 | .ctl_name = KERN_SPIN_RETRY, | 653 | .ctl_name = KERN_SPIN_RETRY, |
| 652 | .procname = "spin_retry", | 654 | .procname = "spin_retry", |
| @@ -777,6 +779,15 @@ static ctl_table vm_table[] = { | |||
| 777 | .strategy = &sysctl_intvec, | 779 | .strategy = &sysctl_intvec, |
| 778 | }, | 780 | }, |
| 779 | { | 781 | { |
| 782 | .ctl_name = VM_DROP_PAGECACHE, | ||
| 783 | .procname = "drop_caches", | ||
| 784 | .data = &sysctl_drop_caches, | ||
| 785 | .maxlen = sizeof(int), | ||
| 786 | .mode = 0644, | ||
| 787 | .proc_handler = drop_caches_sysctl_handler, | ||
| 788 | .strategy = &sysctl_intvec, | ||
| 789 | }, | ||
| 790 | { | ||
| 780 | .ctl_name = VM_MIN_FREE_KBYTES, | 791 | .ctl_name = VM_MIN_FREE_KBYTES, |
| 781 | .procname = "min_free_kbytes", | 792 | .procname = "min_free_kbytes", |
| 782 | .data = &min_free_kbytes, | 793 | .data = &min_free_kbytes, |
| @@ -786,6 +797,16 @@ static ctl_table vm_table[] = { | |||
| 786 | .strategy = &sysctl_intvec, | 797 | .strategy = &sysctl_intvec, |
| 787 | .extra1 = &zero, | 798 | .extra1 = &zero, |
| 788 | }, | 799 | }, |
| 800 | { | ||
| 801 | .ctl_name = VM_PERCPU_PAGELIST_FRACTION, | ||
| 802 | .procname = "percpu_pagelist_fraction", | ||
| 803 | .data = &percpu_pagelist_fraction, | ||
| 804 | .maxlen = sizeof(percpu_pagelist_fraction), | ||
| 805 | .mode = 0644, | ||
| 806 | .proc_handler = &percpu_pagelist_fraction_sysctl_handler, | ||
| 807 | .strategy = &sysctl_intvec, | ||
| 808 | .extra1 = &min_percpu_pagelist_fract, | ||
| 809 | }, | ||
| 789 | #ifdef CONFIG_MMU | 810 | #ifdef CONFIG_MMU |
| 790 | { | 811 | { |
| 791 | .ctl_name = VM_MAX_MAP_COUNT, | 812 | .ctl_name = VM_MAX_MAP_COUNT, |
| @@ -2192,27 +2213,32 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, | |||
| 2192 | void __user *oldval, size_t __user *oldlenp, | 2213 | void __user *oldval, size_t __user *oldlenp, |
| 2193 | void __user *newval, size_t newlen, void **context) | 2214 | void __user *newval, size_t newlen, void **context) |
| 2194 | { | 2215 | { |
| 2195 | size_t l, len; | ||
| 2196 | |||
| 2197 | if (!table->data || !table->maxlen) | 2216 | if (!table->data || !table->maxlen) |
| 2198 | return -ENOTDIR; | 2217 | return -ENOTDIR; |
| 2199 | 2218 | ||
| 2200 | if (oldval && oldlenp) { | 2219 | if (oldval && oldlenp) { |
| 2201 | if (get_user(len, oldlenp)) | 2220 | size_t bufsize; |
| 2221 | if (get_user(bufsize, oldlenp)) | ||
| 2202 | return -EFAULT; | 2222 | return -EFAULT; |
| 2203 | if (len) { | 2223 | if (bufsize) { |
| 2204 | l = strlen(table->data)+1; | 2224 | size_t len = strlen(table->data), copied; |
| 2205 | if (len > l) len = l; | 2225 | |
| 2206 | if (len >= table->maxlen) | 2226 | /* This shouldn't trigger for a well-formed sysctl */ |
| 2227 | if (len > table->maxlen) | ||
| 2207 | len = table->maxlen; | 2228 | len = table->maxlen; |
| 2208 | if(copy_to_user(oldval, table->data, len)) | 2229 | |
| 2230 | /* Copy up to a max of bufsize-1 bytes of the string */ | ||
| 2231 | copied = (len >= bufsize) ? bufsize - 1 : len; | ||
| 2232 | |||
| 2233 | if (copy_to_user(oldval, table->data, copied) || | ||
| 2234 | put_user(0, (char __user *)(oldval + copied))) | ||
| 2209 | return -EFAULT; | 2235 | return -EFAULT; |
| 2210 | if(put_user(len, oldlenp)) | 2236 | if (put_user(len, oldlenp)) |
| 2211 | return -EFAULT; | 2237 | return -EFAULT; |
| 2212 | } | 2238 | } |
| 2213 | } | 2239 | } |
| 2214 | if (newval && newlen) { | 2240 | if (newval && newlen) { |
| 2215 | len = newlen; | 2241 | size_t len = newlen; |
| 2216 | if (len > table->maxlen) | 2242 | if (len > table->maxlen) |
| 2217 | len = table->maxlen; | 2243 | len = table->maxlen; |
| 2218 | if(copy_from_user(table->data, newval, len)) | 2244 | if(copy_from_user(table->data, newval, len)) |
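
The sysctl_string() change above fixes the read side so that at most bufsize-1 bytes of the string are copied, the result is always NUL-terminated, and the full string length is still reported back through oldlenp. A user-space sketch of the same truncation rule, with memcpy standing in for copy_to_user and a plain store for the put_user(0, ...):

    #include <stdio.h>
    #include <string.h>

    /* Copy src into dst (capacity bufsize >= 1), NUL-terminate, return strlen(src). */
    static size_t copy_sysctl_string(char *dst, size_t bufsize, const char *src)
    {
        size_t len = strlen(src);
        size_t copied = (len >= bufsize) ? bufsize - 1 : len;

        memcpy(dst, src, copied);   /* the kernel code uses copy_to_user() here */
        dst[copied] = '\0';         /* the put_user(0, ...) added by the patch */
        return len;                 /* caller still learns the untruncated length */
    }

    int main(void)
    {
        char buf[8];
        size_t len = copy_sysctl_string(buf, sizeof(buf), "/sbin/hotplug");

        printf("\"%s\" (full length %zu)\n", buf, len);  /* "/sbin/h" (full length 13) */
        return 0;
    }
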
diff --git a/kernel/time.c b/kernel/time.c index b94bfa8c03e0..7477b1d2079e 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
| @@ -29,6 +29,7 @@ | |||
| 29 | 29 | ||
| 30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
| 31 | #include <linux/timex.h> | 31 | #include <linux/timex.h> |
| 32 | #include <linux/capability.h> | ||
| 32 | #include <linux/errno.h> | 33 | #include <linux/errno.h> |
| 33 | #include <linux/smp_lock.h> | 34 | #include <linux/smp_lock.h> |
| 34 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
| @@ -154,6 +155,9 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) | |||
| 154 | static int firsttime = 1; | 155 | static int firsttime = 1; |
| 155 | int error = 0; | 156 | int error = 0; |
| 156 | 157 | ||
| 158 | if (!timespec_valid(tv)) | ||
| 159 | return -EINVAL; | ||
| 160 | |||
| 157 | error = security_settime(tv, tz); | 161 | error = security_settime(tv, tz); |
| 158 | if (error) | 162 | if (error) |
| 159 | return error; | 163 | return error; |
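[Editor's note] The new timespec_valid() check above rejects malformed settimeofday() requests before the security hook or any clock update runs. The macro itself lives in include/linux/time.h and is not shown in this diff; the sketch below is only a paraphrase of the usual validation (non-negative tv_sec, tv_nsec within [0, NSEC_PER_SEC)), not the kernel's definition verbatim:

/* Hedged paraphrase of the validity check; not the kernel macro itself. */
#include <stdbool.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

static bool model_timespec_valid(const struct timespec *ts)
{
        return ts->tv_sec >= 0 &&
               (unsigned long)ts->tv_nsec < NSEC_PER_SEC;  /* also rejects negative nsec */
}

int main(void)
{
        struct timespec bad = { .tv_sec = 1, .tv_nsec = NSEC_PER_SEC };  /* out of range */
        return model_timespec_valid(&bad) ? 1 : 0;  /* exits 0: rejected as expected */
}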
| @@ -561,27 +565,107 @@ void getnstimeofday(struct timespec *tv) | |||
| 561 | EXPORT_SYMBOL_GPL(getnstimeofday); | 565 | EXPORT_SYMBOL_GPL(getnstimeofday); |
| 562 | #endif | 566 | #endif |
| 563 | 567 | ||
| 564 | void getnstimestamp(struct timespec *ts) | 568 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
| 569 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | ||
| 570 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. | ||
| 571 | * | ||
| 572 | * [For the Julian calendar (which was used in Russia before 1917, | ||
| 573 | * Britain & colonies before 1752, anywhere else before 1582, | ||
| 574 | * and is still in use by some communities) leave out the | ||
| 575 | * -year/100+year/400 terms, and add 10.] | ||
| 576 | * | ||
| 577 | * This algorithm was first published by Gauss (I think). | ||
| 578 | * | ||
| 579 | * WARNING: this function will overflow on 2106-02-07 06:28:16 on | ||
| 580 | * machines where long is 32-bit! (However, as time_t is signed, we | ||
| 581 | * will already get problems at other places on 2038-01-19 03:14:08) | ||
| 582 | */ | ||
| 583 | unsigned long | ||
| 584 | mktime(const unsigned int year0, const unsigned int mon0, | ||
| 585 | const unsigned int day, const unsigned int hour, | ||
| 586 | const unsigned int min, const unsigned int sec) | ||
| 565 | { | 587 | { |
| 566 | unsigned int seq; | 588 | unsigned int mon = mon0, year = year0; |
| 567 | struct timespec wall2mono; | ||
| 568 | 589 | ||
| 569 | /* synchronize with settimeofday() changes */ | 590 | /* 1..12 -> 11,12,1..10 */ |
| 570 | do { | 591 | if (0 >= (int) (mon -= 2)) { |
| 571 | seq = read_seqbegin(&xtime_lock); | 592 | mon += 12; /* Puts Feb last since it has leap day */ |
| 572 | getnstimeofday(ts); | 593 | year -= 1; |
| 573 | wall2mono = wall_to_monotonic; | ||
| 574 | } while(unlikely(read_seqretry(&xtime_lock, seq))); | ||
| 575 | |||
| 576 | /* adjust to monotonically-increasing values | ||
| 577 | ts->tv_sec += wall2mono.tv_sec; | ||
| 578 | ts->tv_nsec += wall2mono.tv_nsec; | ||
| 579 | while (unlikely(ts->tv_nsec >= NSEC_PER_SEC)) { | ||
| 580 | ts->tv_nsec -= NSEC_PER_SEC; | ||
| 581 | ts->tv_sec++; | ||
| 582 | } | 594 | } |
| 595 | |||
| 596 | return ((((unsigned long) | ||
| 597 | (year/4 - year/100 + year/400 + 367*mon/12 + day) + | ||
| 598 | year*365 - 719499 | ||
| 599 | )*24 + hour /* now have hours */ | ||
| 600 | )*60 + min /* now have minutes */ | ||
| 601 | )*60 + sec; /* finally seconds */ | ||
| 602 | } | ||
| 603 | |||
| 604 | EXPORT_SYMBOL(mktime); | ||
| 605 | |||
| 606 | /** | ||
| 607 | * set_normalized_timespec - set timespec sec and nsec parts and normalize | ||
| 608 | * | ||
| 609 | * @ts: pointer to timespec variable to be set | ||
| 610 | * @sec: seconds to set | ||
| 611 | * @nsec: nanoseconds to set | ||
| 612 | * | ||
| 613 | * Set seconds and nanoseconds field of a timespec variable and | ||
| 614 | * normalize to the timespec storage format | ||
| 615 | * | ||
| 616 | * Note: The tv_nsec part is always in the range of | ||
| 617 | * 0 <= tv_nsec < NSEC_PER_SEC | ||
| 618 | * For negative values only the tv_sec field is negative! | ||
| 619 | */ | ||
| 620 | void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) | ||
| 621 | { | ||
| 622 | while (nsec >= NSEC_PER_SEC) { | ||
| 623 | nsec -= NSEC_PER_SEC; | ||
| 624 | ++sec; | ||
| 625 | } | ||
| 626 | while (nsec < 0) { | ||
| 627 | nsec += NSEC_PER_SEC; | ||
| 628 | --sec; | ||
| 629 | } | ||
| 630 | ts->tv_sec = sec; | ||
| 631 | ts->tv_nsec = nsec; | ||
| 632 | } | ||
| 633 | |||
| 634 | /** | ||
| 635 | * ns_to_timespec - Convert nanoseconds to timespec | ||
| 636 | * @nsec: the nanoseconds value to be converted | ||
| 637 | * | ||
| 638 | * Returns the timespec representation of the nsec parameter. | ||
| 639 | */ | ||
| 640 | inline struct timespec ns_to_timespec(const nsec_t nsec) | ||
| 641 | { | ||
| 642 | struct timespec ts; | ||
| 643 | |||
| 644 | if (nsec) | ||
| 645 | ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, | ||
| 646 | &ts.tv_nsec); | ||
| 647 | else | ||
| 648 | ts.tv_sec = ts.tv_nsec = 0; | ||
| 649 | |||
| 650 | return ts; | ||
| 651 | } | ||
| 652 | |||
| 653 | /** | ||
| 654 | * ns_to_timeval - Convert nanoseconds to timeval | ||
| 655 | * @nsec: the nanoseconds value to be converted | ||
| 656 | * | ||
| 657 | * Returns the timeval representation of the nsec parameter. | ||
| 658 | */ | ||
| 659 | struct timeval ns_to_timeval(const nsec_t nsec) | ||
| 660 | { | ||
| 661 | struct timespec ts = ns_to_timespec(nsec); | ||
| 662 | struct timeval tv; | ||
| 663 | |||
| 664 | tv.tv_sec = ts.tv_sec; | ||
| 665 | tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; | ||
| 666 | |||
| 667 | return tv; | ||
| 583 | } | 668 | } |
| 584 | EXPORT_SYMBOL_GPL(getnstimestamp); | ||
| 585 | 669 | ||
| 586 | #if (BITS_PER_LONG < 64) | 670 | #if (BITS_PER_LONG < 64) |
| 587 | u64 get_jiffies_64(void) | 671 | u64 get_jiffies_64(void) |
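[Editor's note] The mktime() added above folds the Gregorian leap-year rules into one closed-form expression by treating March as the first month, so February and its leap day fall at the end of the shifted year. A quick userspace check of that expression against a known epoch offset; the arithmetic is copied from the hunk above and only the test harness is mine:

/* Userspace copy of the mktime() expression from the hunk above,
 * plus a small self-check against a known date. */
#include <stdio.h>

static unsigned long model_mktime(unsigned int year, unsigned int mon,
                                  unsigned int day, unsigned int hour,
                                  unsigned int min, unsigned int sec)
{
        /* shift months so February (with its leap day) ends the "year" */
        if (0 >= (int)(mon -= 2)) {
                mon += 12;
                year -= 1;
        }
        return ((((unsigned long)
                  (year/4 - year/100 + year/400 + 367*mon/12 + day) +
                  year*365 - 719499
                 )*24 + hour
                )*60 + min
               )*60 + sec;
}

int main(void)
{
        /* 2000-03-01 00:00:00 UTC is 951868800 seconds after the epoch */
        printf("%lu\n", model_mktime(2000, 3, 1, 0, 0, 0));
        return 0;
}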
diff --git a/kernel/timer.c b/kernel/timer.c index fd74268d8663..4f1cb0ab5251 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -33,6 +33,7 @@ | |||
| 33 | #include <linux/posix-timers.h> | 33 | #include <linux/posix-timers.h> |
| 34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
| 35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
| 36 | #include <linux/delay.h> | ||
| 36 | 37 | ||
| 37 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
| 38 | #include <asm/unistd.h> | 39 | #include <asm/unistd.h> |
| @@ -857,6 +858,7 @@ static void run_timer_softirq(struct softirq_action *h) | |||
| 857 | { | 858 | { |
| 858 | tvec_base_t *base = &__get_cpu_var(tvec_bases); | 859 | tvec_base_t *base = &__get_cpu_var(tvec_bases); |
| 859 | 860 | ||
| 861 | hrtimer_run_queues(); | ||
| 860 | if (time_after_eq(jiffies, base->timer_jiffies)) | 862 | if (time_after_eq(jiffies, base->timer_jiffies)) |
| 861 | __run_timers(base); | 863 | __run_timers(base); |
| 862 | } | 864 | } |
| @@ -1118,62 +1120,6 @@ asmlinkage long sys_gettid(void) | |||
| 1118 | return current->pid; | 1120 | return current->pid; |
| 1119 | } | 1121 | } |
| 1120 | 1122 | ||
| 1121 | static long __sched nanosleep_restart(struct restart_block *restart) | ||
| 1122 | { | ||
| 1123 | unsigned long expire = restart->arg0, now = jiffies; | ||
| 1124 | struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; | ||
| 1125 | long ret; | ||
| 1126 | |||
| 1127 | /* Did it expire while we handled signals? */ | ||
| 1128 | if (!time_after(expire, now)) | ||
| 1129 | return 0; | ||
| 1130 | |||
| 1131 | expire = schedule_timeout_interruptible(expire - now); | ||
| 1132 | |||
| 1133 | ret = 0; | ||
| 1134 | if (expire) { | ||
| 1135 | struct timespec t; | ||
| 1136 | jiffies_to_timespec(expire, &t); | ||
| 1137 | |||
| 1138 | ret = -ERESTART_RESTARTBLOCK; | ||
| 1139 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) | ||
| 1140 | ret = -EFAULT; | ||
| 1141 | /* The 'restart' block is already filled in */ | ||
| 1142 | } | ||
| 1143 | return ret; | ||
| 1144 | } | ||
| 1145 | |||
| 1146 | asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | ||
| 1147 | { | ||
| 1148 | struct timespec t; | ||
| 1149 | unsigned long expire; | ||
| 1150 | long ret; | ||
| 1151 | |||
| 1152 | if (copy_from_user(&t, rqtp, sizeof(t))) | ||
| 1153 | return -EFAULT; | ||
| 1154 | |||
| 1155 | if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) | ||
| 1156 | return -EINVAL; | ||
| 1157 | |||
| 1158 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); | ||
| 1159 | expire = schedule_timeout_interruptible(expire); | ||
| 1160 | |||
| 1161 | ret = 0; | ||
| 1162 | if (expire) { | ||
| 1163 | struct restart_block *restart; | ||
| 1164 | jiffies_to_timespec(expire, &t); | ||
| 1165 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) | ||
| 1166 | return -EFAULT; | ||
| 1167 | |||
| 1168 | restart = ¤t_thread_info()->restart_block; | ||
| 1169 | restart->fn = nanosleep_restart; | ||
| 1170 | restart->arg0 = jiffies + expire; | ||
| 1171 | restart->arg1 = (unsigned long) rmtp; | ||
| 1172 | ret = -ERESTART_RESTARTBLOCK; | ||
| 1173 | } | ||
| 1174 | return ret; | ||
| 1175 | } | ||
| 1176 | |||
| 1177 | /* | 1123 | /* |
| 1178 | * sys_sysinfo - fill in sysinfo struct | 1124 | * sys_sysinfo - fill in sysinfo struct |
| 1179 | */ | 1125 | */ |
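[Editor's note] The sys_nanosleep()/nanosleep_restart() pair removed above (the syscall is presumably reimplemented on the hrtimer infrastructure added elsewhere in this series) always rounded requests up: timespec_to_jiffies() rounds the nanosecond part up to whole ticks, and the extra "+ (t.tv_sec || t.tv_nsec)" adds one more tick for any non-zero request, since the caller may already be partway through the current jiffy. A rough userspace model of that arithmetic, assuming HZ=1000 and ignoring the kernel's overflow-safe fixed-point conversion:

/* Hedged model of the jiffy rounding the removed sys_nanosleep() used;
 * assumes HZ=1000 and a simplified timespec_to_jiffies(). */
#include <stdio.h>

#define HZ 1000
#define NSEC_PER_SEC 1000000000L

static unsigned long model_timespec_to_jiffies(long sec, long nsec)
{
        /* round the nanosecond part up to whole ticks */
        return (unsigned long)sec * HZ +
               (nsec + (NSEC_PER_SEC / HZ) - 1) / (NSEC_PER_SEC / HZ);
}

int main(void)
{
        long sec = 0, nsec = 1500000;                    /* 1.5 ms request */
        unsigned long expire = model_timespec_to_jiffies(sec, nsec) +
                               (sec || nsec);            /* +1 tick: may be mid-jiffy */
        printf("sleep >= %lu ticks\n", expire);          /* prints 3 at HZ=1000 */
        return 0;
}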
diff --git a/kernel/uid16.c b/kernel/uid16.c index f669941e8b26..aa25605027c8 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
| @@ -10,6 +10,7 @@ | |||
| 10 | #include <linux/notifier.h> | 10 | #include <linux/notifier.h> |
| 11 | #include <linux/reboot.h> | 11 | #include <linux/reboot.h> |
| 12 | #include <linux/prctl.h> | 12 | #include <linux/prctl.h> |
| 13 | #include <linux/capability.h> | ||
| 13 | #include <linux/init.h> | 14 | #include <linux/init.h> |
| 14 | #include <linux/highuid.h> | 15 | #include <linux/highuid.h> |
| 15 | #include <linux/security.h> | 16 | #include <linux/security.h> |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2bd5aee1c736..82c4fa70595c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -29,7 +29,8 @@ | |||
| 29 | #include <linux/kthread.h> | 29 | #include <linux/kthread.h> |
| 30 | 30 | ||
| 31 | /* | 31 | /* |
| 32 | * The per-CPU workqueue (if single thread, we always use cpu 0's). | 32 | * The per-CPU workqueue (if single thread, we always use the first |
| 33 | * possible cpu). | ||
| 33 | * | 34 | * |
| 34 | * The sequence counters are for flush_scheduled_work(). It wants to wait | 35 | * The sequence counters are for flush_scheduled_work(). It wants to wait |
| 35 | * until all currently-scheduled works are completed, but it doesn't | 36 |
| @@ -69,6 +70,8 @@ struct workqueue_struct { | |||
| 69 | static DEFINE_SPINLOCK(workqueue_lock); | 70 | static DEFINE_SPINLOCK(workqueue_lock); |
| 70 | static LIST_HEAD(workqueues); | 71 | static LIST_HEAD(workqueues); |
| 71 | 72 | ||
| 73 | static int singlethread_cpu; | ||
| 74 | |||
| 72 | /* If it's single threaded, it isn't in the list of workqueues. */ | 75 | /* If it's single threaded, it isn't in the list of workqueues. */ |
| 73 | static inline int is_single_threaded(struct workqueue_struct *wq) | 76 | static inline int is_single_threaded(struct workqueue_struct *wq) |
| 74 | { | 77 | { |
| @@ -102,7 +105,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
| 102 | 105 | ||
| 103 | if (!test_and_set_bit(0, &work->pending)) { | 106 | if (!test_and_set_bit(0, &work->pending)) { |
| 104 | if (unlikely(is_single_threaded(wq))) | 107 | if (unlikely(is_single_threaded(wq))) |
| 105 | cpu = any_online_cpu(cpu_online_map); | 108 | cpu = singlethread_cpu; |
| 106 | BUG_ON(!list_empty(&work->entry)); | 109 | BUG_ON(!list_empty(&work->entry)); |
| 107 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | 110 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
| 108 | ret = 1; | 111 | ret = 1; |
| @@ -118,7 +121,7 @@ static void delayed_work_timer_fn(unsigned long __data) | |||
| 118 | int cpu = smp_processor_id(); | 121 | int cpu = smp_processor_id(); |
| 119 | 122 | ||
| 120 | if (unlikely(is_single_threaded(wq))) | 123 | if (unlikely(is_single_threaded(wq))) |
| 121 | cpu = any_online_cpu(cpu_online_map); | 124 | cpu = singlethread_cpu; |
| 122 | 125 | ||
| 123 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | 126 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
| 124 | } | 127 | } |
| @@ -267,7 +270,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
| 267 | 270 | ||
| 268 | if (is_single_threaded(wq)) { | 271 | if (is_single_threaded(wq)) { |
| 269 | /* Always use first cpu's area. */ | 272 | /* Always use first cpu's area. */ |
| 270 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map))); | 273 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); |
| 271 | } else { | 274 | } else { |
| 272 | int cpu; | 275 | int cpu; |
| 273 | 276 | ||
| @@ -315,12 +318,17 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
| 315 | return NULL; | 318 | return NULL; |
| 316 | 319 | ||
| 317 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); | 320 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); |
| 321 | if (!wq->cpu_wq) { | ||
| 322 | kfree(wq); | ||
| 323 | return NULL; | ||
| 324 | } | ||
| 325 | |||
| 318 | wq->name = name; | 326 | wq->name = name; |
| 319 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 327 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
| 320 | lock_cpu_hotplug(); | 328 | lock_cpu_hotplug(); |
| 321 | if (singlethread) { | 329 | if (singlethread) { |
| 322 | INIT_LIST_HEAD(&wq->list); | 330 | INIT_LIST_HEAD(&wq->list); |
| 323 | p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map)); | 331 | p = create_workqueue_thread(wq, singlethread_cpu); |
| 324 | if (!p) | 332 | if (!p) |
| 325 | destroy = 1; | 333 | destroy = 1; |
| 326 | else | 334 | else |
| @@ -374,7 +382,7 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
| 374 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 382 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
| 375 | lock_cpu_hotplug(); | 383 | lock_cpu_hotplug(); |
| 376 | if (is_single_threaded(wq)) | 384 | if (is_single_threaded(wq)) |
| 377 | cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map)); | 385 | cleanup_workqueue_thread(wq, singlethread_cpu); |
| 378 | else { | 386 | else { |
| 379 | for_each_online_cpu(cpu) | 387 | for_each_online_cpu(cpu) |
| 380 | cleanup_workqueue_thread(wq, cpu); | 388 | cleanup_workqueue_thread(wq, cpu); |
| @@ -419,6 +427,25 @@ int schedule_delayed_work_on(int cpu, | |||
| 419 | return ret; | 427 | return ret; |
| 420 | } | 428 | } |
| 421 | 429 | ||
| 430 | int schedule_on_each_cpu(void (*func) (void *info), void *info) | ||
| 431 | { | ||
| 432 | int cpu; | ||
| 433 | struct work_struct *work; | ||
| 434 | |||
| 435 | work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); | ||
| 436 | |||
| 437 | if (!work) | ||
| 438 | return -ENOMEM; | ||
| 439 | for_each_online_cpu(cpu) { | ||
| 440 | INIT_WORK(work + cpu, func, info); | ||
| 441 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), | ||
| 442 | work + cpu); | ||
| 443 | } | ||
| 444 | flush_workqueue(keventd_wq); | ||
| 445 | kfree(work); | ||
| 446 | return 0; | ||
| 447 | } | ||
| 448 | |||
| 422 | void flush_scheduled_work(void) | 449 | void flush_scheduled_work(void) |
| 423 | { | 450 | { |
| 424 | flush_workqueue(keventd_wq); | 451 | flush_workqueue(keventd_wq); |
| @@ -543,6 +570,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 543 | 570 | ||
| 544 | void init_workqueues(void) | 571 | void init_workqueues(void) |
| 545 | { | 572 | { |
| 573 | singlethread_cpu = first_cpu(cpu_possible_map); | ||
| 546 | hotcpu_notifier(workqueue_cpu_callback, 0); | 574 | hotcpu_notifier(workqueue_cpu_callback, 0); |
| 547 | keventd_wq = create_workqueue("events"); | 575 | keventd_wq = create_workqueue("events"); |
| 548 | BUG_ON(!keventd_wq); | 576 | BUG_ON(!keventd_wq); |
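[Editor's note] The new schedule_on_each_cpu() above queues one work item on every online CPU's keventd thread and then flushes the queue, so it returns only after the callback has completed everywhere. A minimal usage sketch, assuming a module context and the matching declaration in <linux/workqueue.h>; touch_cpu() and demo_init() are hypothetical names, not part of the patch:

/* Hypothetical caller of the new helper; only schedule_on_each_cpu()
 * itself comes from the patch above. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/smp.h>

static void touch_cpu(void *info)
{
        /* each keventd worker is bound to its CPU, so this reports where we ran */
        printk(KERN_INFO "%s: ran on cpu %d\n", (char *)info, smp_processor_id());
}

static int __init demo_init(void)
{
        /* returns only after touch_cpu() has completed on every online CPU */
        return schedule_on_each_cpu(touch_cpu, "demo");
}
module_init(demo_init);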
