diff options
Diffstat (limited to 'kernel')
50 files changed, 4681 insertions, 2024 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore new file mode 100644 index 000000000000..f2ab70073bd4 --- /dev/null +++ b/kernel/.gitignore | |||
@@ -0,0 +1,5 @@ | |||
1 | # | ||
2 | # Generated files | ||
3 | # | ||
4 | config_data.h | ||
5 | config_data.gz | ||
diff --git a/kernel/Makefile b/kernel/Makefile index 4f5a1453093a..4ae0fbde815d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -6,15 +6,18 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | |||
6 | exit.o itimer.o time.o softirq.o resource.o \ | 6 | exit.o itimer.o time.o softirq.o resource.o \ |
7 | sysctl.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o capability.o ptrace.o timer.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
9 | rcupdate.o intermodule.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o | ||
11 | 12 | ||
13 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o | ||
12 | obj-$(CONFIG_FUTEX) += futex.o | 14 | obj-$(CONFIG_FUTEX) += futex.o |
13 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 15 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
14 | obj-$(CONFIG_SMP) += cpu.o spinlock.o | 16 | obj-$(CONFIG_SMP) += cpu.o spinlock.o |
15 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | 17 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o |
16 | obj-$(CONFIG_UID16) += uid16.o | 18 | obj-$(CONFIG_UID16) += uid16.o |
17 | obj-$(CONFIG_MODULES) += module.o | 19 | obj-$(CONFIG_MODULES) += module.o |
20 | obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o | ||
18 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 21 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
19 | obj-$(CONFIG_PM) += power/ | 22 | obj-$(CONFIG_PM) += power/ |
20 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 23 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
@@ -29,7 +32,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o | |||
29 | obj-$(CONFIG_SYSFS) += ksysfs.o | 32 | obj-$(CONFIG_SYSFS) += ksysfs.o |
30 | obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | 33 | obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o |
31 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 34 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
32 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | ||
33 | obj-$(CONFIG_SECCOMP) += seccomp.o | 35 | obj-$(CONFIG_SECCOMP) += seccomp.o |
34 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 36 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
35 | 37 | ||
diff --git a/kernel/acct.c b/kernel/acct.c index 6312d6bd43e3..065d8b4e51ef 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/mm.h> | 47 | #include <linux/mm.h> |
48 | #include <linux/slab.h> | 48 | #include <linux/slab.h> |
49 | #include <linux/acct.h> | 49 | #include <linux/acct.h> |
50 | #include <linux/capability.h> | ||
50 | #include <linux/file.h> | 51 | #include <linux/file.h> |
51 | #include <linux/tty.h> | 52 | #include <linux/tty.h> |
52 | #include <linux/security.h> | 53 | #include <linux/security.h> |
@@ -427,6 +428,7 @@ static void do_acct_process(long exitcode, struct file *file) | |||
427 | u64 elapsed; | 428 | u64 elapsed; |
428 | u64 run_time; | 429 | u64 run_time; |
429 | struct timespec uptime; | 430 | struct timespec uptime; |
431 | unsigned long jiffies; | ||
430 | 432 | ||
431 | /* | 433 | /* |
432 | * First check to see if there is enough free_space to continue | 434 | * First check to see if there is enough free_space to continue |
@@ -467,12 +469,12 @@ static void do_acct_process(long exitcode, struct file *file) | |||
467 | #endif | 469 | #endif |
468 | do_div(elapsed, AHZ); | 470 | do_div(elapsed, AHZ); |
469 | ac.ac_btime = xtime.tv_sec - elapsed; | 471 | ac.ac_btime = xtime.tv_sec - elapsed; |
470 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ( | 472 | jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, |
471 | current->signal->utime + | 473 | current->signal->utime)); |
472 | current->group_leader->utime)); | 474 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); |
473 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ( | 475 | jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, |
474 | current->signal->stime + | 476 | current->signal->stime)); |
475 | current->group_leader->stime)); | 477 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); |
476 | /* we really need to bite the bullet and change layout */ | 478 | /* we really need to bite the bullet and change layout */ |
477 | ac.ac_uid = current->uid; | 479 | ac.ac_uid = current->uid; |
478 | ac.ac_gid = current->gid; | 480 | ac.ac_gid = current->gid; |
@@ -580,7 +582,8 @@ void acct_process(long exitcode) | |||
580 | void acct_update_integrals(struct task_struct *tsk) | 582 | void acct_update_integrals(struct task_struct *tsk) |
581 | { | 583 | { |
582 | if (likely(tsk->mm)) { | 584 | if (likely(tsk->mm)) { |
583 | long delta = tsk->stime - tsk->acct_stimexpd; | 585 | long delta = |
586 | cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; | ||
584 | 587 | ||
585 | if (delta == 0) | 588 | if (delta == 0) |
586 | return; | 589 | return; |
diff --git a/kernel/audit.c b/kernel/audit.c index 32fa03ad1984..0a813d2883e5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -42,8 +42,8 @@ | |||
42 | */ | 42 | */ |
43 | 43 | ||
44 | #include <linux/init.h> | 44 | #include <linux/init.h> |
45 | #include <asm/atomic.h> | ||
46 | #include <asm/types.h> | 45 | #include <asm/types.h> |
46 | #include <asm/atomic.h> | ||
47 | #include <linux/mm.h> | 47 | #include <linux/mm.h> |
48 | #include <linux/module.h> | 48 | #include <linux/module.h> |
49 | #include <linux/err.h> | 49 | #include <linux/err.h> |
@@ -267,7 +267,7 @@ static int audit_set_failure(int state, uid_t loginuid) | |||
267 | return old; | 267 | return old; |
268 | } | 268 | } |
269 | 269 | ||
270 | int kauditd_thread(void *dummy) | 270 | static int kauditd_thread(void *dummy) |
271 | { | 271 | { |
272 | struct sk_buff *skb; | 272 | struct sk_buff *skb; |
273 | 273 | ||
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d8a68509e729..685c25175d96 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -30,8 +30,8 @@ | |||
30 | */ | 30 | */ |
31 | 31 | ||
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <asm/atomic.h> | ||
34 | #include <asm/types.h> | 33 | #include <asm/types.h> |
34 | #include <asm/atomic.h> | ||
35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
36 | #include <linux/module.h> | 36 | #include <linux/module.h> |
37 | #include <linux/mount.h> | 37 | #include <linux/mount.h> |
diff --git a/kernel/capability.c b/kernel/capability.c index 8986a37a67ea..bfa3c92e16f2 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
@@ -7,6 +7,7 @@ | |||
7 | * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> | 7 | * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/capability.h> | ||
10 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
11 | #include <linux/module.h> | 12 | #include <linux/module.h> |
12 | #include <linux/security.h> | 13 | #include <linux/security.h> |
diff --git a/kernel/compat.c b/kernel/compat.c index 102296e21ea8..1867290c37e3 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -514,6 +514,24 @@ static int put_compat_itimerspec(struct compat_itimerspec __user *dst, | |||
514 | return 0; | 514 | return 0; |
515 | } | 515 | } |
516 | 516 | ||
517 | long compat_sys_timer_create(clockid_t which_clock, | ||
518 | struct compat_sigevent __user *timer_event_spec, | ||
519 | timer_t __user *created_timer_id) | ||
520 | { | ||
521 | struct sigevent __user *event = NULL; | ||
522 | |||
523 | if (timer_event_spec) { | ||
524 | struct sigevent kevent; | ||
525 | |||
526 | event = compat_alloc_user_space(sizeof(*event)); | ||
527 | if (get_compat_sigevent(&kevent, timer_event_spec) || | ||
528 | copy_to_user(event, &kevent, sizeof(*event))) | ||
529 | return -EFAULT; | ||
530 | } | ||
531 | |||
532 | return sys_timer_create(which_clock, event, created_timer_id); | ||
533 | } | ||
534 | |||
517 | long compat_sys_timer_settime(timer_t timer_id, int flags, | 535 | long compat_sys_timer_settime(timer_t timer_id, int flags, |
518 | struct compat_itimerspec __user *new, | 536 | struct compat_itimerspec __user *new, |
519 | struct compat_itimerspec __user *old) | 537 | struct compat_itimerspec __user *old) |
@@ -649,8 +667,6 @@ int get_compat_sigevent(struct sigevent *event, | |||
649 | ? -EFAULT : 0; | 667 | ? -EFAULT : 0; |
650 | } | 668 | } |
651 | 669 | ||
652 | /* timer_create is architecture specific because it needs sigevent conversion */ | ||
653 | |||
654 | long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, | 670 | long compat_get_bitmap(unsigned long *mask, compat_ulong_t __user *umask, |
655 | unsigned long bitmap_size) | 671 | unsigned long bitmap_size) |
656 | { | 672 | { |
@@ -855,3 +871,31 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) | |||
855 | } | 871 | } |
856 | 872 | ||
857 | #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ | 873 | #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ |
874 | |||
875 | #ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND | ||
876 | asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize) | ||
877 | { | ||
878 | sigset_t newset; | ||
879 | compat_sigset_t newset32; | ||
880 | |||
881 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
882 | if (sigsetsize != sizeof(sigset_t)) | ||
883 | return -EINVAL; | ||
884 | |||
885 | if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) | ||
886 | return -EFAULT; | ||
887 | sigset_from_compat(&newset, &newset32); | ||
888 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
889 | |||
890 | spin_lock_irq(&current->sighand->siglock); | ||
891 | current->saved_sigmask = current->blocked; | ||
892 | current->blocked = newset; | ||
893 | recalc_sigpending(); | ||
894 | spin_unlock_irq(&current->sighand->siglock); | ||
895 | |||
896 | current->state = TASK_INTERRUPTIBLE; | ||
897 | schedule(); | ||
898 | set_thread_flag(TIF_RESTORE_SIGMASK); | ||
899 | return -ERESTARTNOHAND; | ||
900 | } | ||
901 | #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ | ||
diff --git a/kernel/configs.c b/kernel/configs.c index 986f7af31e0a..009e1ebdcb88 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
@@ -3,7 +3,7 @@ | |||
3 | * Echo the kernel .config file used to build the kernel | 3 | * Echo the kernel .config file used to build the kernel |
4 | * | 4 | * |
5 | * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com> | 5 | * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com> |
6 | * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org> | 6 | * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net> |
7 | * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> | 7 | * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> |
8 | * Copyright (C) 2002 Hewlett-Packard Company | 8 | * Copyright (C) 2002 Hewlett-Packard Company |
9 | * | 9 | * |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7430640f9816..fe2f71f92ae0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/namei.h> | 39 | #include <linux/namei.h> |
40 | #include <linux/pagemap.h> | 40 | #include <linux/pagemap.h> |
41 | #include <linux/proc_fs.h> | 41 | #include <linux/proc_fs.h> |
42 | #include <linux/rcupdate.h> | ||
42 | #include <linux/sched.h> | 43 | #include <linux/sched.h> |
43 | #include <linux/seq_file.h> | 44 | #include <linux/seq_file.h> |
44 | #include <linux/slab.h> | 45 | #include <linux/slab.h> |
@@ -54,7 +55,23 @@ | |||
54 | #include <asm/atomic.h> | 55 | #include <asm/atomic.h> |
55 | #include <asm/semaphore.h> | 56 | #include <asm/semaphore.h> |
56 | 57 | ||
57 | #define CPUSET_SUPER_MAGIC 0x27e0eb | 58 | #define CPUSET_SUPER_MAGIC 0x27e0eb |
59 | |||
60 | /* | ||
61 | * Tracks how many cpusets are currently defined in system. | ||
62 | * When there is only one cpuset (the root cpuset) we can | ||
63 | * short circuit some hooks. | ||
64 | */ | ||
65 | int number_of_cpusets __read_mostly; | ||
66 | |||
67 | /* See "Frequency meter" comments, below. */ | ||
68 | |||
69 | struct fmeter { | ||
70 | int cnt; /* unprocessed events count */ | ||
71 | int val; /* most recent output value */ | ||
72 | time_t time; /* clock (secs) when val computed */ | ||
73 | spinlock_t lock; /* guards read or write of above */ | ||
74 | }; | ||
58 | 75 | ||
59 | struct cpuset { | 76 | struct cpuset { |
60 | unsigned long flags; /* "unsigned long" so bitops work */ | 77 | unsigned long flags; /* "unsigned long" so bitops work */ |
@@ -80,13 +97,16 @@ struct cpuset { | |||
80 | * Copy of global cpuset_mems_generation as of the most | 97 | * Copy of global cpuset_mems_generation as of the most |
81 | * recent time this cpuset changed its mems_allowed. | 98 | * recent time this cpuset changed its mems_allowed. |
82 | */ | 99 | */ |
83 | int mems_generation; | 100 | int mems_generation; |
101 | |||
102 | struct fmeter fmeter; /* memory_pressure filter */ | ||
84 | }; | 103 | }; |
85 | 104 | ||
86 | /* bits in struct cpuset flags field */ | 105 | /* bits in struct cpuset flags field */ |
87 | typedef enum { | 106 | typedef enum { |
88 | CS_CPU_EXCLUSIVE, | 107 | CS_CPU_EXCLUSIVE, |
89 | CS_MEM_EXCLUSIVE, | 108 | CS_MEM_EXCLUSIVE, |
109 | CS_MEMORY_MIGRATE, | ||
90 | CS_REMOVED, | 110 | CS_REMOVED, |
91 | CS_NOTIFY_ON_RELEASE | 111 | CS_NOTIFY_ON_RELEASE |
92 | } cpuset_flagbits_t; | 112 | } cpuset_flagbits_t; |
@@ -112,6 +132,11 @@ static inline int notify_on_release(const struct cpuset *cs) | |||
112 | return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 132 | return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
113 | } | 133 | } |
114 | 134 | ||
135 | static inline int is_memory_migrate(const struct cpuset *cs) | ||
136 | { | ||
137 | return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); | ||
138 | } | ||
139 | |||
115 | /* | 140 | /* |
116 | * Increment this atomic integer everytime any cpuset changes its | 141 | * Increment this atomic integer everytime any cpuset changes its |
117 | * mems_allowed value. Users of cpusets can track this generation | 142 | * mems_allowed value. Users of cpusets can track this generation |
@@ -137,13 +162,10 @@ static struct cpuset top_cpuset = { | |||
137 | .count = ATOMIC_INIT(0), | 162 | .count = ATOMIC_INIT(0), |
138 | .sibling = LIST_HEAD_INIT(top_cpuset.sibling), | 163 | .sibling = LIST_HEAD_INIT(top_cpuset.sibling), |
139 | .children = LIST_HEAD_INIT(top_cpuset.children), | 164 | .children = LIST_HEAD_INIT(top_cpuset.children), |
140 | .parent = NULL, | ||
141 | .dentry = NULL, | ||
142 | .mems_generation = 0, | ||
143 | }; | 165 | }; |
144 | 166 | ||
145 | static struct vfsmount *cpuset_mount; | 167 | static struct vfsmount *cpuset_mount; |
146 | static struct super_block *cpuset_sb = NULL; | 168 | static struct super_block *cpuset_sb; |
147 | 169 | ||
148 | /* | 170 | /* |
149 | * We have two global cpuset semaphores below. They can nest. | 171 | * We have two global cpuset semaphores below. They can nest. |
@@ -227,6 +249,11 @@ static struct super_block *cpuset_sb = NULL; | |||
227 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock | 249 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock |
228 | * (task->alloc_lock) already in the task_struct routinely used for | 250 | * (task->alloc_lock) already in the task_struct routinely used for |
229 | * such matters. | 251 | * such matters. |
252 | * | ||
253 | * P.S. One more locking exception. RCU is used to guard the | ||
254 | * update of a tasks cpuset pointer by attach_task() and the | ||
255 | * access of task->cpuset->mems_generation via that pointer in | ||
256 | * the routine cpuset_update_task_memory_state(). | ||
230 | */ | 257 | */ |
231 | 258 | ||
232 | static DECLARE_MUTEX(manage_sem); | 259 | static DECLARE_MUTEX(manage_sem); |
@@ -304,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) | |||
304 | spin_lock(&dcache_lock); | 331 | spin_lock(&dcache_lock); |
305 | node = dentry->d_subdirs.next; | 332 | node = dentry->d_subdirs.next; |
306 | while (node != &dentry->d_subdirs) { | 333 | while (node != &dentry->d_subdirs) { |
307 | struct dentry *d = list_entry(node, struct dentry, d_child); | 334 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); |
308 | list_del_init(node); | 335 | list_del_init(node); |
309 | if (d->d_inode) { | 336 | if (d->d_inode) { |
310 | d = dget_locked(d); | 337 | d = dget_locked(d); |
@@ -316,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) | |||
316 | } | 343 | } |
317 | node = dentry->d_subdirs.next; | 344 | node = dentry->d_subdirs.next; |
318 | } | 345 | } |
319 | list_del_init(&dentry->d_child); | 346 | list_del_init(&dentry->d_u.d_child); |
320 | spin_unlock(&dcache_lock); | 347 | spin_unlock(&dcache_lock); |
321 | remove_dir(dentry); | 348 | remove_dir(dentry); |
322 | } | 349 | } |
@@ -570,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
570 | BUG_ON(!nodes_intersects(*pmask, node_online_map)); | 597 | BUG_ON(!nodes_intersects(*pmask, node_online_map)); |
571 | } | 598 | } |
572 | 599 | ||
573 | /* | 600 | /** |
574 | * Refresh current tasks mems_allowed and mems_generation from current | 601 | * cpuset_update_task_memory_state - update task memory placement |
575 | * tasks cpuset. | 602 | * |
603 | * If the current tasks cpusets mems_allowed changed behind our | ||
604 | * backs, update current->mems_allowed, mems_generation and task NUMA | ||
605 | * mempolicy to the new value. | ||
576 | * | 606 | * |
577 | * Call without callback_sem or task_lock() held. May be called with | 607 | * Task mempolicy is updated by rebinding it relative to the |
578 | * or without manage_sem held. Will acquire task_lock() and might | 608 | * current->cpuset if a task has its memory placement changed. |
579 | * acquire callback_sem during call. | 609 | * Do not call this routine if in_interrupt(). |
580 | * | 610 | * |
581 | * The task_lock() is required to dereference current->cpuset safely. | 611 | * Call without callback_sem or task_lock() held. May be called |
582 | * Without it, we could pick up the pointer value of current->cpuset | 612 | * with or without manage_sem held. Doesn't need task_lock to guard |
583 | * in one instruction, and then attach_task could give us a different | 613 | * against another task changing a non-NULL cpuset pointer to NULL, |
584 | * cpuset, and then the cpuset we had could be removed and freed, | 614 | * as that is only done by a task on itself, and if the current task |
585 | * and then on our next instruction, we could dereference a no longer | 615 | * is here, it is not simultaneously in the exit code NULL'ing its |
586 | * valid cpuset pointer to get its mems_generation field. | 616 | * cpuset pointer. This routine also might acquire callback_sem and |
617 | * current->mm->mmap_sem during call. | ||
618 | * | ||
619 | * Reading current->cpuset->mems_generation doesn't need task_lock | ||
620 | * to guard the current->cpuset dereference, because it is guarded | ||
621 | * from concurrent freeing of current->cpuset by attach_task(), | ||
622 | * using RCU. | ||
623 | * | ||
624 | * The rcu_dereference() is technically probably not needed, | ||
625 | * as I don't actually mind if I see a new cpuset pointer but | ||
626 | * an old value of mems_generation. However this really only | ||
627 | * matters on alpha systems using cpusets heavily. If I dropped | ||
628 | * that rcu_dereference(), it would save them a memory barrier. | ||
629 | * For all other arch's, rcu_dereference is a no-op anyway, and for | ||
630 | * alpha systems not using cpusets, another planned optimization, | ||
631 | * avoiding the rcu critical section for tasks in the root cpuset | ||
632 | * which is statically allocated, so can't vanish, will make this | ||
633 | * irrelevant. Better to use RCU as intended, than to engage in | ||
634 | * some cute trick to save a memory barrier that is impossible to | ||
635 | * test, for alpha systems using cpusets heavily, which might not | ||
636 | * even exist. | ||
587 | * | 637 | * |
588 | * This routine is needed to update the per-task mems_allowed data, | 638 | * This routine is needed to update the per-task mems_allowed data, |
589 | * within the tasks context, when it is trying to allocate memory | 639 | * within the tasks context, when it is trying to allocate memory |
@@ -591,27 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
591 | * task has been modifying its cpuset. | 641 | * task has been modifying its cpuset. |
592 | */ | 642 | */ |
593 | 643 | ||
594 | static void refresh_mems(void) | 644 | void cpuset_update_task_memory_state() |
595 | { | 645 | { |
596 | int my_cpusets_mem_gen; | 646 | int my_cpusets_mem_gen; |
647 | struct task_struct *tsk = current; | ||
648 | struct cpuset *cs; | ||
597 | 649 | ||
598 | task_lock(current); | 650 | if (tsk->cpuset == &top_cpuset) { |
599 | my_cpusets_mem_gen = current->cpuset->mems_generation; | 651 | /* Don't need rcu for top_cpuset. It's never freed. */ |
600 | task_unlock(current); | 652 | my_cpusets_mem_gen = top_cpuset.mems_generation; |
601 | 653 | } else { | |
602 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { | 654 | rcu_read_lock(); |
603 | struct cpuset *cs; | 655 | cs = rcu_dereference(tsk->cpuset); |
604 | nodemask_t oldmem = current->mems_allowed; | 656 | my_cpusets_mem_gen = cs->mems_generation; |
657 | rcu_read_unlock(); | ||
658 | } | ||
605 | 659 | ||
660 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { | ||
606 | down(&callback_sem); | 661 | down(&callback_sem); |
607 | task_lock(current); | 662 | task_lock(tsk); |
608 | cs = current->cpuset; | 663 | cs = tsk->cpuset; /* Maybe changed when task not locked */ |
609 | guarantee_online_mems(cs, &current->mems_allowed); | 664 | guarantee_online_mems(cs, &tsk->mems_allowed); |
610 | current->cpuset_mems_generation = cs->mems_generation; | 665 | tsk->cpuset_mems_generation = cs->mems_generation; |
611 | task_unlock(current); | 666 | task_unlock(tsk); |
612 | up(&callback_sem); | 667 | up(&callback_sem); |
613 | if (!nodes_equal(oldmem, current->mems_allowed)) | 668 | mpol_rebind_task(tsk, &tsk->mems_allowed); |
614 | numa_policy_rebind(&oldmem, &current->mems_allowed); | ||
615 | } | 669 | } |
616 | } | 670 | } |
617 | 671 | ||
@@ -766,36 +820,150 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
766 | } | 820 | } |
767 | 821 | ||
768 | /* | 822 | /* |
823 | * Handle user request to change the 'mems' memory placement | ||
824 | * of a cpuset. Needs to validate the request, update the | ||
825 | * cpusets mems_allowed and mems_generation, and for each | ||
826 | * task in the cpuset, rebind any vma mempolicies and if | ||
827 | * the cpuset is marked 'memory_migrate', migrate the tasks | ||
828 | * pages to the new memory. | ||
829 | * | ||
769 | * Call with manage_sem held. May take callback_sem during call. | 830 | * Call with manage_sem held. May take callback_sem during call. |
831 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | ||
832 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | ||
833 | * their mempolicies to the cpusets new mems_allowed. | ||
770 | */ | 834 | */ |
771 | 835 | ||
772 | static int update_nodemask(struct cpuset *cs, char *buf) | 836 | static int update_nodemask(struct cpuset *cs, char *buf) |
773 | { | 837 | { |
774 | struct cpuset trialcs; | 838 | struct cpuset trialcs; |
839 | nodemask_t oldmem; | ||
840 | struct task_struct *g, *p; | ||
841 | struct mm_struct **mmarray; | ||
842 | int i, n, ntasks; | ||
843 | int migrate; | ||
844 | int fudge; | ||
775 | int retval; | 845 | int retval; |
776 | 846 | ||
777 | trialcs = *cs; | 847 | trialcs = *cs; |
778 | retval = nodelist_parse(buf, trialcs.mems_allowed); | 848 | retval = nodelist_parse(buf, trialcs.mems_allowed); |
779 | if (retval < 0) | 849 | if (retval < 0) |
780 | return retval; | 850 | goto done; |
781 | nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); | 851 | nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); |
782 | if (nodes_empty(trialcs.mems_allowed)) | 852 | oldmem = cs->mems_allowed; |
783 | return -ENOSPC; | 853 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { |
854 | retval = 0; /* Too easy - nothing to do */ | ||
855 | goto done; | ||
856 | } | ||
857 | if (nodes_empty(trialcs.mems_allowed)) { | ||
858 | retval = -ENOSPC; | ||
859 | goto done; | ||
860 | } | ||
784 | retval = validate_change(cs, &trialcs); | 861 | retval = validate_change(cs, &trialcs); |
785 | if (retval == 0) { | 862 | if (retval < 0) |
786 | down(&callback_sem); | 863 | goto done; |
787 | cs->mems_allowed = trialcs.mems_allowed; | 864 | |
788 | atomic_inc(&cpuset_mems_generation); | 865 | down(&callback_sem); |
789 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 866 | cs->mems_allowed = trialcs.mems_allowed; |
790 | up(&callback_sem); | 867 | atomic_inc(&cpuset_mems_generation); |
868 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | ||
869 | up(&callback_sem); | ||
870 | |||
871 | set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ | ||
872 | |||
873 | fudge = 10; /* spare mmarray[] slots */ | ||
874 | fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ | ||
875 | retval = -ENOMEM; | ||
876 | |||
877 | /* | ||
878 | * Allocate mmarray[] to hold mm reference for each task | ||
879 | * in cpuset cs. Can't kmalloc GFP_KERNEL while holding | ||
880 | * tasklist_lock. We could use GFP_ATOMIC, but with a | ||
881 | * few more lines of code, we can retry until we get a big | ||
882 | * enough mmarray[] w/o using GFP_ATOMIC. | ||
883 | */ | ||
884 | while (1) { | ||
885 | ntasks = atomic_read(&cs->count); /* guess */ | ||
886 | ntasks += fudge; | ||
887 | mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); | ||
888 | if (!mmarray) | ||
889 | goto done; | ||
890 | write_lock_irq(&tasklist_lock); /* block fork */ | ||
891 | if (atomic_read(&cs->count) <= ntasks) | ||
892 | break; /* got enough */ | ||
893 | write_unlock_irq(&tasklist_lock); /* try again */ | ||
894 | kfree(mmarray); | ||
791 | } | 895 | } |
896 | |||
897 | n = 0; | ||
898 | |||
899 | /* Load up mmarray[] with mm reference for each task in cpuset. */ | ||
900 | do_each_thread(g, p) { | ||
901 | struct mm_struct *mm; | ||
902 | |||
903 | if (n >= ntasks) { | ||
904 | printk(KERN_WARNING | ||
905 | "Cpuset mempolicy rebind incomplete.\n"); | ||
906 | continue; | ||
907 | } | ||
908 | if (p->cpuset != cs) | ||
909 | continue; | ||
910 | mm = get_task_mm(p); | ||
911 | if (!mm) | ||
912 | continue; | ||
913 | mmarray[n++] = mm; | ||
914 | } while_each_thread(g, p); | ||
915 | write_unlock_irq(&tasklist_lock); | ||
916 | |||
917 | /* | ||
918 | * Now that we've dropped the tasklist spinlock, we can | ||
919 | * rebind the vma mempolicies of each mm in mmarray[] to their | ||
920 | * new cpuset, and release that mm. The mpol_rebind_mm() | ||
921 | * call takes mmap_sem, which we couldn't take while holding | ||
922 | * tasklist_lock. Forks can happen again now - the mpol_copy() | ||
923 | * cpuset_being_rebound check will catch such forks, and rebind | ||
924 | * their vma mempolicies too. Because we still hold the global | ||
925 | * cpuset manage_sem, we know that no other rebind effort will | ||
926 | * be contending for the global variable cpuset_being_rebound. | ||
927 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | ||
928 | * is idempotent. Also migrate pages in each mm to new nodes. | ||
929 | */ | ||
930 | migrate = is_memory_migrate(cs); | ||
931 | for (i = 0; i < n; i++) { | ||
932 | struct mm_struct *mm = mmarray[i]; | ||
933 | |||
934 | mpol_rebind_mm(mm, &cs->mems_allowed); | ||
935 | if (migrate) { | ||
936 | do_migrate_pages(mm, &oldmem, &cs->mems_allowed, | ||
937 | MPOL_MF_MOVE_ALL); | ||
938 | } | ||
939 | mmput(mm); | ||
940 | } | ||
941 | |||
942 | /* We're done rebinding vma's to this cpusets new mems_allowed. */ | ||
943 | kfree(mmarray); | ||
944 | set_cpuset_being_rebound(NULL); | ||
945 | retval = 0; | ||
946 | done: | ||
792 | return retval; | 947 | return retval; |
793 | } | 948 | } |
794 | 949 | ||
795 | /* | 950 | /* |
951 | * Call with manage_sem held. | ||
952 | */ | ||
953 | |||
954 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | ||
955 | { | ||
956 | if (simple_strtoul(buf, NULL, 10) != 0) | ||
957 | cpuset_memory_pressure_enabled = 1; | ||
958 | else | ||
959 | cpuset_memory_pressure_enabled = 0; | ||
960 | return 0; | ||
961 | } | ||
962 | |||
963 | /* | ||
796 | * update_flag - read a 0 or a 1 in a file and update associated flag | 964 | * update_flag - read a 0 or a 1 in a file and update associated flag |
797 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 965 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
798 | * CS_NOTIFY_ON_RELEASE) | 966 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) |
799 | * cs: the cpuset to update | 967 | * cs: the cpuset to update |
800 | * buf: the buffer where we read the 0 or 1 | 968 | * buf: the buffer where we read the 0 or 1 |
801 | * | 969 | * |
@@ -834,6 +1002,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
834 | } | 1002 | } |
835 | 1003 | ||
836 | /* | 1004 | /* |
1005 | * Frequency meter - How fast is some event occurring? | ||
1006 | * | ||
1007 | * These routines manage a digitally filtered, constant time based, | ||
1008 | * event frequency meter. There are four routines: | ||
1009 | * fmeter_init() - initialize a frequency meter. | ||
1010 | * fmeter_markevent() - called each time the event happens. | ||
1011 | * fmeter_getrate() - returns the recent rate of such events. | ||
1012 | * fmeter_update() - internal routine used to update fmeter. | ||
1013 | * | ||
1014 | * A common data structure is passed to each of these routines, | ||
1015 | * which is used to keep track of the state required to manage the | ||
1016 | * frequency meter and its digital filter. | ||
1017 | * | ||
1018 | * The filter works on the number of events marked per unit time. | ||
1019 | * The filter is single-pole low-pass recursive (IIR). The time unit | ||
1020 | * is 1 second. Arithmetic is done using 32-bit integers scaled to | ||
1021 | * simulate 3 decimal digits of precision (multiplied by 1000). | ||
1022 | * | ||
1023 | * With an FM_COEF of 933, and a time base of 1 second, the filter | ||
1024 | * has a half-life of 10 seconds, meaning that if the events quit | ||
1025 | * happening, then the rate returned from the fmeter_getrate() | ||
1026 | * will be cut in half each 10 seconds, until it converges to zero. | ||
1027 | * | ||
1028 | * It is not worth doing a real infinitely recursive filter. If more | ||
1029 | * than FM_MAXTICKS ticks have elapsed since the last filter event, | ||
1030 | * just compute FM_MAXTICKS ticks worth, by which point the level | ||
1031 | * will be stable. | ||
1032 | * | ||
1033 | * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid | ||
1034 | * arithmetic overflow in the fmeter_update() routine. | ||
1035 | * | ||
1036 | * Given the simple 32 bit integer arithmetic used, this meter works | ||
1037 | * best for reporting rates between one per millisecond (msec) and | ||
1038 | * one per 32 (approx) seconds. At constant rates faster than one | ||
1039 | * per msec it maxes out at values just under 1,000,000. At constant | ||
1040 | * rates between one per msec, and one per second it will stabilize | ||
1041 | * to a value N*1000, where N is the rate of events per second. | ||
1042 | * At constant rates between one per second and one per 32 seconds, | ||
1043 | * it will be choppy, moving up on the seconds that have an event, | ||
1044 | * and then decaying until the next event. At rates slower than | ||
1045 | * about one in 32 seconds, it decays all the way back to zero between | ||
1046 | * each event. | ||
1047 | */ | ||
1048 | |||
1049 | #define FM_COEF 933 /* coefficient for half-life of 10 secs */ | ||
1050 | #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ | ||
1051 | #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ | ||
1052 | #define FM_SCALE 1000 /* faux fixed point scale */ | ||
1053 | |||
1054 | /* Initialize a frequency meter */ | ||
1055 | static void fmeter_init(struct fmeter *fmp) | ||
1056 | { | ||
1057 | fmp->cnt = 0; | ||
1058 | fmp->val = 0; | ||
1059 | fmp->time = 0; | ||
1060 | spin_lock_init(&fmp->lock); | ||
1061 | } | ||
1062 | |||
1063 | /* Internal meter update - process cnt events and update value */ | ||
1064 | static void fmeter_update(struct fmeter *fmp) | ||
1065 | { | ||
1066 | time_t now = get_seconds(); | ||
1067 | time_t ticks = now - fmp->time; | ||
1068 | |||
1069 | if (ticks == 0) | ||
1070 | return; | ||
1071 | |||
1072 | ticks = min(FM_MAXTICKS, ticks); | ||
1073 | while (ticks-- > 0) | ||
1074 | fmp->val = (FM_COEF * fmp->val) / FM_SCALE; | ||
1075 | fmp->time = now; | ||
1076 | |||
1077 | fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; | ||
1078 | fmp->cnt = 0; | ||
1079 | } | ||
1080 | |||
1081 | /* Process any previous ticks, then bump cnt by one (times scale). */ | ||
1082 | static void fmeter_markevent(struct fmeter *fmp) | ||
1083 | { | ||
1084 | spin_lock(&fmp->lock); | ||
1085 | fmeter_update(fmp); | ||
1086 | fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); | ||
1087 | spin_unlock(&fmp->lock); | ||
1088 | } | ||
1089 | |||
1090 | /* Process any previous ticks, then return current value. */ | ||
1091 | static int fmeter_getrate(struct fmeter *fmp) | ||
1092 | { | ||
1093 | int val; | ||
1094 | |||
1095 | spin_lock(&fmp->lock); | ||
1096 | fmeter_update(fmp); | ||
1097 | val = fmp->val; | ||
1098 | spin_unlock(&fmp->lock); | ||
1099 | return val; | ||
1100 | } | ||
1101 | |||
1102 | /* | ||
837 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly | 1103 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly |
838 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | 1104 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be |
839 | * notified on release. | 1105 | * notified on release. |
@@ -848,6 +1114,8 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
848 | struct task_struct *tsk; | 1114 | struct task_struct *tsk; |
849 | struct cpuset *oldcs; | 1115 | struct cpuset *oldcs; |
850 | cpumask_t cpus; | 1116 | cpumask_t cpus; |
1117 | nodemask_t from, to; | ||
1118 | struct mm_struct *mm; | ||
851 | 1119 | ||
852 | if (sscanf(pidbuf, "%d", &pid) != 1) | 1120 | if (sscanf(pidbuf, "%d", &pid) != 1) |
853 | return -EIO; | 1121 | return -EIO; |
@@ -887,14 +1155,27 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
887 | return -ESRCH; | 1155 | return -ESRCH; |
888 | } | 1156 | } |
889 | atomic_inc(&cs->count); | 1157 | atomic_inc(&cs->count); |
890 | tsk->cpuset = cs; | 1158 | rcu_assign_pointer(tsk->cpuset, cs); |
891 | task_unlock(tsk); | 1159 | task_unlock(tsk); |
892 | 1160 | ||
893 | guarantee_online_cpus(cs, &cpus); | 1161 | guarantee_online_cpus(cs, &cpus); |
894 | set_cpus_allowed(tsk, cpus); | 1162 | set_cpus_allowed(tsk, cpus); |
895 | 1163 | ||
1164 | from = oldcs->mems_allowed; | ||
1165 | to = cs->mems_allowed; | ||
1166 | |||
896 | up(&callback_sem); | 1167 | up(&callback_sem); |
1168 | |||
1169 | mm = get_task_mm(tsk); | ||
1170 | if (mm) { | ||
1171 | mpol_rebind_mm(mm, &to); | ||
1172 | mmput(mm); | ||
1173 | } | ||
1174 | |||
1175 | if (is_memory_migrate(cs)) | ||
1176 | do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); | ||
897 | put_task_struct(tsk); | 1177 | put_task_struct(tsk); |
1178 | synchronize_rcu(); | ||
898 | if (atomic_dec_and_test(&oldcs->count)) | 1179 | if (atomic_dec_and_test(&oldcs->count)) |
899 | check_for_release(oldcs, ppathbuf); | 1180 | check_for_release(oldcs, ppathbuf); |
900 | return 0; | 1181 | return 0; |
@@ -905,11 +1186,14 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
905 | typedef enum { | 1186 | typedef enum { |
906 | FILE_ROOT, | 1187 | FILE_ROOT, |
907 | FILE_DIR, | 1188 | FILE_DIR, |
1189 | FILE_MEMORY_MIGRATE, | ||
908 | FILE_CPULIST, | 1190 | FILE_CPULIST, |
909 | FILE_MEMLIST, | 1191 | FILE_MEMLIST, |
910 | FILE_CPU_EXCLUSIVE, | 1192 | FILE_CPU_EXCLUSIVE, |
911 | FILE_MEM_EXCLUSIVE, | 1193 | FILE_MEM_EXCLUSIVE, |
912 | FILE_NOTIFY_ON_RELEASE, | 1194 | FILE_NOTIFY_ON_RELEASE, |
1195 | FILE_MEMORY_PRESSURE_ENABLED, | ||
1196 | FILE_MEMORY_PRESSURE, | ||
913 | FILE_TASKLIST, | 1197 | FILE_TASKLIST, |
914 | } cpuset_filetype_t; | 1198 | } cpuset_filetype_t; |
915 | 1199 | ||
@@ -960,6 +1244,15 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
960 | case FILE_NOTIFY_ON_RELEASE: | 1244 | case FILE_NOTIFY_ON_RELEASE: |
961 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); | 1245 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); |
962 | break; | 1246 | break; |
1247 | case FILE_MEMORY_MIGRATE: | ||
1248 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); | ||
1249 | break; | ||
1250 | case FILE_MEMORY_PRESSURE_ENABLED: | ||
1251 | retval = update_memory_pressure_enabled(cs, buffer); | ||
1252 | break; | ||
1253 | case FILE_MEMORY_PRESSURE: | ||
1254 | retval = -EACCES; | ||
1255 | break; | ||
963 | case FILE_TASKLIST: | 1256 | case FILE_TASKLIST: |
964 | retval = attach_task(cs, buffer, &pathbuf); | 1257 | retval = attach_task(cs, buffer, &pathbuf); |
965 | break; | 1258 | break; |
@@ -1060,6 +1353,15 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
1060 | case FILE_NOTIFY_ON_RELEASE: | 1353 | case FILE_NOTIFY_ON_RELEASE: |
1061 | *s++ = notify_on_release(cs) ? '1' : '0'; | 1354 | *s++ = notify_on_release(cs) ? '1' : '0'; |
1062 | break; | 1355 | break; |
1356 | case FILE_MEMORY_MIGRATE: | ||
1357 | *s++ = is_memory_migrate(cs) ? '1' : '0'; | ||
1358 | break; | ||
1359 | case FILE_MEMORY_PRESSURE_ENABLED: | ||
1360 | *s++ = cpuset_memory_pressure_enabled ? '1' : '0'; | ||
1361 | break; | ||
1362 | case FILE_MEMORY_PRESSURE: | ||
1363 | s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); | ||
1364 | break; | ||
1063 | default: | 1365 | default: |
1064 | retval = -EINVAL; | 1366 | retval = -EINVAL; |
1065 | goto out; | 1367 | goto out; |
@@ -1178,7 +1480,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) | |||
1178 | 1480 | ||
1179 | /* | 1481 | /* |
1180 | * cpuset_create_dir - create a directory for an object. | 1482 | * cpuset_create_dir - create a directory for an object. |
1181 | * cs: the cpuset we create the directory for. | 1483 | * cs: the cpuset we create the directory for. |
1182 | * It must have a valid ->parent field | 1484 | * It must have a valid ->parent field |
1183 | * And we are going to fill its ->dentry field. | 1485 | * And we are going to fill its ->dentry field. |
1184 | * name: The name to give to the cpuset directory. Will be copied. | 1486 | * name: The name to give to the cpuset directory. Will be copied. |
@@ -1211,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) | |||
1211 | struct dentry *dentry; | 1513 | struct dentry *dentry; |
1212 | int error; | 1514 | int error; |
1213 | 1515 | ||
1214 | down(&dir->d_inode->i_sem); | 1516 | mutex_lock(&dir->d_inode->i_mutex); |
1215 | dentry = cpuset_get_dentry(dir, cft->name); | 1517 | dentry = cpuset_get_dentry(dir, cft->name); |
1216 | if (!IS_ERR(dentry)) { | 1518 | if (!IS_ERR(dentry)) { |
1217 | error = cpuset_create_file(dentry, 0644 | S_IFREG); | 1519 | error = cpuset_create_file(dentry, 0644 | S_IFREG); |
@@ -1220,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) | |||
1220 | dput(dentry); | 1522 | dput(dentry); |
1221 | } else | 1523 | } else |
1222 | error = PTR_ERR(dentry); | 1524 | error = PTR_ERR(dentry); |
1223 | up(&dir->d_inode->i_sem); | 1525 | mutex_unlock(&dir->d_inode->i_mutex); |
1224 | return error; | 1526 | return error; |
1225 | } | 1527 | } |
1226 | 1528 | ||
@@ -1252,7 +1554,7 @@ struct ctr_struct { | |||
1252 | * when reading out p->cpuset, as we don't really care if it changes | 1554 | * when reading out p->cpuset, as we don't really care if it changes |
1253 | * on the next cycle, and we are not going to try to dereference it. | 1555 | * on the next cycle, and we are not going to try to dereference it. |
1254 | */ | 1556 | */ |
1255 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) | 1557 | static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) |
1256 | { | 1558 | { |
1257 | int n = 0; | 1559 | int n = 0; |
1258 | struct task_struct *g, *p; | 1560 | struct task_struct *g, *p; |
@@ -1408,6 +1710,21 @@ static struct cftype cft_notify_on_release = { | |||
1408 | .private = FILE_NOTIFY_ON_RELEASE, | 1710 | .private = FILE_NOTIFY_ON_RELEASE, |
1409 | }; | 1711 | }; |
1410 | 1712 | ||
1713 | static struct cftype cft_memory_migrate = { | ||
1714 | .name = "memory_migrate", | ||
1715 | .private = FILE_MEMORY_MIGRATE, | ||
1716 | }; | ||
1717 | |||
1718 | static struct cftype cft_memory_pressure_enabled = { | ||
1719 | .name = "memory_pressure_enabled", | ||
1720 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1721 | }; | ||
1722 | |||
1723 | static struct cftype cft_memory_pressure = { | ||
1724 | .name = "memory_pressure", | ||
1725 | .private = FILE_MEMORY_PRESSURE, | ||
1726 | }; | ||
1727 | |||
1411 | static int cpuset_populate_dir(struct dentry *cs_dentry) | 1728 | static int cpuset_populate_dir(struct dentry *cs_dentry) |
1412 | { | 1729 | { |
1413 | int err; | 1730 | int err; |
@@ -1422,6 +1739,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) | |||
1422 | return err; | 1739 | return err; |
1423 | if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) | 1740 | if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) |
1424 | return err; | 1741 | return err; |
1742 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) | ||
1743 | return err; | ||
1744 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) | ||
1745 | return err; | ||
1425 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) | 1746 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) |
1426 | return err; | 1747 | return err; |
1427 | return 0; | 1748 | return 0; |
@@ -1446,7 +1767,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1446 | return -ENOMEM; | 1767 | return -ENOMEM; |
1447 | 1768 | ||
1448 | down(&manage_sem); | 1769 | down(&manage_sem); |
1449 | refresh_mems(); | 1770 | cpuset_update_task_memory_state(); |
1450 | cs->flags = 0; | 1771 | cs->flags = 0; |
1451 | if (notify_on_release(parent)) | 1772 | if (notify_on_release(parent)) |
1452 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1773 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
@@ -1457,11 +1778,13 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1457 | INIT_LIST_HEAD(&cs->children); | 1778 | INIT_LIST_HEAD(&cs->children); |
1458 | atomic_inc(&cpuset_mems_generation); | 1779 | atomic_inc(&cpuset_mems_generation); |
1459 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 1780 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
1781 | fmeter_init(&cs->fmeter); | ||
1460 | 1782 | ||
1461 | cs->parent = parent; | 1783 | cs->parent = parent; |
1462 | 1784 | ||
1463 | down(&callback_sem); | 1785 | down(&callback_sem); |
1464 | list_add(&cs->sibling, &cs->parent->children); | 1786 | list_add(&cs->sibling, &cs->parent->children); |
1787 | number_of_cpusets++; | ||
1465 | up(&callback_sem); | 1788 | up(&callback_sem); |
1466 | 1789 | ||
1467 | err = cpuset_create_dir(cs, name, mode); | 1790 | err = cpuset_create_dir(cs, name, mode); |
@@ -1470,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1470 | 1793 | ||
1471 | /* | 1794 | /* |
1472 | * Release manage_sem before cpuset_populate_dir() because it | 1795 | * Release manage_sem before cpuset_populate_dir() because it |
1473 | * will down() this new directory's i_sem and if we race with | 1796 | * will down() this new directory's i_mutex and if we race with |
1474 | * another mkdir, we might deadlock. | 1797 | * another mkdir, we might deadlock. |
1475 | */ | 1798 | */ |
1476 | up(&manage_sem); | 1799 | up(&manage_sem); |
@@ -1489,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
1489 | { | 1812 | { |
1490 | struct cpuset *c_parent = dentry->d_parent->d_fsdata; | 1813 | struct cpuset *c_parent = dentry->d_parent->d_fsdata; |
1491 | 1814 | ||
1492 | /* the vfs holds inode->i_sem already */ | 1815 | /* the vfs holds inode->i_mutex already */ |
1493 | return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); | 1816 | return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); |
1494 | } | 1817 | } |
1495 | 1818 | ||
@@ -1500,10 +1823,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1500 | struct cpuset *parent; | 1823 | struct cpuset *parent; |
1501 | char *pathbuf = NULL; | 1824 | char *pathbuf = NULL; |
1502 | 1825 | ||
1503 | /* the vfs holds both inode->i_sem already */ | 1826 | /* the vfs holds both inode->i_mutex already */ |
1504 | 1827 | ||
1505 | down(&manage_sem); | 1828 | down(&manage_sem); |
1506 | refresh_mems(); | 1829 | cpuset_update_task_memory_state(); |
1507 | if (atomic_read(&cs->count) > 0) { | 1830 | if (atomic_read(&cs->count) > 0) { |
1508 | up(&manage_sem); | 1831 | up(&manage_sem); |
1509 | return -EBUSY; | 1832 | return -EBUSY; |
@@ -1524,6 +1847,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1524 | spin_unlock(&d->d_lock); | 1847 | spin_unlock(&d->d_lock); |
1525 | cpuset_d_remove_dir(d); | 1848 | cpuset_d_remove_dir(d); |
1526 | dput(d); | 1849 | dput(d); |
1850 | number_of_cpusets--; | ||
1527 | up(&callback_sem); | 1851 | up(&callback_sem); |
1528 | if (list_empty(&parent->children)) | 1852 | if (list_empty(&parent->children)) |
1529 | check_for_release(parent, &pathbuf); | 1853 | check_for_release(parent, &pathbuf); |
@@ -1532,6 +1856,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1532 | return 0; | 1856 | return 0; |
1533 | } | 1857 | } |
1534 | 1858 | ||
1859 | /* | ||
1860 | * cpuset_init_early - just enough so that the calls to | ||
1861 | * cpuset_update_task_memory_state() in early init code | ||
1862 | * are harmless. | ||
1863 | */ | ||
1864 | |||
1865 | int __init cpuset_init_early(void) | ||
1866 | { | ||
1867 | struct task_struct *tsk = current; | ||
1868 | |||
1869 | tsk->cpuset = &top_cpuset; | ||
1870 | tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); | ||
1871 | return 0; | ||
1872 | } | ||
1873 | |||
1535 | /** | 1874 | /** |
1536 | * cpuset_init - initialize cpusets at system boot | 1875 | * cpuset_init - initialize cpusets at system boot |
1537 | * | 1876 | * |
@@ -1546,6 +1885,7 @@ int __init cpuset_init(void) | |||
1546 | top_cpuset.cpus_allowed = CPU_MASK_ALL; | 1885 | top_cpuset.cpus_allowed = CPU_MASK_ALL; |
1547 | top_cpuset.mems_allowed = NODE_MASK_ALL; | 1886 | top_cpuset.mems_allowed = NODE_MASK_ALL; |
1548 | 1887 | ||
1888 | fmeter_init(&top_cpuset.fmeter); | ||
1549 | atomic_inc(&cpuset_mems_generation); | 1889 | atomic_inc(&cpuset_mems_generation); |
1550 | top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); | 1890 | top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); |
1551 | 1891 | ||
@@ -1566,7 +1906,11 @@ int __init cpuset_init(void) | |||
1566 | root->d_inode->i_nlink++; | 1906 | root->d_inode->i_nlink++; |
1567 | top_cpuset.dentry = root; | 1907 | top_cpuset.dentry = root; |
1568 | root->d_inode->i_op = &cpuset_dir_inode_operations; | 1908 | root->d_inode->i_op = &cpuset_dir_inode_operations; |
1909 | number_of_cpusets = 1; | ||
1569 | err = cpuset_populate_dir(root); | 1910 | err = cpuset_populate_dir(root); |
1911 | /* memory_pressure_enabled is in root cpuset only */ | ||
1912 | if (err == 0) | ||
1913 | err = cpuset_add_file(root, &cft_memory_pressure_enabled); | ||
1570 | out: | 1914 | out: |
1571 | return err; | 1915 | return err; |
1572 | } | 1916 | } |
@@ -1632,15 +1976,13 @@ void cpuset_fork(struct task_struct *child) | |||
1632 | * | 1976 | * |
1633 | * We don't need to task_lock() this reference to tsk->cpuset, | 1977 | * We don't need to task_lock() this reference to tsk->cpuset, |
1634 | * because tsk is already marked PF_EXITING, so attach_task() won't | 1978 | * because tsk is already marked PF_EXITING, so attach_task() won't |
1635 | * mess with it. | 1979 | * mess with it, or task is a failed fork, never visible to attach_task. |
1636 | **/ | 1980 | **/ |
1637 | 1981 | ||
1638 | void cpuset_exit(struct task_struct *tsk) | 1982 | void cpuset_exit(struct task_struct *tsk) |
1639 | { | 1983 | { |
1640 | struct cpuset *cs; | 1984 | struct cpuset *cs; |
1641 | 1985 | ||
1642 | BUG_ON(!(tsk->flags & PF_EXITING)); | ||
1643 | |||
1644 | cs = tsk->cpuset; | 1986 | cs = tsk->cpuset; |
1645 | tsk->cpuset = NULL; | 1987 | tsk->cpuset = NULL; |
1646 | 1988 | ||
@@ -1667,14 +2009,14 @@ void cpuset_exit(struct task_struct *tsk) | |||
1667 | * task's cpuset. | 2009 | * task's cpuset. |
1668 | **/ | 2010 | **/ |
1669 | 2011 | ||
1670 | cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | 2012 | cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) |
1671 | { | 2013 | { |
1672 | cpumask_t mask; | 2014 | cpumask_t mask; |
1673 | 2015 | ||
1674 | down(&callback_sem); | 2016 | down(&callback_sem); |
1675 | task_lock((struct task_struct *)tsk); | 2017 | task_lock(tsk); |
1676 | guarantee_online_cpus(tsk->cpuset, &mask); | 2018 | guarantee_online_cpus(tsk->cpuset, &mask); |
1677 | task_unlock((struct task_struct *)tsk); | 2019 | task_unlock(tsk); |
1678 | up(&callback_sem); | 2020 | up(&callback_sem); |
1679 | 2021 | ||
1680 | return mask; | 2022 | return mask; |
@@ -1686,43 +2028,26 @@ void cpuset_init_current_mems_allowed(void) | |||
1686 | } | 2028 | } |
1687 | 2029 | ||
1688 | /** | 2030 | /** |
1689 | * cpuset_update_current_mems_allowed - update mems parameters to new values | 2031 | * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset. |
1690 | * | 2032 | * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. |
1691 | * If the current tasks cpusets mems_allowed changed behind our backs, | ||
1692 | * update current->mems_allowed and mems_generation to the new value. | ||
1693 | * Do not call this routine if in_interrupt(). | ||
1694 | * | 2033 | * |
1695 | * Call without callback_sem or task_lock() held. May be called | 2034 | * Description: Returns the nodemask_t mems_allowed of the cpuset |
1696 | * with or without manage_sem held. Unless exiting, it will acquire | 2035 | * attached to the specified @tsk. Guaranteed to return some non-empty |
1697 | * task_lock(). Also might acquire callback_sem during call to | 2036 | * subset of node_online_map, even if this means going outside the |
1698 | * refresh_mems(). | 2037 | * task's cpuset. |
1699 | */ | 2038 | **/ |
1700 | 2039 | ||
1701 | void cpuset_update_current_mems_allowed(void) | 2040 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) |
1702 | { | 2041 | { |
1703 | struct cpuset *cs; | 2042 | nodemask_t mask; |
1704 | int need_to_refresh = 0; | ||
1705 | 2043 | ||
1706 | task_lock(current); | 2044 | down(&callback_sem); |
1707 | cs = current->cpuset; | 2045 | task_lock(tsk); |
1708 | if (!cs) | 2046 | guarantee_online_mems(tsk->cpuset, &mask); |
1709 | goto done; | 2047 | task_unlock(tsk); |
1710 | if (current->cpuset_mems_generation != cs->mems_generation) | 2048 | up(&callback_sem); |
1711 | need_to_refresh = 1; | ||
1712 | done: | ||
1713 | task_unlock(current); | ||
1714 | if (need_to_refresh) | ||
1715 | refresh_mems(); | ||
1716 | } | ||
1717 | 2049 | ||
1718 | /** | 2050 | return mask; |
1719 | * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed | ||
1720 | * @nodes: pointer to a node bitmap that is and-ed with mems_allowed | ||
1721 | */ | ||
1722 | void cpuset_restrict_to_mems_allowed(unsigned long *nodes) | ||
1723 | { | ||
1724 | bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), | ||
1725 | MAX_NUMNODES); | ||
1726 | } | 2051 | } |
1727 | 2052 | ||
1728 | /** | 2053 | /** |
@@ -1795,7 +2120,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
1795 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2120 | * GFP_USER - only nodes in current tasks mems allowed ok. |
1796 | **/ | 2121 | **/ |
1797 | 2122 | ||
1798 | int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | 2123 | int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) |
1799 | { | 2124 | { |
1800 | int node; /* node that zone z is on */ | 2125 | int node; /* node that zone z is on */ |
1801 | const struct cpuset *cs; /* current cpuset ancestors */ | 2126 | const struct cpuset *cs; /* current cpuset ancestors */ |
@@ -1825,6 +2150,33 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
1825 | } | 2150 | } |
1826 | 2151 | ||
1827 | /** | 2152 | /** |
2153 | * cpuset_lock - lock out any changes to cpuset structures | ||
2154 | * | ||
2155 | * The out of memory (oom) code needs to lock down cpusets | ||
2156 | * from being changed while it scans the tasklist looking for a | ||
2157 | * task in an overlapping cpuset. Expose callback_sem via this | ||
2158 | * cpuset_lock() routine, so the oom code can lock it, before | ||
2159 | * locking the task list. The tasklist_lock is a spinlock, so | ||
2160 | * must be taken inside callback_sem. | ||
2161 | */ | ||
2162 | |||
2163 | void cpuset_lock(void) | ||
2164 | { | ||
2165 | down(&callback_sem); | ||
2166 | } | ||
2167 | |||
2168 | /** | ||
2169 | * cpuset_unlock - release lock on cpuset changes | ||
2170 | * | ||
2171 | * Undo the lock taken in a previous cpuset_lock() call. | ||
2172 | */ | ||
2173 | |||
2174 | void cpuset_unlock(void) | ||
2175 | { | ||
2176 | up(&callback_sem); | ||
2177 | } | ||
2178 | |||
2179 | /** | ||
1828 | * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? | 2180 | * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? |
1829 | * @p: pointer to task_struct of some other task. | 2181 | * @p: pointer to task_struct of some other task. |
1830 | * | 2182 | * |
@@ -1833,7 +2185,7 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
1833 | * determine if task @p's memory usage might impact the memory | 2185 | * determine if task @p's memory usage might impact the memory |
1834 | * available to the current task. | 2186 | * available to the current task. |
1835 | * | 2187 | * |
1836 | * Acquires callback_sem - not suitable for calling from a fast path. | 2188 | * Call while holding callback_sem. |
1837 | **/ | 2189 | **/ |
1838 | 2190 | ||
1839 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 2191 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
@@ -1841,8 +2193,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
1841 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ | 2193 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
1842 | int overlap = 0; /* do cpusets overlap? */ | 2194 | int overlap = 0; /* do cpusets overlap? */ |
1843 | 2195 | ||
1844 | down(&callback_sem); | ||
1845 | |||
1846 | task_lock(current); | 2196 | task_lock(current); |
1847 | if (current->flags & PF_EXITING) { | 2197 | if (current->flags & PF_EXITING) { |
1848 | task_unlock(current); | 2198 | task_unlock(current); |
@@ -1861,12 +2211,46 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
1861 | 2211 | ||
1862 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | 2212 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); |
1863 | done: | 2213 | done: |
1864 | up(&callback_sem); | ||
1865 | |||
1866 | return overlap; | 2214 | return overlap; |
1867 | } | 2215 | } |
1868 | 2216 | ||
1869 | /* | 2217 | /* |
2218 | * Collection of memory_pressure is suppressed unless | ||
2219 | * this flag is enabled by writing "1" to the special | ||
2220 | * cpuset file 'memory_pressure_enabled' in the root cpuset. | ||
2221 | */ | ||
2222 | |||
2223 | int cpuset_memory_pressure_enabled __read_mostly; | ||
2224 | |||
2225 | /** | ||
2226 | * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. | ||
2227 | * | ||
2228 | * Keep a running average of the rate of synchronous (direct) | ||
2229 | * page reclaim efforts initiated by tasks in each cpuset. | ||
2230 | * | ||
2231 | * This represents the rate at which some task in the cpuset | ||
2232 | * ran low on memory on all nodes it was allowed to use, and | ||
2233 | * had to enter the kernel's page reclaim code in an effort to | ||
2234 | * create more free memory by tossing clean pages or swapping | ||
2235 | * or writing dirty pages. | ||
2236 | * | ||
2237 | * Display to user space in the per-cpuset read-only file | ||
2238 | * "memory_pressure". Value displayed is an integer | ||
2239 | * representing the recent rate of entry into the synchronous | ||
2240 | * (direct) page reclaim by any task attached to the cpuset. | ||
2241 | **/ | ||
2242 | |||
2243 | void __cpuset_memory_pressure_bump(void) | ||
2244 | { | ||
2245 | struct cpuset *cs; | ||
2246 | |||
2247 | task_lock(current); | ||
2248 | cs = current->cpuset; | ||
2249 | fmeter_markevent(&cs->fmeter); | ||
2250 | task_unlock(current); | ||
2251 | } | ||
2252 | |||
2253 | /* | ||
1870 | * proc_cpuset_show() | 2254 | * proc_cpuset_show() |
1871 | * - Print task's cpuset path into seq_file. | 2255 | * - Print task's cpuset path into seq_file. |
1872 | * - Used for /proc/<pid>/cpuset. | 2256 | * - Used for /proc/<pid>/cpuset. |
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c deleted file mode 100644 index 334c37f5218a..000000000000 --- a/kernel/crash_dump.c +++ /dev/null | |||
@@ -1,61 +0,0 @@ | |||
1 | /* | ||
2 | * kernel/crash_dump.c - Memory preserving reboot related code. | ||
3 | * | ||
4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
5 | * Copyright (C) IBM Corporation, 2004. All rights reserved | ||
6 | */ | ||
7 | |||
8 | #include <linux/smp_lock.h> | ||
9 | #include <linux/errno.h> | ||
10 | #include <linux/proc_fs.h> | ||
11 | #include <linux/bootmem.h> | ||
12 | #include <linux/highmem.h> | ||
13 | #include <linux/crash_dump.h> | ||
14 | |||
15 | #include <asm/io.h> | ||
16 | #include <asm/uaccess.h> | ||
17 | |||
18 | /* Stores the physical address of elf header of crash image. */ | ||
19 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | ||
20 | |||
21 | /** | ||
22 | * copy_oldmem_page - copy one page from "oldmem" | ||
23 | * @pfn: page frame number to be copied | ||
24 | * @buf: target memory address for the copy; this can be in kernel address | ||
25 | * space or user address space (see @userbuf) | ||
26 | * @csize: number of bytes to copy | ||
27 | * @offset: offset in bytes into the page (based on pfn) to begin the copy | ||
28 | * @userbuf: if set, @buf is in user address space, use copy_to_user(), | ||
29 | * otherwise @buf is in kernel address space, use memcpy(). | ||
30 | * | ||
31 | * Copy a page from "oldmem". For this page, there is no pte mapped | ||
32 | * in the current kernel. We stitch up a pte, similar to kmap_atomic. | ||
33 | */ | ||
34 | ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | ||
35 | size_t csize, unsigned long offset, int userbuf) | ||
36 | { | ||
37 | void *page, *vaddr; | ||
38 | |||
39 | if (!csize) | ||
40 | return 0; | ||
41 | |||
42 | page = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
43 | if (!page) | ||
44 | return -ENOMEM; | ||
45 | |||
46 | vaddr = kmap_atomic_pfn(pfn, KM_PTE0); | ||
47 | copy_page(page, vaddr); | ||
48 | kunmap_atomic(vaddr, KM_PTE0); | ||
49 | |||
50 | if (userbuf) { | ||
51 | if (copy_to_user(buf, (page + offset), csize)) { | ||
52 | kfree(page); | ||
53 | return -EFAULT; | ||
54 | } | ||
55 | } else { | ||
56 | memcpy(buf, (page + offset), csize); | ||
57 | } | ||
58 | |||
59 | kfree(page); | ||
60 | return csize; | ||
61 | } | ||
diff --git a/kernel/exit.c b/kernel/exit.c index ee515683b92d..93cee3671332 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/interrupt.h> | 10 | #include <linux/interrupt.h> |
11 | #include <linux/smp_lock.h> | 11 | #include <linux/smp_lock.h> |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/capability.h> | ||
13 | #include <linux/completion.h> | 14 | #include <linux/completion.h> |
14 | #include <linux/personality.h> | 15 | #include <linux/personality.h> |
15 | #include <linux/tty.h> | 16 | #include <linux/tty.h> |
@@ -29,6 +30,7 @@ | |||
29 | #include <linux/syscalls.h> | 30 | #include <linux/syscalls.h> |
30 | #include <linux/signal.h> | 31 | #include <linux/signal.h> |
31 | #include <linux/cn_proc.h> | 32 | #include <linux/cn_proc.h> |
33 | #include <linux/mutex.h> | ||
32 | 34 | ||
33 | #include <asm/uaccess.h> | 35 | #include <asm/uaccess.h> |
34 | #include <asm/unistd.h> | 36 | #include <asm/unistd.h> |
@@ -72,7 +74,6 @@ repeat: | |||
72 | __ptrace_unlink(p); | 74 | __ptrace_unlink(p); |
73 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); | 75 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); |
74 | __exit_signal(p); | 76 | __exit_signal(p); |
75 | __exit_sighand(p); | ||
76 | /* | 77 | /* |
77 | * Note that the fastpath in sys_times depends on __exit_signal having | 78 | * Note that the fastpath in sys_times depends on __exit_signal having |
78 | * updated the counters before a task is removed from the tasklist of | 79 | * updated the counters before a task is removed from the tasklist of |
@@ -192,7 +193,7 @@ int is_orphaned_pgrp(int pgrp) | |||
192 | return retval; | 193 | return retval; |
193 | } | 194 | } |
194 | 195 | ||
195 | static inline int has_stopped_jobs(int pgrp) | 196 | static int has_stopped_jobs(int pgrp) |
196 | { | 197 | { |
197 | int retval = 0; | 198 | int retval = 0; |
198 | struct task_struct *p; | 199 | struct task_struct *p; |
@@ -229,7 +230,7 @@ static inline int has_stopped_jobs(int pgrp) | |||
229 | * | 230 | * |
230 | * NOTE that reparent_to_init() gives the caller full capabilities. | 231 | * NOTE that reparent_to_init() gives the caller full capabilities. |
231 | */ | 232 | */ |
232 | static inline void reparent_to_init(void) | 233 | static void reparent_to_init(void) |
233 | { | 234 | { |
234 | write_lock_irq(&tasklist_lock); | 235 | write_lock_irq(&tasklist_lock); |
235 | 236 | ||
@@ -243,7 +244,9 @@ static inline void reparent_to_init(void) | |||
243 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | 244 | /* Set the exit signal to SIGCHLD so we signal init on exit */ |
244 | current->exit_signal = SIGCHLD; | 245 | current->exit_signal = SIGCHLD; |
245 | 246 | ||
246 | if ((current->policy == SCHED_NORMAL) && (task_nice(current) < 0)) | 247 | if ((current->policy == SCHED_NORMAL || |
248 | current->policy == SCHED_BATCH) | ||
249 | && (task_nice(current) < 0)) | ||
247 | set_user_nice(current, 0); | 250 | set_user_nice(current, 0); |
248 | /* cpus_allowed? */ | 251 | /* cpus_allowed? */ |
249 | /* rt_priority? */ | 252 | /* rt_priority? */ |
@@ -258,7 +261,7 @@ static inline void reparent_to_init(void) | |||
258 | 261 | ||
259 | void __set_special_pids(pid_t session, pid_t pgrp) | 262 | void __set_special_pids(pid_t session, pid_t pgrp) |
260 | { | 263 | { |
261 | struct task_struct *curr = current; | 264 | struct task_struct *curr = current->group_leader; |
262 | 265 | ||
263 | if (curr->signal->session != session) { | 266 | if (curr->signal->session != session) { |
264 | detach_pid(curr, PIDTYPE_SID); | 267 | detach_pid(curr, PIDTYPE_SID); |
@@ -366,7 +369,7 @@ void daemonize(const char *name, ...) | |||
366 | 369 | ||
367 | EXPORT_SYMBOL(daemonize); | 370 | EXPORT_SYMBOL(daemonize); |
368 | 371 | ||
369 | static inline void close_files(struct files_struct * files) | 372 | static void close_files(struct files_struct * files) |
370 | { | 373 | { |
371 | int i, j; | 374 | int i, j; |
372 | struct fdtable *fdt; | 375 | struct fdtable *fdt; |
@@ -540,7 +543,7 @@ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_re | |||
540 | p->real_parent = reaper; | 543 | p->real_parent = reaper; |
541 | } | 544 | } |
542 | 545 | ||
543 | static inline void reparent_thread(task_t *p, task_t *father, int traced) | 546 | static void reparent_thread(task_t *p, task_t *father, int traced) |
544 | { | 547 | { |
545 | /* We don't want people slaying init. */ | 548 | /* We don't want people slaying init. */ |
546 | if (p->exit_signal != -1) | 549 | if (p->exit_signal != -1) |
@@ -604,7 +607,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) | |||
604 | * group, and if no such member exists, give it to | 607 | * group, and if no such member exists, give it to |
605 | * the global child reaper process (ie "init") | 608 | * the global child reaper process (ie "init") |
606 | */ | 609 | */ |
607 | static inline void forget_original_parent(struct task_struct * father, | 610 | static void forget_original_parent(struct task_struct * father, |
608 | struct list_head *to_release) | 611 | struct list_head *to_release) |
609 | { | 612 | { |
610 | struct task_struct *p, *reaper = father; | 613 | struct task_struct *p, *reaper = father; |
@@ -842,7 +845,7 @@ fastcall NORET_TYPE void do_exit(long code) | |||
842 | } | 845 | } |
843 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 846 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
844 | if (group_dead) { | 847 | if (group_dead) { |
845 | del_timer_sync(&tsk->signal->real_timer); | 848 | hrtimer_cancel(&tsk->signal->real_timer); |
846 | exit_itimers(tsk->signal); | 849 | exit_itimers(tsk->signal); |
847 | acct_process(code); | 850 | acct_process(code); |
848 | } | 851 | } |
@@ -870,6 +873,10 @@ fastcall NORET_TYPE void do_exit(long code) | |||
870 | mpol_free(tsk->mempolicy); | 873 | mpol_free(tsk->mempolicy); |
871 | tsk->mempolicy = NULL; | 874 | tsk->mempolicy = NULL; |
872 | #endif | 875 | #endif |
876 | /* | ||
877 | * If DEBUG_MUTEXES is on, make sure we are holding no locks: | ||
878 | */ | ||
879 | mutex_debug_check_no_locks_held(tsk); | ||
873 | 880 | ||
874 | /* PF_DEAD causes final put_task_struct after we schedule. */ | 881 | /* PF_DEAD causes final put_task_struct after we schedule. */ |
875 | preempt_disable(); | 882 | preempt_disable(); |
@@ -926,7 +933,6 @@ do_group_exit(int exit_code) | |||
926 | /* Another thread got here before we took the lock. */ | 933 | /* Another thread got here before we took the lock. */ |
927 | exit_code = sig->group_exit_code; | 934 | exit_code = sig->group_exit_code; |
928 | else { | 935 | else { |
929 | sig->flags = SIGNAL_GROUP_EXIT; | ||
930 | sig->group_exit_code = exit_code; | 936 | sig->group_exit_code = exit_code; |
931 | zap_other_threads(current); | 937 | zap_other_threads(current); |
932 | } | 938 | } |
@@ -1068,6 +1074,9 @@ static int wait_task_zombie(task_t *p, int noreap, | |||
1068 | } | 1074 | } |
1069 | 1075 | ||
1070 | if (likely(p->real_parent == p->parent) && likely(p->signal)) { | 1076 | if (likely(p->real_parent == p->parent) && likely(p->signal)) { |
1077 | struct signal_struct *psig; | ||
1078 | struct signal_struct *sig; | ||
1079 | |||
1071 | /* | 1080 | /* |
1072 | * The resource counters for the group leader are in its | 1081 | * The resource counters for the group leader are in its |
1073 | * own task_struct. Those for dead threads in the group | 1082 | * own task_struct. Those for dead threads in the group |
@@ -1084,24 +1093,26 @@ static int wait_task_zombie(task_t *p, int noreap, | |||
1084 | * here reaping other children at the same time. | 1093 | * here reaping other children at the same time. |
1085 | */ | 1094 | */ |
1086 | spin_lock_irq(&p->parent->sighand->siglock); | 1095 | spin_lock_irq(&p->parent->sighand->siglock); |
1087 | p->parent->signal->cutime = | 1096 | psig = p->parent->signal; |
1088 | cputime_add(p->parent->signal->cutime, | 1097 | sig = p->signal; |
1098 | psig->cutime = | ||
1099 | cputime_add(psig->cutime, | ||
1089 | cputime_add(p->utime, | 1100 | cputime_add(p->utime, |
1090 | cputime_add(p->signal->utime, | 1101 | cputime_add(sig->utime, |
1091 | p->signal->cutime))); | 1102 | sig->cutime))); |
1092 | p->parent->signal->cstime = | 1103 | psig->cstime = |
1093 | cputime_add(p->parent->signal->cstime, | 1104 | cputime_add(psig->cstime, |
1094 | cputime_add(p->stime, | 1105 | cputime_add(p->stime, |
1095 | cputime_add(p->signal->stime, | 1106 | cputime_add(sig->stime, |
1096 | p->signal->cstime))); | 1107 | sig->cstime))); |
1097 | p->parent->signal->cmin_flt += | 1108 | psig->cmin_flt += |
1098 | p->min_flt + p->signal->min_flt + p->signal->cmin_flt; | 1109 | p->min_flt + sig->min_flt + sig->cmin_flt; |
1099 | p->parent->signal->cmaj_flt += | 1110 | psig->cmaj_flt += |
1100 | p->maj_flt + p->signal->maj_flt + p->signal->cmaj_flt; | 1111 | p->maj_flt + sig->maj_flt + sig->cmaj_flt; |
1101 | p->parent->signal->cnvcsw += | 1112 | psig->cnvcsw += |
1102 | p->nvcsw + p->signal->nvcsw + p->signal->cnvcsw; | 1113 | p->nvcsw + sig->nvcsw + sig->cnvcsw; |
1103 | p->parent->signal->cnivcsw += | 1114 | psig->cnivcsw += |
1104 | p->nivcsw + p->signal->nivcsw + p->signal->cnivcsw; | 1115 | p->nivcsw + sig->nivcsw + sig->cnivcsw; |
1105 | spin_unlock_irq(&p->parent->sighand->siglock); | 1116 | spin_unlock_irq(&p->parent->sighand->siglock); |
1106 | } | 1117 | } |
1107 | 1118 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index fb8572a42297..4ae8cfc1c89c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/binfmts.h> | 28 | #include <linux/binfmts.h> |
29 | #include <linux/mman.h> | 29 | #include <linux/mman.h> |
30 | #include <linux/fs.h> | 30 | #include <linux/fs.h> |
31 | #include <linux/capability.h> | ||
31 | #include <linux/cpu.h> | 32 | #include <linux/cpu.h> |
32 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
33 | #include <linux/security.h> | 34 | #include <linux/security.h> |
@@ -743,6 +744,14 @@ int unshare_files(void) | |||
743 | 744 | ||
744 | EXPORT_SYMBOL(unshare_files); | 745 | EXPORT_SYMBOL(unshare_files); |
745 | 746 | ||
747 | void sighand_free_cb(struct rcu_head *rhp) | ||
748 | { | ||
749 | struct sighand_struct *sp; | ||
750 | |||
751 | sp = container_of(rhp, struct sighand_struct, rcu); | ||
752 | kmem_cache_free(sighand_cachep, sp); | ||
753 | } | ||
754 | |||
746 | static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) | 755 | static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) |
747 | { | 756 | { |
748 | struct sighand_struct *sig; | 757 | struct sighand_struct *sig; |
@@ -752,7 +761,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t | |||
752 | return 0; | 761 | return 0; |
753 | } | 762 | } |
754 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); | 763 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); |
755 | tsk->sighand = sig; | 764 | rcu_assign_pointer(tsk->sighand, sig); |
756 | if (!sig) | 765 | if (!sig) |
757 | return -ENOMEM; | 766 | return -ENOMEM; |
758 | spin_lock_init(&sig->siglock); | 767 | spin_lock_init(&sig->siglock); |
@@ -793,19 +802,16 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
793 | init_sigpending(&sig->shared_pending); | 802 | init_sigpending(&sig->shared_pending); |
794 | INIT_LIST_HEAD(&sig->posix_timers); | 803 | INIT_LIST_HEAD(&sig->posix_timers); |
795 | 804 | ||
796 | sig->it_real_value = sig->it_real_incr = 0; | 805 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC); |
806 | sig->it_real_incr.tv64 = 0; | ||
797 | sig->real_timer.function = it_real_fn; | 807 | sig->real_timer.function = it_real_fn; |
798 | sig->real_timer.data = (unsigned long) tsk; | 808 | sig->real_timer.data = tsk; |
799 | init_timer(&sig->real_timer); | ||
800 | 809 | ||
801 | sig->it_virt_expires = cputime_zero; | 810 | sig->it_virt_expires = cputime_zero; |
802 | sig->it_virt_incr = cputime_zero; | 811 | sig->it_virt_incr = cputime_zero; |
803 | sig->it_prof_expires = cputime_zero; | 812 | sig->it_prof_expires = cputime_zero; |
804 | sig->it_prof_incr = cputime_zero; | 813 | sig->it_prof_incr = cputime_zero; |
805 | 814 | ||
806 | sig->tty = current->signal->tty; | ||
807 | sig->pgrp = process_group(current); | ||
808 | sig->session = current->signal->session; | ||
809 | sig->leader = 0; /* session leadership doesn't inherit */ | 815 | sig->leader = 0; /* session leadership doesn't inherit */ |
810 | sig->tty_old_pgrp = 0; | 816 | sig->tty_old_pgrp = 0; |
811 | 817 | ||
@@ -964,15 +970,20 @@ static task_t *copy_process(unsigned long clone_flags, | |||
964 | p->io_context = NULL; | 970 | p->io_context = NULL; |
965 | p->io_wait = NULL; | 971 | p->io_wait = NULL; |
966 | p->audit_context = NULL; | 972 | p->audit_context = NULL; |
973 | cpuset_fork(p); | ||
967 | #ifdef CONFIG_NUMA | 974 | #ifdef CONFIG_NUMA |
968 | p->mempolicy = mpol_copy(p->mempolicy); | 975 | p->mempolicy = mpol_copy(p->mempolicy); |
969 | if (IS_ERR(p->mempolicy)) { | 976 | if (IS_ERR(p->mempolicy)) { |
970 | retval = PTR_ERR(p->mempolicy); | 977 | retval = PTR_ERR(p->mempolicy); |
971 | p->mempolicy = NULL; | 978 | p->mempolicy = NULL; |
972 | goto bad_fork_cleanup; | 979 | goto bad_fork_cleanup_cpuset; |
973 | } | 980 | } |
974 | #endif | 981 | #endif |
975 | 982 | ||
983 | #ifdef CONFIG_DEBUG_MUTEXES | ||
984 | p->blocked_on = NULL; /* not blocked yet */ | ||
985 | #endif | ||
986 | |||
976 | p->tgid = p->pid; | 987 | p->tgid = p->pid; |
977 | if (clone_flags & CLONE_THREAD) | 988 | if (clone_flags & CLONE_THREAD) |
978 | p->tgid = current->tgid; | 989 | p->tgid = current->tgid; |
@@ -1127,25 +1138,19 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1127 | attach_pid(p, PIDTYPE_PID, p->pid); | 1138 | attach_pid(p, PIDTYPE_PID, p->pid); |
1128 | attach_pid(p, PIDTYPE_TGID, p->tgid); | 1139 | attach_pid(p, PIDTYPE_TGID, p->tgid); |
1129 | if (thread_group_leader(p)) { | 1140 | if (thread_group_leader(p)) { |
1141 | p->signal->tty = current->signal->tty; | ||
1142 | p->signal->pgrp = process_group(current); | ||
1143 | p->signal->session = current->signal->session; | ||
1130 | attach_pid(p, PIDTYPE_PGID, process_group(p)); | 1144 | attach_pid(p, PIDTYPE_PGID, process_group(p)); |
1131 | attach_pid(p, PIDTYPE_SID, p->signal->session); | 1145 | attach_pid(p, PIDTYPE_SID, p->signal->session); |
1132 | if (p->pid) | 1146 | if (p->pid) |
1133 | __get_cpu_var(process_counts)++; | 1147 | __get_cpu_var(process_counts)++; |
1134 | } | 1148 | } |
1135 | 1149 | ||
1136 | if (!current->signal->tty && p->signal->tty) | ||
1137 | p->signal->tty = NULL; | ||
1138 | |||
1139 | nr_threads++; | 1150 | nr_threads++; |
1140 | total_forks++; | 1151 | total_forks++; |
1141 | write_unlock_irq(&tasklist_lock); | 1152 | write_unlock_irq(&tasklist_lock); |
1142 | proc_fork_connector(p); | 1153 | proc_fork_connector(p); |
1143 | cpuset_fork(p); | ||
1144 | retval = 0; | ||
1145 | |||
1146 | fork_out: | ||
1147 | if (retval) | ||
1148 | return ERR_PTR(retval); | ||
1149 | return p; | 1154 | return p; |
1150 | 1155 | ||
1151 | bad_fork_cleanup_namespace: | 1156 | bad_fork_cleanup_namespace: |
@@ -1172,7 +1177,9 @@ bad_fork_cleanup_security: | |||
1172 | bad_fork_cleanup_policy: | 1177 | bad_fork_cleanup_policy: |
1173 | #ifdef CONFIG_NUMA | 1178 | #ifdef CONFIG_NUMA |
1174 | mpol_free(p->mempolicy); | 1179 | mpol_free(p->mempolicy); |
1180 | bad_fork_cleanup_cpuset: | ||
1175 | #endif | 1181 | #endif |
1182 | cpuset_exit(p); | ||
1176 | bad_fork_cleanup: | 1183 | bad_fork_cleanup: |
1177 | if (p->binfmt) | 1184 | if (p->binfmt) |
1178 | module_put(p->binfmt->module); | 1185 | module_put(p->binfmt->module); |
@@ -1184,7 +1191,8 @@ bad_fork_cleanup_count: | |||
1184 | free_uid(p->user); | 1191 | free_uid(p->user); |
1185 | bad_fork_free: | 1192 | bad_fork_free: |
1186 | free_task(p); | 1193 | free_task(p); |
1187 | goto fork_out; | 1194 | fork_out: |
1195 | return ERR_PTR(retval); | ||
1188 | } | 1196 | } |
1189 | 1197 | ||
1190 | struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | 1198 | struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) |
@@ -1290,6 +1298,10 @@ long do_fork(unsigned long clone_flags, | |||
1290 | return pid; | 1298 | return pid; |
1291 | } | 1299 | } |
1292 | 1300 | ||
1301 | #ifndef ARCH_MIN_MMSTRUCT_ALIGN | ||
1302 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 | ||
1303 | #endif | ||
1304 | |||
1293 | void __init proc_caches_init(void) | 1305 | void __init proc_caches_init(void) |
1294 | { | 1306 | { |
1295 | sighand_cachep = kmem_cache_create("sighand_cache", | 1307 | sighand_cachep = kmem_cache_create("sighand_cache", |
@@ -1308,6 +1320,6 @@ void __init proc_caches_init(void) | |||
1308 | sizeof(struct vm_area_struct), 0, | 1320 | sizeof(struct vm_area_struct), 0, |
1309 | SLAB_PANIC, NULL, NULL); | 1321 | SLAB_PANIC, NULL, NULL); |
1310 | mm_cachep = kmem_cache_create("mm_struct", | 1322 | mm_cachep = kmem_cache_create("mm_struct", |
1311 | sizeof(struct mm_struct), 0, | 1323 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, |
1312 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); | 1324 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); |
1313 | } | 1325 | } |
diff --git a/kernel/futex.c b/kernel/futex.c index 5e71a6bf6f6b..5efa2f978032 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -356,6 +356,13 @@ retry: | |||
356 | if (bh1 != bh2) | 356 | if (bh1 != bh2) |
357 | spin_unlock(&bh2->lock); | 357 | spin_unlock(&bh2->lock); |
358 | 358 | ||
359 | #ifndef CONFIG_MMU | ||
360 | /* we don't get EFAULT from MMU faults if we don't have an MMU, | ||
361 | * but we might get them from range checking */ | ||
362 | ret = op_ret; | ||
363 | goto out; | ||
364 | #endif | ||
365 | |||
359 | if (unlikely(op_ret != -EFAULT)) { | 366 | if (unlikely(op_ret != -EFAULT)) { |
360 | ret = op_ret; | 367 | ret = op_ret; |
361 | goto out; | 368 | goto out; |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c new file mode 100644 index 000000000000..f1c4155b49ac --- /dev/null +++ b/kernel/hrtimer.c | |||
@@ -0,0 +1,826 @@ | |||
1 | /* | ||
2 | * linux/kernel/hrtimer.c | ||
3 | * | ||
4 | * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar | ||
6 | * | ||
7 | * High-resolution kernel timers | ||
8 | * | ||
9 | * In contrast to the low-resolution timeout API implemented in | ||
10 | * kernel/timer.c, hrtimers provide finer resolution and accuracy | ||
11 | * depending on system configuration and capabilities. | ||
12 | * | ||
13 | * These timers are currently used for: | ||
14 | * - itimers | ||
15 | * - POSIX timers | ||
16 | * - nanosleep | ||
17 | * - precise in-kernel timing | ||
18 | * | ||
19 | * Started by: Thomas Gleixner and Ingo Molnar | ||
20 | * | ||
21 | * Credits: | ||
22 | * based on kernel/timer.c | ||
23 | * | ||
24 | * For licencing details see kernel-base/COPYING | ||
25 | */ | ||
26 | |||
27 | #include <linux/cpu.h> | ||
28 | #include <linux/module.h> | ||
29 | #include <linux/percpu.h> | ||
30 | #include <linux/hrtimer.h> | ||
31 | #include <linux/notifier.h> | ||
32 | #include <linux/syscalls.h> | ||
33 | #include <linux/interrupt.h> | ||
34 | |||
35 | #include <asm/uaccess.h> | ||
36 | |||
37 | /** | ||
38 | * ktime_get - get the monotonic time in ktime_t format | ||
39 | * | ||
40 | * returns the time in ktime_t format | ||
41 | */ | ||
42 | static ktime_t ktime_get(void) | ||
43 | { | ||
44 | struct timespec now; | ||
45 | |||
46 | ktime_get_ts(&now); | ||
47 | |||
48 | return timespec_to_ktime(now); | ||
49 | } | ||
50 | |||
51 | /** | ||
52 | * ktime_get_real - get the real (wall-) time in ktime_t format | ||
53 | * | ||
54 | * returns the time in ktime_t format | ||
55 | */ | ||
56 | static ktime_t ktime_get_real(void) | ||
57 | { | ||
58 | struct timespec now; | ||
59 | |||
60 | getnstimeofday(&now); | ||
61 | |||
62 | return timespec_to_ktime(now); | ||
63 | } | ||
64 | |||
65 | EXPORT_SYMBOL_GPL(ktime_get_real); | ||
66 | |||
67 | /* | ||
68 | * The timer bases: | ||
69 | */ | ||
70 | |||
71 | #define MAX_HRTIMER_BASES 2 | ||
72 | |||
73 | static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = | ||
74 | { | ||
75 | { | ||
76 | .index = CLOCK_REALTIME, | ||
77 | .get_time = &ktime_get_real, | ||
78 | .resolution = KTIME_REALTIME_RES, | ||
79 | }, | ||
80 | { | ||
81 | .index = CLOCK_MONOTONIC, | ||
82 | .get_time = &ktime_get, | ||
83 | .resolution = KTIME_MONOTONIC_RES, | ||
84 | }, | ||
85 | }; | ||
86 | |||
87 | /** | ||
88 | * ktime_get_ts - get the monotonic clock in timespec format | ||
89 | * | ||
90 | * @ts: pointer to timespec variable | ||
91 | * | ||
92 | * The function calculates the monotonic clock from the realtime | ||
93 | * clock and the wall_to_monotonic offset and stores the result | ||
94 | * in normalized timespec format in the variable pointed to by ts. | ||
95 | */ | ||
96 | void ktime_get_ts(struct timespec *ts) | ||
97 | { | ||
98 | struct timespec tomono; | ||
99 | unsigned long seq; | ||
100 | |||
101 | do { | ||
102 | seq = read_seqbegin(&xtime_lock); | ||
103 | getnstimeofday(ts); | ||
104 | tomono = wall_to_monotonic; | ||
105 | |||
106 | } while (read_seqretry(&xtime_lock, seq)); | ||
107 | |||
108 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, | ||
109 | ts->tv_nsec + tomono.tv_nsec); | ||
110 | } | ||
111 | EXPORT_SYMBOL_GPL(ktime_get_ts); | ||
112 | |||
113 | /* | ||
114 | * Functions and macros which are different for UP/SMP systems are kept in a | ||
115 | * single place | ||
116 | */ | ||
117 | #ifdef CONFIG_SMP | ||
118 | |||
119 | #define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0) | ||
120 | |||
121 | /* | ||
122 | * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock | ||
123 | * means that all timers which are tied to this base via timer->base are | ||
124 | * locked, and the base itself is locked too. | ||
125 | * | ||
126 | * So __run_timers/migrate_timers can safely modify all timers which could | ||
127 | * be found on the lists/queues. | ||
128 | * | ||
129 | * When the timer's base is locked, and the timer removed from list, it is | ||
130 | * possible to set timer->base = NULL and drop the lock: the timer remains | ||
131 | * locked. | ||
132 | */ | ||
133 | static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer, | ||
134 | unsigned long *flags) | ||
135 | { | ||
136 | struct hrtimer_base *base; | ||
137 | |||
138 | for (;;) { | ||
139 | base = timer->base; | ||
140 | if (likely(base != NULL)) { | ||
141 | spin_lock_irqsave(&base->lock, *flags); | ||
142 | if (likely(base == timer->base)) | ||
143 | return base; | ||
144 | /* The timer has migrated to another CPU: */ | ||
145 | spin_unlock_irqrestore(&base->lock, *flags); | ||
146 | } | ||
147 | cpu_relax(); | ||
148 | } | ||
149 | } | ||
150 | |||
151 | /* | ||
152 | * Switch the timer base to the current CPU when possible. | ||
153 | */ | ||
154 | static inline struct hrtimer_base * | ||
155 | switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base) | ||
156 | { | ||
157 | struct hrtimer_base *new_base; | ||
158 | |||
159 | new_base = &__get_cpu_var(hrtimer_bases[base->index]); | ||
160 | |||
161 | if (base != new_base) { | ||
162 | /* | ||
163 | * We are trying to schedule the timer on the local CPU. | ||
164 | * However we can't change timer's base while it is running, | ||
165 | * so we keep it on the same CPU. No hassle vs. reprogramming | ||
166 | * the event source in the high resolution case. The softirq | ||
167 | * code will take care of this when the timer function has | ||
168 | * completed. There is no conflict as we hold the lock until | ||
169 | * the timer is enqueued. | ||
170 | */ | ||
171 | if (unlikely(base->curr_timer == timer)) | ||
172 | return base; | ||
173 | |||
174 | /* See the comment in lock_hrtimer_base() */ | ||
175 | timer->base = NULL; | ||
176 | spin_unlock(&base->lock); | ||
177 | spin_lock(&new_base->lock); | ||
178 | timer->base = new_base; | ||
179 | } | ||
180 | return new_base; | ||
181 | } | ||
182 | |||
183 | #else /* CONFIG_SMP */ | ||
184 | |||
185 | #define set_curr_timer(b, t) do { } while (0) | ||
186 | |||
187 | static inline struct hrtimer_base * | ||
188 | lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | ||
189 | { | ||
190 | struct hrtimer_base *base = timer->base; | ||
191 | |||
192 | spin_lock_irqsave(&base->lock, *flags); | ||
193 | |||
194 | return base; | ||
195 | } | ||
196 | |||
197 | #define switch_hrtimer_base(t, b) (b) | ||
198 | |||
199 | #endif /* !CONFIG_SMP */ | ||
200 | |||
201 | /* | ||
202 | * Functions for the union type storage format of ktime_t which are | ||
203 | * too large for inlining: | ||
204 | */ | ||
205 | #if BITS_PER_LONG < 64 | ||
206 | # ifndef CONFIG_KTIME_SCALAR | ||
207 | /** | ||
208 | * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable | ||
209 | * | ||
210 | * @kt: addend | ||
211 | * @nsec: the scalar nsec value to add | ||
212 | * | ||
213 | * Returns the sum of kt and nsec in ktime_t format | ||
214 | */ | ||
215 | ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) | ||
216 | { | ||
217 | ktime_t tmp; | ||
218 | |||
219 | if (likely(nsec < NSEC_PER_SEC)) { | ||
220 | tmp.tv64 = nsec; | ||
221 | } else { | ||
222 | unsigned long rem = do_div(nsec, NSEC_PER_SEC); | ||
223 | |||
224 | tmp = ktime_set((long)nsec, rem); | ||
225 | } | ||
226 | |||
227 | return ktime_add(kt, tmp); | ||
228 | } | ||
229 | |||
230 | #else /* CONFIG_KTIME_SCALAR */ | ||
231 | |||
232 | # endif /* !CONFIG_KTIME_SCALAR */ | ||
233 | |||
234 | /* | ||
235 | * Divide a ktime value by a nanosecond value | ||
236 | */ | ||
237 | static unsigned long ktime_divns(const ktime_t kt, nsec_t div) | ||
238 | { | ||
239 | u64 dclc, inc, dns; | ||
240 | int sft = 0; | ||
241 | |||
242 | dclc = dns = ktime_to_ns(kt); | ||
243 | inc = div; | ||
244 | /* Make sure the divisor is less than 2^32: */ | ||
245 | while (div >> 32) { | ||
246 | sft++; | ||
247 | div >>= 1; | ||
248 | } | ||
249 | dclc >>= sft; | ||
250 | do_div(dclc, (unsigned long) div); | ||
251 | |||
252 | return (unsigned long) dclc; | ||
253 | } | ||
254 | |||
255 | #else /* BITS_PER_LONG < 64 */ | ||
256 | # define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div)) | ||
257 | #endif /* BITS_PER_LONG >= 64 */ | ||
258 | |||
259 | /* | ||
260 | * Counterpart to lock_hrtimer_base above: | ||
261 | */ | ||
262 | static inline | ||
263 | void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) | ||
264 | { | ||
265 | spin_unlock_irqrestore(&timer->base->lock, *flags); | ||
266 | } | ||
267 | |||
268 | /** | ||
269 | * hrtimer_forward - forward the timer expiry | ||
270 | * | ||
271 | * @timer: hrtimer to forward | ||
272 | * @interval: the interval to forward | ||
273 | * | ||
274 | * Forward the timer expiry so it will expire in the future. | ||
275 | * Returns the number of overruns. | ||
276 | */ | ||
277 | unsigned long | ||
278 | hrtimer_forward(struct hrtimer *timer, ktime_t interval) | ||
279 | { | ||
280 | unsigned long orun = 1; | ||
281 | ktime_t delta, now; | ||
282 | |||
283 | now = timer->base->get_time(); | ||
284 | |||
285 | delta = ktime_sub(now, timer->expires); | ||
286 | |||
287 | if (delta.tv64 < 0) | ||
288 | return 0; | ||
289 | |||
290 | if (interval.tv64 < timer->base->resolution.tv64) | ||
291 | interval.tv64 = timer->base->resolution.tv64; | ||
292 | |||
293 | if (unlikely(delta.tv64 >= interval.tv64)) { | ||
294 | nsec_t incr = ktime_to_ns(interval); | ||
295 | |||
296 | orun = ktime_divns(delta, incr); | ||
297 | timer->expires = ktime_add_ns(timer->expires, incr * orun); | ||
298 | if (timer->expires.tv64 > now.tv64) | ||
299 | return orun; | ||
300 | /* | ||
301 | * The division above truncates, so count one more overrun here | ||
302 | * and let the ktime_add() below move the expiry past 'now': | ||
303 | */ | ||
304 | orun++; | ||
305 | } | ||
306 | timer->expires = ktime_add(timer->expires, interval); | ||
307 | |||
308 | return orun; | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * enqueue_hrtimer - internal function to (re)start a timer | ||
313 | * | ||
314 | * The timer is inserted in expiry order. Insertion into the | ||
315 | * red black tree is O(log(n)). Must hold the base lock. | ||
316 | */ | ||
317 | static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | ||
318 | { | ||
319 | struct rb_node **link = &base->active.rb_node; | ||
320 | struct rb_node *parent = NULL; | ||
321 | struct hrtimer *entry; | ||
322 | |||
323 | /* | ||
324 | * Find the right place in the rbtree: | ||
325 | */ | ||
326 | while (*link) { | ||
327 | parent = *link; | ||
328 | entry = rb_entry(parent, struct hrtimer, node); | ||
329 | /* | ||
330 | * We don't care about collisions. Nodes with | ||
331 | * the same expiry time stay together. | ||
332 | */ | ||
333 | if (timer->expires.tv64 < entry->expires.tv64) | ||
334 | link = &(*link)->rb_left; | ||
335 | else | ||
336 | link = &(*link)->rb_right; | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * Insert the timer to the rbtree and check whether it | ||
341 | * replaces the first pending timer | ||
342 | */ | ||
343 | rb_link_node(&timer->node, parent, link); | ||
344 | rb_insert_color(&timer->node, &base->active); | ||
345 | |||
346 | timer->state = HRTIMER_PENDING; | ||
347 | |||
348 | if (!base->first || timer->expires.tv64 < | ||
349 | rb_entry(base->first, struct hrtimer, node)->expires.tv64) | ||
350 | base->first = &timer->node; | ||
351 | } | ||
352 | |||
353 | /* | ||
354 | * __remove_hrtimer - internal function to remove a timer | ||
355 | * | ||
356 | * Caller must hold the base lock. | ||
357 | */ | ||
358 | static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | ||
359 | { | ||
360 | /* | ||
361 | * Remove the timer from the rbtree and replace the | ||
362 | * first entry pointer if necessary. | ||
363 | */ | ||
364 | if (base->first == &timer->node) | ||
365 | base->first = rb_next(&timer->node); | ||
366 | rb_erase(&timer->node, &base->active); | ||
367 | } | ||
368 | |||
369 | /* | ||
370 | * remove hrtimer, called with base lock held | ||
371 | */ | ||
372 | static inline int | ||
373 | remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) | ||
374 | { | ||
375 | if (hrtimer_active(timer)) { | ||
376 | __remove_hrtimer(timer, base); | ||
377 | timer->state = HRTIMER_INACTIVE; | ||
378 | return 1; | ||
379 | } | ||
380 | return 0; | ||
381 | } | ||
382 | |||
383 | /** | ||
384 | * hrtimer_start - (re)start an absolute or relative timer on the current CPU | ||
385 | * | ||
386 | * @timer: the timer to be added | ||
387 | * @tim: expiry time | ||
388 | * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) | ||
389 | * | ||
390 | * Returns: | ||
391 | * 0 on success | ||
392 | * 1 when the timer was active | ||
393 | */ | ||
394 | int | ||
395 | hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | ||
396 | { | ||
397 | struct hrtimer_base *base, *new_base; | ||
398 | unsigned long flags; | ||
399 | int ret; | ||
400 | |||
401 | base = lock_hrtimer_base(timer, &flags); | ||
402 | |||
403 | /* Remove an active timer from the queue: */ | ||
404 | ret = remove_hrtimer(timer, base); | ||
405 | |||
406 | /* Switch the timer base, if necessary: */ | ||
407 | new_base = switch_hrtimer_base(timer, base); | ||
408 | |||
409 | if (mode == HRTIMER_REL) | ||
410 | tim = ktime_add(tim, new_base->get_time()); | ||
411 | timer->expires = tim; | ||
412 | |||
413 | enqueue_hrtimer(timer, new_base); | ||
414 | |||
415 | unlock_hrtimer_base(timer, &flags); | ||
416 | |||
417 | return ret; | ||
418 | } | ||
419 | |||
420 | /** | ||
421 | * hrtimer_try_to_cancel - try to deactivate a timer | ||
422 | * | ||
423 | * @timer: hrtimer to stop | ||
424 | * | ||
425 | * Returns: | ||
426 | * 0 when the timer was not active | ||
427 | * 1 when the timer was active | ||
428 | * -1 when the timer is currently executing the callback function and | ||
429 | * cannot be stopped | ||
430 | */ | ||
431 | int hrtimer_try_to_cancel(struct hrtimer *timer) | ||
432 | { | ||
433 | struct hrtimer_base *base; | ||
434 | unsigned long flags; | ||
435 | int ret = -1; | ||
436 | |||
437 | base = lock_hrtimer_base(timer, &flags); | ||
438 | |||
439 | if (base->curr_timer != timer) | ||
440 | ret = remove_hrtimer(timer, base); | ||
441 | |||
442 | unlock_hrtimer_base(timer, &flags); | ||
443 | |||
444 | return ret; | ||
445 | |||
446 | } | ||
447 | |||
448 | /** | ||
449 | * hrtimer_cancel - cancel a timer and wait for the handler to finish. | ||
450 | * | ||
451 | * @timer: the timer to be cancelled | ||
452 | * | ||
453 | * Returns: | ||
454 | * 0 when the timer was not active | ||
455 | * 1 when the timer was active | ||
456 | */ | ||
457 | int hrtimer_cancel(struct hrtimer *timer) | ||
458 | { | ||
459 | for (;;) { | ||
460 | int ret = hrtimer_try_to_cancel(timer); | ||
461 | |||
462 | if (ret >= 0) | ||
463 | return ret; | ||
464 | } | ||
465 | } | ||
466 | |||
467 | /** | ||
468 | * hrtimer_get_remaining - get remaining time for the timer | ||
469 | * | ||
470 | * @timer: the timer to read | ||
471 | */ | ||
472 | ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | ||
473 | { | ||
474 | struct hrtimer_base *base; | ||
475 | unsigned long flags; | ||
476 | ktime_t rem; | ||
477 | |||
478 | base = lock_hrtimer_base(timer, &flags); | ||
479 | rem = ktime_sub(timer->expires, timer->base->get_time()); | ||
480 | unlock_hrtimer_base(timer, &flags); | ||
481 | |||
482 | return rem; | ||
483 | } | ||
484 | |||
485 | /** | ||
486 | * hrtimer_rebase - rebase an initialized hrtimer to a different base | ||
487 | * | ||
488 | * @timer: the timer to be rebased | ||
489 | * @clock_id: the clock to be used | ||
490 | */ | ||
491 | void hrtimer_rebase(struct hrtimer *timer, const clockid_t clock_id) | ||
492 | { | ||
493 | struct hrtimer_base *bases; | ||
494 | |||
495 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | ||
496 | timer->base = &bases[clock_id]; | ||
497 | } | ||
498 | |||
499 | /** | ||
500 | * hrtimer_init - initialize a timer to the given clock | ||
501 | * | ||
502 | * @timer: the timer to be initialized | ||
503 | * @clock_id: the clock to be used | ||
504 | */ | ||
505 | void hrtimer_init(struct hrtimer *timer, const clockid_t clock_id) | ||
506 | { | ||
507 | memset(timer, 0, sizeof(struct hrtimer)); | ||
508 | hrtimer_rebase(timer, clock_id); | ||
509 | } | ||
510 | |||
511 | /** | ||
512 | * hrtimer_get_res - get the timer resolution for a clock | ||
513 | * | ||
514 | * @which_clock: which clock to query | ||
515 | * @tp: pointer to timespec variable to store the resolution | ||
516 | * | ||
517 | * Store the resolution of the clock selected by which_clock in the | ||
518 | * variable pointed to by tp. | ||
519 | */ | ||
520 | int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | ||
521 | { | ||
522 | struct hrtimer_base *bases; | ||
523 | |||
524 | bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); | ||
525 | *tp = ktime_to_timespec(bases[which_clock].resolution); | ||
526 | |||
527 | return 0; | ||
528 | } | ||
529 | |||
530 | /* | ||
531 | * Expire the per base hrtimer-queue: | ||
532 | */ | ||
533 | static inline void run_hrtimer_queue(struct hrtimer_base *base) | ||
534 | { | ||
535 | ktime_t now = base->get_time(); | ||
536 | struct rb_node *node; | ||
537 | |||
538 | spin_lock_irq(&base->lock); | ||
539 | |||
540 | while ((node = base->first)) { | ||
541 | struct hrtimer *timer; | ||
542 | int (*fn)(void *); | ||
543 | int restart; | ||
544 | void *data; | ||
545 | |||
546 | timer = rb_entry(node, struct hrtimer, node); | ||
547 | if (now.tv64 <= timer->expires.tv64) | ||
548 | break; | ||
549 | |||
550 | fn = timer->function; | ||
551 | data = timer->data; | ||
552 | set_curr_timer(base, timer); | ||
553 | __remove_hrtimer(timer, base); | ||
554 | spin_unlock_irq(&base->lock); | ||
555 | |||
556 | /* | ||
557 | * fn == NULL is a special case for the simplest timer | ||
558 | * variant - wake up process and do not restart: | ||
559 | */ | ||
560 | if (!fn) { | ||
561 | wake_up_process(data); | ||
562 | restart = HRTIMER_NORESTART; | ||
563 | } else | ||
564 | restart = fn(data); | ||
565 | |||
566 | spin_lock_irq(&base->lock); | ||
567 | |||
568 | if (restart == HRTIMER_RESTART) | ||
569 | enqueue_hrtimer(timer, base); | ||
570 | else | ||
571 | timer->state = HRTIMER_EXPIRED; | ||
572 | } | ||
573 | set_curr_timer(base, NULL); | ||
574 | spin_unlock_irq(&base->lock); | ||
575 | } | ||
576 | |||
577 | /* | ||
578 | * Called from timer softirq every jiffy, expire hrtimers: | ||
579 | */ | ||
580 | void hrtimer_run_queues(void) | ||
581 | { | ||
582 | struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); | ||
583 | int i; | ||
584 | |||
585 | for (i = 0; i < MAX_HRTIMER_BASES; i++) | ||
586 | run_hrtimer_queue(&base[i]); | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * Sleep related functions: | ||
591 | */ | ||
592 | |||
593 | /** | ||
594 | * schedule_hrtimer - sleep until timeout | ||
595 | * | ||
596 | * @timer: hrtimer variable initialized with the correct clock base | ||
597 | * @mode: timeout value is abs/rel | ||
598 | * | ||
599 | * Make the current task sleep until the timeout stored in | ||
600 | * @timer->expires has elapsed. | ||
601 | * | ||
602 | * You can set the task state as follows - | ||
603 | * | ||
604 | * %TASK_UNINTERRUPTIBLE - at least the timeout is guaranteed to | ||
605 | * pass before the routine returns. The routine will return 0 | ||
606 | * | ||
607 | * %TASK_INTERRUPTIBLE - the routine may return early if a signal is | ||
608 | * delivered to the current task. In this case the remaining time | ||
609 | * will be returned | ||
610 | * | ||
611 | * The current task state is guaranteed to be TASK_RUNNING when this | ||
612 | * routine returns. | ||
613 | */ | ||
614 | static ktime_t __sched | ||
615 | schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode) | ||
616 | { | ||
617 | /* fn stays NULL, meaning single-shot wakeup: */ | ||
618 | timer->data = current; | ||
619 | |||
620 | hrtimer_start(timer, timer->expires, mode); | ||
621 | |||
622 | schedule(); | ||
623 | hrtimer_cancel(timer); | ||
624 | |||
625 | /* Return the remaining time: */ | ||
626 | if (timer->state != HRTIMER_EXPIRED) | ||
627 | return ktime_sub(timer->expires, timer->base->get_time()); | ||
628 | else | ||
629 | return (ktime_t) {.tv64 = 0 }; | ||
630 | } | ||
631 | |||
632 | static inline ktime_t __sched | ||
633 | schedule_hrtimer_interruptible(struct hrtimer *timer, | ||
634 | const enum hrtimer_mode mode) | ||
635 | { | ||
636 | set_current_state(TASK_INTERRUPTIBLE); | ||
637 | |||
638 | return schedule_hrtimer(timer, mode); | ||
639 | } | ||
640 | |||
641 | static long __sched | ||
642 | nanosleep_restart(struct restart_block *restart, clockid_t clockid) | ||
643 | { | ||
644 | struct timespec __user *rmtp; | ||
645 | struct timespec tu; | ||
646 | void *rfn_save = restart->fn; | ||
647 | struct hrtimer timer; | ||
648 | ktime_t rem; | ||
649 | |||
650 | restart->fn = do_no_restart_syscall; | ||
651 | |||
652 | hrtimer_init(&timer, clockid); | ||
653 | |||
654 | timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; | ||
655 | |||
656 | rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS); | ||
657 | |||
658 | if (rem.tv64 <= 0) | ||
659 | return 0; | ||
660 | |||
661 | rmtp = (struct timespec __user *) restart->arg2; | ||
662 | tu = ktime_to_timespec(rem); | ||
663 | if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) | ||
664 | return -EFAULT; | ||
665 | |||
666 | restart->fn = rfn_save; | ||
667 | |||
668 | /* The other values in restart are already filled in */ | ||
669 | return -ERESTART_RESTARTBLOCK; | ||
670 | } | ||
671 | |||
672 | static long __sched nanosleep_restart_mono(struct restart_block *restart) | ||
673 | { | ||
674 | return nanosleep_restart(restart, CLOCK_MONOTONIC); | ||
675 | } | ||
676 | |||
677 | static long __sched nanosleep_restart_real(struct restart_block *restart) | ||
678 | { | ||
679 | return nanosleep_restart(restart, CLOCK_REALTIME); | ||
680 | } | ||
681 | |||
682 | long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | ||
683 | const enum hrtimer_mode mode, const clockid_t clockid) | ||
684 | { | ||
685 | struct restart_block *restart; | ||
686 | struct hrtimer timer; | ||
687 | struct timespec tu; | ||
688 | ktime_t rem; | ||
689 | |||
690 | hrtimer_init(&timer, clockid); | ||
691 | |||
692 | timer.expires = timespec_to_ktime(*rqtp); | ||
693 | |||
694 | rem = schedule_hrtimer_interruptible(&timer, mode); | ||
695 | if (rem.tv64 <= 0) | ||
696 | return 0; | ||
697 | |||
698 | /* Absolute timers do not update the rmtp value: */ | ||
699 | if (mode == HRTIMER_ABS) | ||
700 | return -ERESTARTNOHAND; | ||
701 | |||
702 | tu = ktime_to_timespec(rem); | ||
703 | |||
704 | if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu))) | ||
705 | return -EFAULT; | ||
706 | |||
707 | restart = &current_thread_info()->restart_block; | ||
708 | restart->fn = (clockid == CLOCK_MONOTONIC) ? | ||
709 | nanosleep_restart_mono : nanosleep_restart_real; | ||
710 | restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; | ||
711 | restart->arg1 = timer.expires.tv64 >> 32; | ||
712 | restart->arg2 = (unsigned long) rmtp; | ||
713 | |||
714 | return -ERESTART_RESTARTBLOCK; | ||
715 | } | ||
716 | |||
717 | asmlinkage long | ||
718 | sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | ||
719 | { | ||
720 | struct timespec tu; | ||
721 | |||
722 | if (copy_from_user(&tu, rqtp, sizeof(tu))) | ||
723 | return -EFAULT; | ||
724 | |||
725 | if (!timespec_valid(&tu)) | ||
726 | return -EINVAL; | ||
727 | |||
728 | return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC); | ||
729 | } | ||
730 | |||
731 | /* | ||
732 | * Functions related to boot-time initialization: | ||
733 | */ | ||
734 | static void __devinit init_hrtimers_cpu(int cpu) | ||
735 | { | ||
736 | struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); | ||
737 | int i; | ||
738 | |||
739 | for (i = 0; i < MAX_HRTIMER_BASES; i++) { | ||
740 | spin_lock_init(&base->lock); | ||
741 | base++; | ||
742 | } | ||
743 | } | ||
744 | |||
745 | #ifdef CONFIG_HOTPLUG_CPU | ||
746 | |||
747 | static void migrate_hrtimer_list(struct hrtimer_base *old_base, | ||
748 | struct hrtimer_base *new_base) | ||
749 | { | ||
750 | struct hrtimer *timer; | ||
751 | struct rb_node *node; | ||
752 | |||
753 | while ((node = rb_first(&old_base->active))) { | ||
754 | timer = rb_entry(node, struct hrtimer, node); | ||
755 | __remove_hrtimer(timer, old_base); | ||
756 | timer->base = new_base; | ||
757 | enqueue_hrtimer(timer, new_base); | ||
758 | } | ||
759 | } | ||
760 | |||
761 | static void migrate_hrtimers(int cpu) | ||
762 | { | ||
763 | struct hrtimer_base *old_base, *new_base; | ||
764 | int i; | ||
765 | |||
766 | BUG_ON(cpu_online(cpu)); | ||
767 | old_base = per_cpu(hrtimer_bases, cpu); | ||
768 | new_base = get_cpu_var(hrtimer_bases); | ||
769 | |||
770 | local_irq_disable(); | ||
771 | |||
772 | for (i = 0; i < MAX_HRTIMER_BASES; i++) { | ||
773 | |||
774 | spin_lock(&new_base->lock); | ||
775 | spin_lock(&old_base->lock); | ||
776 | |||
777 | BUG_ON(old_base->curr_timer); | ||
778 | |||
779 | migrate_hrtimer_list(old_base, new_base); | ||
780 | |||
781 | spin_unlock(&old_base->lock); | ||
782 | spin_unlock(&new_base->lock); | ||
783 | old_base++; | ||
784 | new_base++; | ||
785 | } | ||
786 | |||
787 | local_irq_enable(); | ||
788 | put_cpu_var(hrtimer_bases); | ||
789 | } | ||
790 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
791 | |||
792 | static int __devinit hrtimer_cpu_notify(struct notifier_block *self, | ||
793 | unsigned long action, void *hcpu) | ||
794 | { | ||
795 | long cpu = (long)hcpu; | ||
796 | |||
797 | switch (action) { | ||
798 | |||
799 | case CPU_UP_PREPARE: | ||
800 | init_hrtimers_cpu(cpu); | ||
801 | break; | ||
802 | |||
803 | #ifdef CONFIG_HOTPLUG_CPU | ||
804 | case CPU_DEAD: | ||
805 | migrate_hrtimers(cpu); | ||
806 | break; | ||
807 | #endif | ||
808 | |||
809 | default: | ||
810 | break; | ||
811 | } | ||
812 | |||
813 | return NOTIFY_OK; | ||
814 | } | ||
815 | |||
816 | static struct notifier_block __devinitdata hrtimers_nb = { | ||
817 | .notifier_call = hrtimer_cpu_notify, | ||
818 | }; | ||
819 | |||
820 | void __init hrtimers_init(void) | ||
821 | { | ||
822 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, | ||
823 | (void *)(long)smp_processor_id()); | ||
824 | register_cpu_notifier(&hrtimers_nb); | ||
825 | } | ||
826 | |||
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 81c49a4d679e..97d5559997d2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -366,6 +366,8 @@ int request_irq(unsigned int irq, | |||
366 | action->next = NULL; | 366 | action->next = NULL; |
367 | action->dev_id = dev_id; | 367 | action->dev_id = dev_id; |
368 | 368 | ||
369 | select_smp_affinity(irq); | ||
370 | |||
369 | retval = setup_irq(irq, action); | 371 | retval = setup_irq(irq, action); |
370 | if (retval) | 372 | if (retval) |
371 | kfree(action); | 373 | kfree(action); |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f26e534c6585..d03b5eef8ce0 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -10,6 +10,8 @@ | |||
10 | #include <linux/proc_fs.h> | 10 | #include <linux/proc_fs.h> |
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | 12 | ||
13 | #include "internals.h" | ||
14 | |||
13 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | 15 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; |
14 | 16 | ||
15 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
@@ -68,7 +70,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
68 | */ | 70 | */ |
69 | cpus_and(tmp, new_value, cpu_online_map); | 71 | cpus_and(tmp, new_value, cpu_online_map); |
70 | if (cpus_empty(tmp)) | 72 | if (cpus_empty(tmp)) |
71 | return -EINVAL; | 73 | /* Special case for empty set - allow the architecture |
74 | code to set default SMP affinity. */ | ||
75 | return select_smp_affinity(irq) ? -EINVAL : full_count; | ||
72 | 76 | ||
73 | proc_set_irq_affinity(irq, new_value); | 77 | proc_set_irq_affinity(irq, new_value); |
74 | 78 | ||
diff --git a/kernel/itimer.c b/kernel/itimer.c index 7c1b25e25e47..c2c05c4ff28d 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c | |||
@@ -12,36 +12,46 @@ | |||
12 | #include <linux/syscalls.h> | 12 | #include <linux/syscalls.h> |
13 | #include <linux/time.h> | 13 | #include <linux/time.h> |
14 | #include <linux/posix-timers.h> | 14 | #include <linux/posix-timers.h> |
15 | #include <linux/hrtimer.h> | ||
15 | 16 | ||
16 | #include <asm/uaccess.h> | 17 | #include <asm/uaccess.h> |
17 | 18 | ||
18 | static unsigned long it_real_value(struct signal_struct *sig) | 19 | /** |
20 | * itimer_get_remtime - get remaining time for the timer | ||
21 | * | ||
22 | * @timer: the timer to read | ||
23 | * | ||
24 | * Returns the delta between the expiry time and now. A pending timer | ||
25 | * whose expiry time has already passed reports 1 usec instead. | ||
26 | */ | ||
27 | static struct timeval itimer_get_remtime(struct hrtimer *timer) | ||
19 | { | 28 | { |
20 | unsigned long val = 0; | 29 | ktime_t rem = hrtimer_get_remaining(timer); |
21 | if (timer_pending(&sig->real_timer)) { | ||
22 | val = sig->real_timer.expires - jiffies; | ||
23 | 30 | ||
24 | /* look out for negative/zero itimer.. */ | 31 | /* |
25 | if ((long) val <= 0) | 32 | * Racy but safe: if the itimer expires after the above |
26 | val = 1; | 33 | * hrtimer_get_remaining() call but before this condition |
27 | } | 34 | * then we return 0 - which is correct. |
28 | return val; | 35 | */ |
36 | if (hrtimer_active(timer)) { | ||
37 | if (rem.tv64 <= 0) | ||
38 | rem.tv64 = NSEC_PER_USEC; | ||
39 | } else | ||
40 | rem.tv64 = 0; | ||
41 | |||
42 | return ktime_to_timeval(rem); | ||
29 | } | 43 | } |
30 | 44 | ||
31 | int do_getitimer(int which, struct itimerval *value) | 45 | int do_getitimer(int which, struct itimerval *value) |
32 | { | 46 | { |
33 | struct task_struct *tsk = current; | 47 | struct task_struct *tsk = current; |
34 | unsigned long interval, val; | ||
35 | cputime_t cinterval, cval; | 48 | cputime_t cinterval, cval; |
36 | 49 | ||
37 | switch (which) { | 50 | switch (which) { |
38 | case ITIMER_REAL: | 51 | case ITIMER_REAL: |
39 | spin_lock_irq(&tsk->sighand->siglock); | 52 | value->it_value = itimer_get_remtime(&tsk->signal->real_timer); |
40 | interval = tsk->signal->it_real_incr; | 53 | value->it_interval = |
41 | val = it_real_value(tsk->signal); | 54 | ktime_to_timeval(tsk->signal->it_real_incr); |
42 | spin_unlock_irq(&tsk->sighand->siglock); | ||
43 | jiffies_to_timeval(val, &value->it_value); | ||
44 | jiffies_to_timeval(interval, &value->it_interval); | ||
45 | break; | 55 | break; |
46 | case ITIMER_VIRTUAL: | 56 | case ITIMER_VIRTUAL: |
47 | read_lock(&tasklist_lock); | 57 | read_lock(&tasklist_lock); |
@@ -113,59 +123,45 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) | |||
113 | } | 123 | } |
114 | 124 | ||
115 | 125 | ||
116 | void it_real_fn(unsigned long __data) | 126 | /* |
127 | * The timer is automagically restarted, when interval != 0 | ||
128 | */ | ||
129 | int it_real_fn(void *data) | ||
117 | { | 130 | { |
118 | struct task_struct * p = (struct task_struct *) __data; | 131 | struct task_struct *tsk = (struct task_struct *) data; |
119 | unsigned long inc = p->signal->it_real_incr; | ||
120 | 132 | ||
121 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); | 133 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, tsk); |
122 | 134 | ||
123 | /* | 135 | if (tsk->signal->it_real_incr.tv64 != 0) { |
124 | * Now restart the timer if necessary. We don't need any locking | 136 | hrtimer_forward(&tsk->signal->real_timer, |
125 | * here because do_setitimer makes sure we have finished running | 137 | tsk->signal->it_real_incr); |
126 | * before it touches anything. | 138 | |
127 | * Note, we KNOW we are (or should be) at a jiffie edge here so | 139 | return HRTIMER_RESTART; |
128 | * we don't need the +1 stuff. Also, we want to use the prior | 140 | } |
129 | * expire value so as to not "slip" a jiffie if we are late. | 141 | return HRTIMER_NORESTART; |
130 | * Deal with requesting a time prior to "now" here rather than | ||
131 | * in add_timer. | ||
132 | */ | ||
133 | if (!inc) | ||
134 | return; | ||
135 | while (time_before_eq(p->signal->real_timer.expires, jiffies)) | ||
136 | p->signal->real_timer.expires += inc; | ||
137 | add_timer(&p->signal->real_timer); | ||
138 | } | 142 | } |
139 | 143 | ||
140 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) | 144 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) |
141 | { | 145 | { |
142 | struct task_struct *tsk = current; | 146 | struct task_struct *tsk = current; |
143 | unsigned long val, interval, expires; | 147 | struct hrtimer *timer; |
148 | ktime_t expires; | ||
144 | cputime_t cval, cinterval, nval, ninterval; | 149 | cputime_t cval, cinterval, nval, ninterval; |
145 | 150 | ||
146 | switch (which) { | 151 | switch (which) { |
147 | case ITIMER_REAL: | 152 | case ITIMER_REAL: |
148 | again: | 153 | timer = &tsk->signal->real_timer; |
149 | spin_lock_irq(&tsk->sighand->siglock); | 154 | hrtimer_cancel(timer); |
150 | interval = tsk->signal->it_real_incr; | ||
151 | val = it_real_value(tsk->signal); | ||
152 | /* We are sharing ->siglock with it_real_fn() */ | ||
153 | if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) { | ||
154 | spin_unlock_irq(&tsk->sighand->siglock); | ||
155 | goto again; | ||
156 | } | ||
157 | tsk->signal->it_real_incr = | ||
158 | timeval_to_jiffies(&value->it_interval); | ||
159 | expires = timeval_to_jiffies(&value->it_value); | ||
160 | if (expires) | ||
161 | mod_timer(&tsk->signal->real_timer, | ||
162 | jiffies + 1 + expires); | ||
163 | spin_unlock_irq(&tsk->sighand->siglock); | ||
164 | if (ovalue) { | 155 | if (ovalue) { |
165 | jiffies_to_timeval(val, &ovalue->it_value); | 156 | ovalue->it_value = itimer_get_remtime(timer); |
166 | jiffies_to_timeval(interval, | 157 | ovalue->it_interval |
167 | &ovalue->it_interval); | 158 | = ktime_to_timeval(tsk->signal->it_real_incr); |
168 | } | 159 | } |
160 | tsk->signal->it_real_incr = | ||
161 | timeval_to_ktime(value->it_interval); | ||
162 | expires = timeval_to_ktime(value->it_value); | ||
163 | if (expires.tv64 != 0) | ||
164 | hrtimer_start(timer, expires, HRTIMER_REL); | ||
169 | break; | 165 | break; |
170 | case ITIMER_VIRTUAL: | 166 | case ITIMER_VIRTUAL: |
171 | nval = timeval_to_cputime(&value->it_value); | 167 | nval = timeval_to_cputime(&value->it_value); |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 2c95848fbce8..bf39d28e4c0e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -6,6 +6,7 @@ | |||
6 | * Version 2. See the file COPYING for more details. | 6 | * Version 2. See the file COPYING for more details. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/capability.h> | ||
9 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
10 | #include <linux/file.h> | 11 | #include <linux/file.h> |
11 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
@@ -26,6 +27,9 @@ | |||
26 | #include <asm/system.h> | 27 | #include <asm/system.h> |
27 | #include <asm/semaphore.h> | 28 | #include <asm/semaphore.h> |
28 | 29 | ||
30 | /* Per cpu memory for storing cpu states in case of system crash. */ | ||
31 | note_buf_t* crash_notes; | ||
32 | |||
29 | /* Location of the reserved area for the crash kernel */ | 33 | /* Location of the reserved area for the crash kernel */ |
30 | struct resource crashk_res = { | 34 | struct resource crashk_res = { |
31 | .name = "Crash kernel", | 35 | .name = "Crash kernel", |
@@ -1054,9 +1058,24 @@ void crash_kexec(struct pt_regs *regs) | |||
1054 | if (!locked) { | 1058 | if (!locked) { |
1055 | image = xchg(&kexec_crash_image, NULL); | 1059 | image = xchg(&kexec_crash_image, NULL); |
1056 | if (image) { | 1060 | if (image) { |
1057 | machine_crash_shutdown(regs); | 1061 | struct pt_regs fixed_regs; |
1062 | crash_setup_regs(&fixed_regs, regs); | ||
1063 | machine_crash_shutdown(&fixed_regs); | ||
1058 | machine_kexec(image); | 1064 | machine_kexec(image); |
1059 | } | 1065 | } |
1060 | xchg(&kexec_lock, 0); | 1066 | xchg(&kexec_lock, 0); |
1061 | } | 1067 | } |
1062 | } | 1068 | } |
1069 | |||
1070 | static int __init crash_notes_memory_init(void) | ||
1071 | { | ||
1072 | /* Allocate memory for saving cpu registers. */ | ||
1073 | crash_notes = alloc_percpu(note_buf_t); | ||
1074 | if (!crash_notes) { | ||
1075 | printk("Kexec: Memory allocation for saving cpu register" | ||
1076 | " states failed\n"); | ||
1077 | return -ENOMEM; | ||
1078 | } | ||
1079 | return 0; | ||
1080 | } | ||
1081 | module_init(crash_notes_memory_init) | ||
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3bb71e63a37e..3ea6325228da 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -48,10 +48,11 @@ | |||
48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
50 | 50 | ||
51 | static DEFINE_SPINLOCK(kprobe_lock); /* Protects kprobe_table */ | 51 | DECLARE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
54 | 54 | ||
55 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT | ||
55 | /* | 56 | /* |
56 | * kprobe->ainsn.insn points to the copy of the instruction to be | 57 | * kprobe->ainsn.insn points to the copy of the instruction to be |
57 | * single-stepped. x86_64, POWER4 and above have no-exec support and | 58 | * single-stepped. x86_64, POWER4 and above have no-exec support and |
@@ -151,6 +152,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) | |||
151 | } | 152 | } |
152 | } | 153 | } |
153 | } | 154 | } |
155 | #endif | ||
154 | 156 | ||
155 | /* We have preemption disabled.. so it is safe to use __ versions */ | 157 | /* We have preemption disabled.. so it is safe to use __ versions */ |
156 | static inline void set_kprobe_instance(struct kprobe *kp) | 158 | static inline void set_kprobe_instance(struct kprobe *kp) |
@@ -165,7 +167,7 @@ static inline void reset_kprobe_instance(void) | |||
165 | 167 | ||
166 | /* | 168 | /* |
167 | * This routine is called either: | 169 | * This routine is called either: |
168 | * - under the kprobe_lock spinlock - during kprobe_[un]register() | 170 | * - under the kprobe_mutex - during kprobe_[un]register() |
169 | * OR | 171 | * OR |
170 | * - with preemption disabled - from arch/xxx/kernel/kprobes.c | 172 | * - with preemption disabled - from arch/xxx/kernel/kprobes.c |
171 | */ | 173 | */ |
@@ -418,7 +420,6 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
418 | /* | 420 | /* |
419 | * This is the second or subsequent kprobe at the address - handle | 421 | * This is the second or subsequent kprobe at the address - handle |
420 | * the intricacies | 422 | * the intricacies |
421 | * TODO: Move kcalloc outside the spin_lock | ||
422 | */ | 423 | */ |
423 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | 424 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, |
424 | struct kprobe *p) | 425 | struct kprobe *p) |
@@ -430,7 +431,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
430 | copy_kprobe(old_p, p); | 431 | copy_kprobe(old_p, p); |
431 | ret = add_new_kprobe(old_p, p); | 432 | ret = add_new_kprobe(old_p, p); |
432 | } else { | 433 | } else { |
433 | ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); | 434 | ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); |
434 | if (!ap) | 435 | if (!ap) |
435 | return -ENOMEM; | 436 | return -ENOMEM; |
436 | add_aggr_kprobe(ap, old_p); | 437 | add_aggr_kprobe(ap, old_p); |
@@ -440,25 +441,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
440 | return ret; | 441 | return ret; |
441 | } | 442 | } |
442 | 443 | ||
443 | /* kprobe removal house-keeping routines */ | ||
444 | static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) | ||
445 | { | ||
446 | arch_disarm_kprobe(p); | ||
447 | hlist_del_rcu(&p->hlist); | ||
448 | spin_unlock_irqrestore(&kprobe_lock, flags); | ||
449 | arch_remove_kprobe(p); | ||
450 | } | ||
451 | |||
452 | static inline void cleanup_aggr_kprobe(struct kprobe *old_p, | ||
453 | struct kprobe *p, unsigned long flags) | ||
454 | { | ||
455 | list_del_rcu(&p->list); | ||
456 | if (list_empty(&old_p->list)) | ||
457 | cleanup_kprobe(old_p, flags); | ||
458 | else | ||
459 | spin_unlock_irqrestore(&kprobe_lock, flags); | ||
460 | } | ||
461 | |||
462 | static int __kprobes in_kprobes_functions(unsigned long addr) | 444 | static int __kprobes in_kprobes_functions(unsigned long addr) |
463 | { | 445 | { |
464 | if (addr >= (unsigned long)__kprobes_text_start | 446 | if (addr >= (unsigned long)__kprobes_text_start |
@@ -467,33 +449,44 @@ static int __kprobes in_kprobes_functions(unsigned long addr) | |||
467 | return 0; | 449 | return 0; |
468 | } | 450 | } |
469 | 451 | ||
470 | int __kprobes register_kprobe(struct kprobe *p) | 452 | static int __kprobes __register_kprobe(struct kprobe *p, |
453 | unsigned long called_from) | ||
471 | { | 454 | { |
472 | int ret = 0; | 455 | int ret = 0; |
473 | unsigned long flags = 0; | ||
474 | struct kprobe *old_p; | 456 | struct kprobe *old_p; |
475 | struct module *mod; | 457 | struct module *probed_mod; |
476 | 458 | ||
477 | if ((!kernel_text_address((unsigned long) p->addr)) || | 459 | if ((!kernel_text_address((unsigned long) p->addr)) || |
478 | in_kprobes_functions((unsigned long) p->addr)) | 460 | in_kprobes_functions((unsigned long) p->addr)) |
479 | return -EINVAL; | 461 | return -EINVAL; |
480 | 462 | ||
481 | if ((mod = module_text_address((unsigned long) p->addr)) && | 463 | p->mod_refcounted = 0; |
482 | (unlikely(!try_module_get(mod)))) | 464 | /* Check whether we are probing a module */ |
483 | return -EINVAL; | 465 | if ((probed_mod = module_text_address((unsigned long) p->addr))) { |
484 | 466 | struct module *calling_mod = module_text_address(called_from); | |
485 | if ((ret = arch_prepare_kprobe(p)) != 0) | 467 | /* We must allow modules to probe themselves and |
486 | goto rm_kprobe; | 468 | * in this case avoid incrementing the module refcount, |
469 | * so as to allow unloading of self probing modules. | ||
470 | */ | ||
471 | if (calling_mod && (calling_mod != probed_mod)) { | ||
472 | if (unlikely(!try_module_get(probed_mod))) | ||
473 | return -EINVAL; | ||
474 | p->mod_refcounted = 1; | ||
475 | } else | ||
476 | probed_mod = NULL; | ||
477 | } | ||
487 | 478 | ||
488 | p->nmissed = 0; | 479 | p->nmissed = 0; |
489 | spin_lock_irqsave(&kprobe_lock, flags); | 480 | down(&kprobe_mutex); |
490 | old_p = get_kprobe(p->addr); | 481 | old_p = get_kprobe(p->addr); |
491 | if (old_p) { | 482 | if (old_p) { |
492 | ret = register_aggr_kprobe(old_p, p); | 483 | ret = register_aggr_kprobe(old_p, p); |
493 | goto out; | 484 | goto out; |
494 | } | 485 | } |
495 | 486 | ||
496 | arch_copy_kprobe(p); | 487 | if ((ret = arch_prepare_kprobe(p)) != 0) |
488 | goto out; | ||
489 | |||
497 | INIT_HLIST_NODE(&p->hlist); | 490 | INIT_HLIST_NODE(&p->hlist); |
498 | hlist_add_head_rcu(&p->hlist, | 491 | hlist_add_head_rcu(&p->hlist, |
499 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 492 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
@@ -501,40 +494,66 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
501 | arch_arm_kprobe(p); | 494 | arch_arm_kprobe(p); |
502 | 495 | ||
503 | out: | 496 | out: |
504 | spin_unlock_irqrestore(&kprobe_lock, flags); | 497 | up(&kprobe_mutex); |
505 | rm_kprobe: | 498 | |
506 | if (ret == -EEXIST) | 499 | if (ret && probed_mod) |
507 | arch_remove_kprobe(p); | 500 | module_put(probed_mod); |
508 | if (ret && mod) | ||
509 | module_put(mod); | ||
510 | return ret; | 501 | return ret; |
511 | } | 502 | } |
512 | 503 | ||
504 | int __kprobes register_kprobe(struct kprobe *p) | ||
505 | { | ||
506 | return __register_kprobe(p, | ||
507 | (unsigned long)__builtin_return_address(0)); | ||
508 | } | ||
509 | |||
513 | void __kprobes unregister_kprobe(struct kprobe *p) | 510 | void __kprobes unregister_kprobe(struct kprobe *p) |
514 | { | 511 | { |
515 | unsigned long flags; | ||
516 | struct kprobe *old_p; | ||
517 | struct module *mod; | 512 | struct module *mod; |
513 | struct kprobe *old_p, *list_p; | ||
514 | int cleanup_p; | ||
518 | 515 | ||
519 | spin_lock_irqsave(&kprobe_lock, flags); | 516 | down(&kprobe_mutex); |
520 | old_p = get_kprobe(p->addr); | 517 | old_p = get_kprobe(p->addr); |
521 | if (old_p) { | 518 | if (unlikely(!old_p)) { |
522 | /* cleanup_*_kprobe() does the spin_unlock_irqrestore */ | 519 | up(&kprobe_mutex); |
523 | if (old_p->pre_handler == aggr_pre_handler) | 520 | return; |
524 | cleanup_aggr_kprobe(old_p, p, flags); | 521 | } |
525 | else | 522 | if (p != old_p) { |
526 | cleanup_kprobe(p, flags); | 523 | list_for_each_entry_rcu(list_p, &old_p->list, list) |
524 | if (list_p == p) | ||
525 | /* kprobe p is a valid probe */ | ||
526 | goto valid_p; | ||
527 | up(&kprobe_mutex); | ||
528 | return; | ||
529 | } | ||
530 | valid_p: | ||
531 | if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && | ||
532 | (p->list.next == &old_p->list) && | ||
533 | (p->list.prev == &old_p->list))) { | ||
534 | /* Only probe on the hash list */ | ||
535 | arch_disarm_kprobe(p); | ||
536 | hlist_del_rcu(&old_p->hlist); | ||
537 | cleanup_p = 1; | ||
538 | } else { | ||
539 | list_del_rcu(&p->list); | ||
540 | cleanup_p = 0; | ||
541 | } | ||
527 | 542 | ||
528 | synchronize_sched(); | 543 | up(&kprobe_mutex); |
529 | 544 | ||
530 | if ((mod = module_text_address((unsigned long)p->addr))) | 545 | synchronize_sched(); |
531 | module_put(mod); | 546 | if (p->mod_refcounted && |
547 | (mod = module_text_address((unsigned long)p->addr))) | ||
548 | module_put(mod); | ||
532 | 549 | ||
533 | if (old_p->pre_handler == aggr_pre_handler && | 550 | if (cleanup_p) { |
534 | list_empty(&old_p->list)) | 551 | if (p != old_p) { |
552 | list_del_rcu(&p->list); | ||
535 | kfree(old_p); | 553 | kfree(old_p); |
536 | } else | 554 | } |
537 | spin_unlock_irqrestore(&kprobe_lock, flags); | 555 | arch_remove_kprobe(p); |
556 | } | ||
538 | } | 557 | } |
539 | 558 | ||
540 | static struct notifier_block kprobe_exceptions_nb = { | 559 | static struct notifier_block kprobe_exceptions_nb = { |
@@ -548,7 +567,8 @@ int __kprobes register_jprobe(struct jprobe *jp) | |||
548 | jp->kp.pre_handler = setjmp_pre_handler; | 567 | jp->kp.pre_handler = setjmp_pre_handler; |
549 | jp->kp.break_handler = longjmp_break_handler; | 568 | jp->kp.break_handler = longjmp_break_handler; |
550 | 569 | ||
551 | return register_kprobe(&jp->kp); | 570 | return __register_kprobe(&jp->kp, |
571 | (unsigned long)__builtin_return_address(0)); | ||
552 | } | 572 | } |
553 | 573 | ||
554 | void __kprobes unregister_jprobe(struct jprobe *jp) | 574 | void __kprobes unregister_jprobe(struct jprobe *jp) |
@@ -588,7 +608,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
588 | 608 | ||
589 | rp->nmissed = 0; | 609 | rp->nmissed = 0; |
590 | /* Establish function entry probe point */ | 610 | /* Establish function entry probe point */ |
591 | if ((ret = register_kprobe(&rp->kp)) != 0) | 611 | if ((ret = __register_kprobe(&rp->kp, |
612 | (unsigned long)__builtin_return_address(0))) != 0) | ||
592 | free_rp_inst(rp); | 613 | free_rp_inst(rp); |
593 | return ret; | 614 | return ret; |
594 | } | 615 | } |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 015fb69ad94d..d5eeae0fa5bc 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -15,6 +15,9 @@ | |||
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
17 | 17 | ||
18 | u64 uevent_seqnum; | ||
19 | char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug"; | ||
20 | |||
18 | #define KERNEL_ATTR_RO(_name) \ | 21 | #define KERNEL_ATTR_RO(_name) \ |
19 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 22 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) |
20 | 23 | ||
@@ -23,21 +26,29 @@ static struct subsys_attribute _name##_attr = \ | |||
23 | __ATTR(_name, 0644, _name##_show, _name##_store) | 26 | __ATTR(_name, 0644, _name##_show, _name##_store) |
24 | 27 | ||
25 | #ifdef CONFIG_HOTPLUG | 28 | #ifdef CONFIG_HOTPLUG |
26 | static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) | 29 | /* current uevent sequence number */ |
30 | static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) | ||
27 | { | 31 | { |
28 | return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); | 32 | return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); |
29 | } | 33 | } |
30 | KERNEL_ATTR_RO(hotplug_seqnum); | 34 | KERNEL_ATTR_RO(uevent_seqnum); |
31 | #endif | ||
32 | |||
33 | #ifdef CONFIG_KEXEC | ||
34 | #include <asm/kexec.h> | ||
35 | 35 | ||
36 | static ssize_t crash_notes_show(struct subsystem *subsys, char *page) | 36 | /* uevent helper program, used during early boot */ |
37 | static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) | ||
37 | { | 38 | { |
38 | return sprintf(page, "%p\n", (void *)crash_notes); | 39 | return sprintf(page, "%s\n", uevent_helper); |
39 | } | 40 | } |
40 | KERNEL_ATTR_RO(crash_notes); | 41 | static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) |
42 | { | ||
43 | if (count+1 > UEVENT_HELPER_PATH_LEN) | ||
44 | return -ENOENT; | ||
45 | memcpy(uevent_helper, page, count); | ||
46 | uevent_helper[count] = '\0'; | ||
47 | if (count && uevent_helper[count-1] == '\n') | ||
48 | uevent_helper[count-1] = '\0'; | ||
49 | return count; | ||
50 | } | ||
51 | KERNEL_ATTR_RW(uevent_helper); | ||
41 | #endif | 52 | #endif |
42 | 53 | ||
43 | decl_subsys(kernel, NULL, NULL); | 54 | decl_subsys(kernel, NULL, NULL); |
@@ -45,10 +56,8 @@ EXPORT_SYMBOL_GPL(kernel_subsys); | |||
45 | 56 | ||
46 | static struct attribute * kernel_attrs[] = { | 57 | static struct attribute * kernel_attrs[] = { |
47 | #ifdef CONFIG_HOTPLUG | 58 | #ifdef CONFIG_HOTPLUG |
48 | &hotplug_seqnum_attr.attr, | 59 | &uevent_seqnum_attr.attr, |
49 | #endif | 60 | &uevent_helper_attr.attr, |
50 | #ifdef CONFIG_KEXEC | ||
51 | &crash_notes_attr.attr, | ||
52 | #endif | 61 | #endif |
53 | NULL | 62 | NULL |
54 | }; | 63 | }; |
diff --git a/kernel/module.c b/kernel/module.c index 2ea929d51ad0..618ed6e23ecc 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/syscalls.h> | 28 | #include <linux/syscalls.h> |
29 | #include <linux/fcntl.h> | 29 | #include <linux/fcntl.h> |
30 | #include <linux/rcupdate.h> | 30 | #include <linux/rcupdate.h> |
31 | #include <linux/capability.h> | ||
31 | #include <linux/cpu.h> | 32 | #include <linux/cpu.h> |
32 | #include <linux/moduleparam.h> | 33 | #include <linux/moduleparam.h> |
33 | #include <linux/errno.h> | 34 | #include <linux/errno.h> |
@@ -496,15 +497,15 @@ static void module_unload_free(struct module *mod) | |||
496 | } | 497 | } |
497 | 498 | ||
498 | #ifdef CONFIG_MODULE_FORCE_UNLOAD | 499 | #ifdef CONFIG_MODULE_FORCE_UNLOAD |
499 | static inline int try_force(unsigned int flags) | 500 | static inline int try_force_unload(unsigned int flags) |
500 | { | 501 | { |
501 | int ret = (flags & O_TRUNC); | 502 | int ret = (flags & O_TRUNC); |
502 | if (ret) | 503 | if (ret) |
503 | add_taint(TAINT_FORCED_MODULE); | 504 | add_taint(TAINT_FORCED_RMMOD); |
504 | return ret; | 505 | return ret; |
505 | } | 506 | } |
506 | #else | 507 | #else |
507 | static inline int try_force(unsigned int flags) | 508 | static inline int try_force_unload(unsigned int flags) |
508 | { | 509 | { |
509 | return 0; | 510 | return 0; |
510 | } | 511 | } |
@@ -524,7 +525,7 @@ static int __try_stop_module(void *_sref) | |||
524 | 525 | ||
525 | /* If it's not unused, quit unless we are told to block. */ | 526 | /* If it's not unused, quit unless we are told to block. */ |
526 | if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { | 527 | if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { |
527 | if (!(*sref->forced = try_force(sref->flags))) | 528 | if (!(*sref->forced = try_force_unload(sref->flags))) |
528 | return -EWOULDBLOCK; | 529 | return -EWOULDBLOCK; |
529 | } | 530 | } |
530 | 531 | ||
@@ -609,7 +610,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
609 | /* If it has an init func, it must have an exit func to unload */ | 610 | /* If it has an init func, it must have an exit func to unload */ |
610 | if ((mod->init != NULL && mod->exit == NULL) | 611 | if ((mod->init != NULL && mod->exit == NULL) |
611 | || mod->unsafe) { | 612 | || mod->unsafe) { |
612 | forced = try_force(flags); | 613 | forced = try_force_unload(flags); |
613 | if (!forced) { | 614 | if (!forced) { |
614 | /* This module can't be removed */ | 615 | /* This module can't be removed */ |
615 | ret = -EBUSY; | 616 | ret = -EBUSY; |
@@ -958,7 +959,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, | |||
958 | unsigned long ret; | 959 | unsigned long ret; |
959 | const unsigned long *crc; | 960 | const unsigned long *crc; |
960 | 961 | ||
961 | spin_lock_irq(&modlist_lock); | ||
962 | ret = __find_symbol(name, &owner, &crc, mod->license_gplok); | 962 | ret = __find_symbol(name, &owner, &crc, mod->license_gplok); |
963 | if (ret) { | 963 | if (ret) { |
964 | /* use_module can fail due to OOM, or module unloading */ | 964 | /* use_module can fail due to OOM, or module unloading */ |
@@ -966,7 +966,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, | |||
966 | !use_module(mod, owner)) | 966 | !use_module(mod, owner)) |
967 | ret = 0; | 967 | ret = 0; |
968 | } | 968 | } |
969 | spin_unlock_irq(&modlist_lock); | ||
970 | return ret; | 969 | return ret; |
971 | } | 970 | } |
972 | 971 | ||
@@ -1204,6 +1203,39 @@ void *__symbol_get(const char *symbol) | |||
1204 | } | 1203 | } |
1205 | EXPORT_SYMBOL_GPL(__symbol_get); | 1204 | EXPORT_SYMBOL_GPL(__symbol_get); |
1206 | 1205 | ||
1206 | /* | ||
1207 | * Ensure that an exported symbol [global namespace] does not already exist | ||
1208 | * in the Kernel or in some other module's exported symbol table. | ||
1209 | */ | ||
1210 | static int verify_export_symbols(struct module *mod) | ||
1211 | { | ||
1212 | const char *name = NULL; | ||
1213 | unsigned long i, ret = 0; | ||
1214 | struct module *owner; | ||
1215 | const unsigned long *crc; | ||
1216 | |||
1217 | for (i = 0; i < mod->num_syms; i++) | ||
1218 | if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { | ||
1219 | name = mod->syms[i].name; | ||
1220 | ret = -ENOEXEC; | ||
1221 | goto dup; | ||
1222 | } | ||
1223 | |||
1224 | for (i = 0; i < mod->num_gpl_syms; i++) | ||
1225 | if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { | ||
1226 | name = mod->gpl_syms[i].name; | ||
1227 | ret = -ENOEXEC; | ||
1228 | goto dup; | ||
1229 | } | ||
1230 | |||
1231 | dup: | ||
1232 | if (ret) | ||
1233 | printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n", | ||
1234 | mod->name, name, module_name(owner)); | ||
1235 | |||
1236 | return ret; | ||
1237 | } | ||
1238 | |||
1207 | /* Change all symbols so that sh_value encodes the pointer directly. */ | 1239 | /* Change all symbols so that sh_value encodes the pointer directly. */ |
1208 | static int simplify_symbols(Elf_Shdr *sechdrs, | 1240 | static int simplify_symbols(Elf_Shdr *sechdrs, |
1209 | unsigned int symindex, | 1241 | unsigned int symindex, |
@@ -1715,6 +1747,11 @@ static struct module *load_module(void __user *umod, | |||
1715 | /* Set up license info based on the info section */ | 1747 | /* Set up license info based on the info section */ |
1716 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1748 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
1717 | 1749 | ||
1750 | if (strcmp(mod->name, "ndiswrapper") == 0) | ||
1751 | add_taint(TAINT_PROPRIETARY_MODULE); | ||
1752 | if (strcmp(mod->name, "driverloader") == 0) | ||
1753 | add_taint(TAINT_PROPRIETARY_MODULE); | ||
1754 | |||
1718 | #ifdef CONFIG_MODULE_UNLOAD | 1755 | #ifdef CONFIG_MODULE_UNLOAD |
1719 | /* Set up MODINFO_ATTR fields */ | 1756 | /* Set up MODINFO_ATTR fields */ |
1720 | setup_modinfo(mod, sechdrs, infoindex); | 1757 | setup_modinfo(mod, sechdrs, infoindex); |
@@ -1767,6 +1804,12 @@ static struct module *load_module(void __user *umod, | |||
1767 | goto cleanup; | 1804 | goto cleanup; |
1768 | } | 1805 | } |
1769 | 1806 | ||
1807 | /* Find duplicate symbols */ | ||
1808 | err = verify_export_symbols(mod); | ||
1809 | |||
1810 | if (err < 0) | ||
1811 | goto cleanup; | ||
1812 | |||
1770 | /* Set up and sort exception table */ | 1813 | /* Set up and sort exception table */ |
1771 | mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); | 1814 | mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); |
1772 | mod->extable = extable = (void *)sechdrs[exindex].sh_addr; | 1815 | mod->extable = extable = (void *)sechdrs[exindex].sh_addr; |
@@ -1854,8 +1897,7 @@ static struct module *load_module(void __user *umod, | |||
1854 | kfree(args); | 1897 | kfree(args); |
1855 | free_hdr: | 1898 | free_hdr: |
1856 | vfree(hdr); | 1899 | vfree(hdr); |
1857 | if (err < 0) return ERR_PTR(err); | 1900 | return ERR_PTR(err); |
1858 | else return ptr; | ||
1859 | 1901 | ||
1860 | truncated: | 1902 | truncated: |
1861 | printk(KERN_ERR "Module len %lu truncated\n", len); | 1903 | printk(KERN_ERR "Module len %lu truncated\n", len); |
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c new file mode 100644 index 000000000000..f4913c376950 --- /dev/null +++ b/kernel/mutex-debug.c | |||
@@ -0,0 +1,462 @@ | |||
1 | /* | ||
2 | * kernel/mutex-debug.c | ||
3 | * | ||
4 | * Debugging code for mutexes | ||
5 | * | ||
6 | * Started by Ingo Molnar: | ||
7 | * | ||
8 | * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
9 | * | ||
10 | * lock debugging, locking tree, deadlock detection started by: | ||
11 | * | ||
12 | * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey | ||
13 | * Released under the General Public License (GPL). | ||
14 | */ | ||
15 | #include <linux/mutex.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/spinlock.h> | ||
20 | #include <linux/kallsyms.h> | ||
21 | #include <linux/interrupt.h> | ||
22 | |||
23 | #include "mutex-debug.h" | ||
24 | |||
25 | /* | ||
26 | * We need a global lock when we walk through the multi-process | ||
27 | * lock tree. Only used in the deadlock-debugging case. | ||
28 | */ | ||
29 | DEFINE_SPINLOCK(debug_mutex_lock); | ||
30 | |||
31 | /* | ||
32 | * All locks held by all tasks, in a single global list: | ||
33 | */ | ||
34 | LIST_HEAD(debug_mutex_held_locks); | ||
35 | |||
36 | /* | ||
37 | * In the debug case we carry the caller's instruction pointer into | ||
38 | * other functions, but we don't want the function argument overhead | ||
39 | * in the nondebug case - hence these macros: | ||
40 | */ | ||
41 | #define __IP_DECL__ , unsigned long ip | ||
42 | #define __IP__ , ip | ||
43 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
44 | |||
45 | /* | ||
46 | * "mutex debugging enabled" flag. We turn it off when we detect | ||
47 | * the first problem because we dont want to recurse back | ||
48 | * into the tracing code when doing error printk or | ||
49 | * executing a BUG(): | ||
50 | */ | ||
51 | int debug_mutex_on = 1; | ||
52 | |||
53 | static void printk_task(struct task_struct *p) | ||
54 | { | ||
55 | if (p) | ||
56 | printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
57 | else | ||
58 | printk("<none>"); | ||
59 | } | ||
60 | |||
61 | static void printk_ti(struct thread_info *ti) | ||
62 | { | ||
63 | if (ti) | ||
64 | printk_task(ti->task); | ||
65 | else | ||
66 | printk("<none>"); | ||
67 | } | ||
68 | |||
69 | static void printk_task_short(struct task_struct *p) | ||
70 | { | ||
71 | if (p) | ||
72 | printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
73 | else | ||
74 | printk("<none>"); | ||
75 | } | ||
76 | |||
77 | static void printk_lock(struct mutex *lock, int print_owner) | ||
78 | { | ||
79 | printk(" [%p] {%s}\n", lock, lock->name); | ||
80 | |||
81 | if (print_owner && lock->owner) { | ||
82 | printk(".. held by: "); | ||
83 | printk_ti(lock->owner); | ||
84 | printk("\n"); | ||
85 | } | ||
86 | if (lock->owner) { | ||
87 | printk("... acquired at: "); | ||
88 | print_symbol("%s\n", lock->acquire_ip); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * printk locks held by a task: | ||
94 | */ | ||
95 | static void show_task_locks(struct task_struct *p) | ||
96 | { | ||
97 | switch (p->state) { | ||
98 | case TASK_RUNNING: printk("R"); break; | ||
99 | case TASK_INTERRUPTIBLE: printk("S"); break; | ||
100 | case TASK_UNINTERRUPTIBLE: printk("D"); break; | ||
101 | case TASK_STOPPED: printk("T"); break; | ||
102 | case EXIT_ZOMBIE: printk("Z"); break; | ||
103 | case EXIT_DEAD: printk("X"); break; | ||
104 | default: printk("?"); break; | ||
105 | } | ||
106 | printk_task(p); | ||
107 | if (p->blocked_on) { | ||
108 | struct mutex *lock = p->blocked_on->lock; | ||
109 | |||
110 | printk(" blocked on mutex:"); | ||
111 | printk_lock(lock, 1); | ||
112 | } else | ||
113 | printk(" (not blocked on mutex)\n"); | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * printk all locks held in the system (if filter == NULL), | ||
118 | * or all locks belonging to a single task (if filter != NULL): | ||
119 | */ | ||
120 | void show_held_locks(struct task_struct *filter) | ||
121 | { | ||
122 | struct list_head *curr, *cursor = NULL; | ||
123 | struct mutex *lock; | ||
124 | struct thread_info *t; | ||
125 | unsigned long flags; | ||
126 | int count = 0; | ||
127 | |||
128 | if (filter) { | ||
129 | printk("------------------------------\n"); | ||
130 | printk("| showing all locks held by: | ("); | ||
131 | printk_task_short(filter); | ||
132 | printk("):\n"); | ||
133 | printk("------------------------------\n"); | ||
134 | } else { | ||
135 | printk("---------------------------\n"); | ||
136 | printk("| showing all locks held: |\n"); | ||
137 | printk("---------------------------\n"); | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Play safe and acquire the global trace lock. We | ||
142 | * cannot printk with that lock held so we iterate | ||
143 | * very carefully: | ||
144 | */ | ||
145 | next: | ||
146 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
147 | list_for_each(curr, &debug_mutex_held_locks) { | ||
148 | if (cursor && curr != cursor) | ||
149 | continue; | ||
150 | lock = list_entry(curr, struct mutex, held_list); | ||
151 | t = lock->owner; | ||
152 | if (filter && (t != filter->thread_info)) | ||
153 | continue; | ||
154 | count++; | ||
155 | cursor = curr->next; | ||
156 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
157 | |||
158 | printk("\n#%03d: ", count); | ||
159 | printk_lock(lock, filter ? 0 : 1); | ||
160 | goto next; | ||
161 | } | ||
162 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
163 | printk("\n"); | ||
164 | } | ||
165 | |||
166 | void mutex_debug_show_all_locks(void) | ||
167 | { | ||
168 | struct task_struct *g, *p; | ||
169 | int count = 10; | ||
170 | int unlock = 1; | ||
171 | |||
172 | printk("\nShowing all blocking locks in the system:\n"); | ||
173 | |||
174 | /* | ||
175 | * Here we try to get the tasklist_lock as hard as possible, | ||
176 | * if not successful after 2 seconds we ignore it (but keep | ||
177 | * trying). This is to enable a debug printout even if a | ||
178 | * tasklist_lock-holding task deadlocks or crashes. | ||
179 | */ | ||
180 | retry: | ||
181 | if (!read_trylock(&tasklist_lock)) { | ||
182 | if (count == 10) | ||
183 | printk("hm, tasklist_lock locked, retrying... "); | ||
184 | if (count) { | ||
185 | count--; | ||
186 | printk(" #%d", 10-count); | ||
187 | mdelay(200); | ||
188 | goto retry; | ||
189 | } | ||
190 | printk(" ignoring it.\n"); | ||
191 | unlock = 0; | ||
192 | } | ||
193 | if (count != 10) | ||
194 | printk(" locked it.\n"); | ||
195 | |||
196 | do_each_thread(g, p) { | ||
197 | show_task_locks(p); | ||
198 | if (!unlock) | ||
199 | if (read_trylock(&tasklist_lock)) | ||
200 | unlock = 1; | ||
201 | } while_each_thread(g, p); | ||
202 | |||
203 | printk("\n"); | ||
204 | show_held_locks(NULL); | ||
205 | printk("=============================================\n\n"); | ||
206 | |||
207 | if (unlock) | ||
208 | read_unlock(&tasklist_lock); | ||
209 | } | ||
210 | |||
211 | static void report_deadlock(struct task_struct *task, struct mutex *lock, | ||
212 | struct mutex *lockblk, unsigned long ip) | ||
213 | { | ||
214 | printk("\n%s/%d is trying to acquire this lock:\n", | ||
215 | current->comm, current->pid); | ||
216 | printk_lock(lock, 1); | ||
217 | printk("... trying at: "); | ||
218 | print_symbol("%s\n", ip); | ||
219 | show_held_locks(current); | ||
220 | |||
221 | if (lockblk) { | ||
222 | printk("but %s/%d is deadlocking current task %s/%d!\n\n", | ||
223 | task->comm, task->pid, current->comm, current->pid); | ||
224 | printk("\n%s/%d is blocked on this lock:\n", | ||
225 | task->comm, task->pid); | ||
226 | printk_lock(lockblk, 1); | ||
227 | |||
228 | show_held_locks(task); | ||
229 | |||
230 | printk("\n%s/%d's [blocked] stackdump:\n\n", | ||
231 | task->comm, task->pid); | ||
232 | show_stack(task, NULL); | ||
233 | } | ||
234 | |||
235 | printk("\n%s/%d's [current] stackdump:\n\n", | ||
236 | current->comm, current->pid); | ||
237 | dump_stack(); | ||
238 | mutex_debug_show_all_locks(); | ||
239 | printk("[ turning off deadlock detection. Please report this. ]\n\n"); | ||
240 | local_irq_disable(); | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Recursively check for mutex deadlocks: | ||
245 | */ | ||
246 | static int check_deadlock(struct mutex *lock, int depth, | ||
247 | struct thread_info *ti, unsigned long ip) | ||
248 | { | ||
249 | struct mutex *lockblk; | ||
250 | struct task_struct *task; | ||
251 | |||
252 | if (!debug_mutex_on) | ||
253 | return 0; | ||
254 | |||
255 | ti = lock->owner; | ||
256 | if (!ti) | ||
257 | return 0; | ||
258 | |||
259 | task = ti->task; | ||
260 | lockblk = NULL; | ||
261 | if (task->blocked_on) | ||
262 | lockblk = task->blocked_on->lock; | ||
263 | |||
264 | /* Self-deadlock: */ | ||
265 | if (current == task) { | ||
266 | DEBUG_OFF(); | ||
267 | if (depth) | ||
268 | return 1; | ||
269 | printk("\n==========================================\n"); | ||
270 | printk( "[ BUG: lock recursion deadlock detected! |\n"); | ||
271 | printk( "------------------------------------------\n"); | ||
272 | report_deadlock(task, lock, NULL, ip); | ||
273 | return 0; | ||
274 | } | ||
275 | |||
276 | /* Ugh, something corrupted the lock data structure? */ | ||
277 | if (depth > 20) { | ||
278 | DEBUG_OFF(); | ||
279 | printk("\n===========================================\n"); | ||
280 | printk( "[ BUG: infinite lock dependency detected!? |\n"); | ||
281 | printk( "-------------------------------------------\n"); | ||
282 | report_deadlock(task, lock, lockblk, ip); | ||
283 | return 0; | ||
284 | } | ||
285 | |||
286 | /* Recursively check for dependencies: */ | ||
287 | if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) { | ||
288 | printk("\n============================================\n"); | ||
289 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | ||
290 | printk( "--------------------------------------------\n"); | ||
291 | report_deadlock(task, lock, lockblk, ip); | ||
292 | return 0; | ||
293 | } | ||
294 | return 0; | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * Called when a task exits, this function checks whether the | ||
299 | * task is holding any locks, and reports the first one if so: | ||
300 | */ | ||
301 | void mutex_debug_check_no_locks_held(struct task_struct *task) | ||
302 | { | ||
303 | struct list_head *curr, *next; | ||
304 | struct thread_info *t; | ||
305 | unsigned long flags; | ||
306 | struct mutex *lock; | ||
307 | |||
308 | if (!debug_mutex_on) | ||
309 | return; | ||
310 | |||
311 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
312 | list_for_each_safe(curr, next, &debug_mutex_held_locks) { | ||
313 | lock = list_entry(curr, struct mutex, held_list); | ||
314 | t = lock->owner; | ||
315 | if (t != task->thread_info) | ||
316 | continue; | ||
317 | list_del_init(curr); | ||
318 | DEBUG_OFF(); | ||
319 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
320 | |||
321 | printk("BUG: %s/%d, lock held at task exit time!\n", | ||
322 | task->comm, task->pid); | ||
323 | printk_lock(lock, 1); | ||
324 | if (lock->owner != task->thread_info) | ||
325 | printk("exiting task is not even the owner??\n"); | ||
326 | return; | ||
327 | } | ||
328 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
329 | } | ||
330 | |||
331 | /* | ||
332 | * Called when kernel memory is freed (or unmapped), or if a mutex | ||
333 | * is destroyed or reinitialized - this code checks whether there is | ||
334 | * any held lock in the memory range of <from> to <to>: | ||
335 | */ | ||
336 | void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | ||
337 | { | ||
338 | struct list_head *curr, *next; | ||
339 | const void *to = from + len; | ||
340 | unsigned long flags; | ||
341 | struct mutex *lock; | ||
342 | void *lock_addr; | ||
343 | |||
344 | if (!debug_mutex_on) | ||
345 | return; | ||
346 | |||
347 | debug_spin_lock_save(&debug_mutex_lock, flags); | ||
348 | list_for_each_safe(curr, next, &debug_mutex_held_locks) { | ||
349 | lock = list_entry(curr, struct mutex, held_list); | ||
350 | lock_addr = lock; | ||
351 | if (lock_addr < from || lock_addr >= to) | ||
352 | continue; | ||
353 | list_del_init(curr); | ||
354 | DEBUG_OFF(); | ||
355 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
356 | |||
357 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", | ||
358 | current->comm, current->pid, lock, from, to); | ||
359 | dump_stack(); | ||
360 | printk_lock(lock, 1); | ||
361 | if (lock->owner != current_thread_info()) | ||
362 | printk("freeing task is not even the owner??\n"); | ||
363 | return; | ||
364 | } | ||
365 | debug_spin_lock_restore(&debug_mutex_lock, flags); | ||
366 | } | ||
367 | |||
368 | /* | ||
369 | * Must be called with lock->wait_lock held. | ||
370 | */ | ||
371 | void debug_mutex_set_owner(struct mutex *lock, | ||
372 | struct thread_info *new_owner __IP_DECL__) | ||
373 | { | ||
374 | lock->owner = new_owner; | ||
375 | DEBUG_WARN_ON(!list_empty(&lock->held_list)); | ||
376 | if (debug_mutex_on) { | ||
377 | list_add_tail(&lock->held_list, &debug_mutex_held_locks); | ||
378 | lock->acquire_ip = ip; | ||
379 | } | ||
380 | } | ||
381 | |||
382 | void debug_mutex_init_waiter(struct mutex_waiter *waiter) | ||
383 | { | ||
384 | memset(waiter, 0x11, sizeof(*waiter)); | ||
385 | waiter->magic = waiter; | ||
386 | INIT_LIST_HEAD(&waiter->list); | ||
387 | } | ||
388 | |||
389 | void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) | ||
390 | { | ||
391 | SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); | ||
392 | DEBUG_WARN_ON(list_empty(&lock->wait_list)); | ||
393 | DEBUG_WARN_ON(waiter->magic != waiter); | ||
394 | DEBUG_WARN_ON(list_empty(&waiter->list)); | ||
395 | } | ||
396 | |||
397 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) | ||
398 | { | ||
399 | DEBUG_WARN_ON(!list_empty(&waiter->list)); | ||
400 | memset(waiter, 0x22, sizeof(*waiter)); | ||
401 | } | ||
402 | |||
403 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, | ||
404 | struct thread_info *ti __IP_DECL__) | ||
405 | { | ||
406 | SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); | ||
407 | check_deadlock(lock, 0, ti, ip); | ||
408 | /* Mark the current thread as blocked on the lock: */ | ||
409 | ti->task->blocked_on = waiter; | ||
410 | waiter->lock = lock; | ||
411 | } | ||
412 | |||
413 | void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | ||
414 | struct thread_info *ti) | ||
415 | { | ||
416 | DEBUG_WARN_ON(list_empty(&waiter->list)); | ||
417 | DEBUG_WARN_ON(waiter->task != ti->task); | ||
418 | DEBUG_WARN_ON(ti->task->blocked_on != waiter); | ||
419 | ti->task->blocked_on = NULL; | ||
420 | |||
421 | list_del_init(&waiter->list); | ||
422 | waiter->task = NULL; | ||
423 | } | ||
424 | |||
425 | void debug_mutex_unlock(struct mutex *lock) | ||
426 | { | ||
427 | DEBUG_WARN_ON(lock->magic != lock); | ||
428 | DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | ||
429 | DEBUG_WARN_ON(lock->owner != current_thread_info()); | ||
430 | if (debug_mutex_on) { | ||
431 | DEBUG_WARN_ON(list_empty(&lock->held_list)); | ||
432 | list_del_init(&lock->held_list); | ||
433 | } | ||
434 | } | ||
435 | |||
436 | void debug_mutex_init(struct mutex *lock, const char *name) | ||
437 | { | ||
438 | /* | ||
439 | * Make sure we are not reinitializing a held lock: | ||
440 | */ | ||
441 | mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | ||
442 | lock->owner = NULL; | ||
443 | INIT_LIST_HEAD(&lock->held_list); | ||
444 | lock->name = name; | ||
445 | lock->magic = lock; | ||
446 | } | ||
447 | |||
448 | /*** | ||
449 | * mutex_destroy - mark a mutex unusable | ||
450 | * @lock: the mutex to be destroyed | ||
451 | * | ||
452 | * This function marks the mutex uninitialized, and any subsequent | ||
453 | * use of the mutex is forbidden. The mutex must not be locked when | ||
454 | * this function is called. | ||
455 | */ | ||
456 | void fastcall mutex_destroy(struct mutex *lock) | ||
457 | { | ||
458 | DEBUG_WARN_ON(mutex_is_locked(lock)); | ||
459 | lock->magic = NULL; | ||
460 | } | ||
461 | |||
462 | EXPORT_SYMBOL_GPL(mutex_destroy); | ||
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h new file mode 100644 index 000000000000..fd384050acb1 --- /dev/null +++ b/kernel/mutex-debug.h | |||
@@ -0,0 +1,134 @@ | |||
1 | /* | ||
2 | * Mutexes: blocking mutual exclusion locks | ||
3 | * | ||
4 | * started by Ingo Molnar: | ||
5 | * | ||
6 | * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * This file contains mutex debugging related internal declarations, | ||
9 | * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. | ||
10 | * More details are in kernel/mutex-debug.c. | ||
11 | */ | ||
12 | |||
13 | extern spinlock_t debug_mutex_lock; | ||
14 | extern struct list_head debug_mutex_held_locks; | ||
15 | extern int debug_mutex_on; | ||
16 | |||
17 | /* | ||
18 | * In the debug case we carry the caller's instruction pointer into | ||
19 | * other functions, but we don't want the function argument overhead | ||
20 | * in the nondebug case - hence these macros: | ||
21 | */ | ||
22 | #define __IP_DECL__ , unsigned long ip | ||
23 | #define __IP__ , ip | ||
24 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
25 | |||
26 | /* | ||
27 | * This must be called with lock->wait_lock held. | ||
28 | */ | ||
29 | extern void debug_mutex_set_owner(struct mutex *lock, | ||
30 | struct thread_info *new_owner __IP_DECL__); | ||
31 | |||
32 | static inline void debug_mutex_clear_owner(struct mutex *lock) | ||
33 | { | ||
34 | lock->owner = NULL; | ||
35 | } | ||
36 | |||
37 | extern void debug_mutex_init_waiter(struct mutex_waiter *waiter); | ||
38 | extern void debug_mutex_wake_waiter(struct mutex *lock, | ||
39 | struct mutex_waiter *waiter); | ||
40 | extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); | ||
41 | extern void debug_mutex_add_waiter(struct mutex *lock, | ||
42 | struct mutex_waiter *waiter, | ||
43 | struct thread_info *ti __IP_DECL__); | ||
44 | extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | ||
45 | struct thread_info *ti); | ||
46 | extern void debug_mutex_unlock(struct mutex *lock); | ||
47 | extern void debug_mutex_init(struct mutex *lock, const char *name); | ||
48 | |||
49 | #define debug_spin_lock(lock) \ | ||
50 | do { \ | ||
51 | local_irq_disable(); \ | ||
52 | if (debug_mutex_on) \ | ||
53 | spin_lock(lock); \ | ||
54 | } while (0) | ||
55 | |||
56 | #define debug_spin_unlock(lock) \ | ||
57 | do { \ | ||
58 | if (debug_mutex_on) \ | ||
59 | spin_unlock(lock); \ | ||
60 | local_irq_enable(); \ | ||
61 | preempt_check_resched(); \ | ||
62 | } while (0) | ||
63 | |||
64 | #define debug_spin_lock_save(lock, flags) \ | ||
65 | do { \ | ||
66 | local_irq_save(flags); \ | ||
67 | if (debug_mutex_on) \ | ||
68 | spin_lock(lock); \ | ||
69 | } while (0) | ||
70 | |||
71 | #define debug_spin_lock_restore(lock, flags) \ | ||
72 | do { \ | ||
73 | if (debug_mutex_on) \ | ||
74 | spin_unlock(lock); \ | ||
75 | local_irq_restore(flags); \ | ||
76 | preempt_check_resched(); \ | ||
77 | } while (0) | ||
78 | |||
79 | #define spin_lock_mutex(lock) \ | ||
80 | do { \ | ||
81 | struct mutex *l = container_of(lock, struct mutex, wait_lock); \ | ||
82 | \ | ||
83 | DEBUG_WARN_ON(in_interrupt()); \ | ||
84 | debug_spin_lock(&debug_mutex_lock); \ | ||
85 | spin_lock(lock); \ | ||
86 | DEBUG_WARN_ON(l->magic != l); \ | ||
87 | } while (0) | ||
88 | |||
89 | #define spin_unlock_mutex(lock) \ | ||
90 | do { \ | ||
91 | spin_unlock(lock); \ | ||
92 | debug_spin_unlock(&debug_mutex_lock); \ | ||
93 | } while (0) | ||
94 | |||
95 | #define DEBUG_OFF() \ | ||
96 | do { \ | ||
97 | if (debug_mutex_on) { \ | ||
98 | debug_mutex_on = 0; \ | ||
99 | console_verbose(); \ | ||
100 | if (spin_is_locked(&debug_mutex_lock)) \ | ||
101 | spin_unlock(&debug_mutex_lock); \ | ||
102 | } \ | ||
103 | } while (0) | ||
104 | |||
105 | #define DEBUG_BUG() \ | ||
106 | do { \ | ||
107 | if (debug_mutex_on) { \ | ||
108 | DEBUG_OFF(); \ | ||
109 | BUG(); \ | ||
110 | } \ | ||
111 | } while (0) | ||
112 | |||
113 | #define DEBUG_WARN_ON(c) \ | ||
114 | do { \ | ||
115 | if (unlikely(c && debug_mutex_on)) { \ | ||
116 | DEBUG_OFF(); \ | ||
117 | WARN_ON(1); \ | ||
118 | } \ | ||
119 | } while (0) | ||
120 | |||
121 | # define DEBUG_BUG_ON(c) \ | ||
122 | do { \ | ||
123 | if (unlikely(c)) \ | ||
124 | DEBUG_BUG(); \ | ||
125 | } while (0) | ||
126 | |||
127 | #ifdef CONFIG_SMP | ||
128 | # define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c) | ||
129 | # define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c) | ||
130 | #else | ||
131 | # define SMP_DEBUG_WARN_ON(c) do { } while (0) | ||
132 | # define SMP_DEBUG_BUG_ON(c) do { } while (0) | ||
133 | #endif | ||
134 | |||
diff --git a/kernel/mutex.c b/kernel/mutex.c new file mode 100644 index 000000000000..5449b210d9ed --- /dev/null +++ b/kernel/mutex.c | |||
@@ -0,0 +1,315 @@ | |||
1 | /* | ||
2 | * kernel/mutex.c | ||
3 | * | ||
4 | * Mutexes: blocking mutual exclusion locks | ||
5 | * | ||
6 | * Started by Ingo Molnar: | ||
7 | * | ||
8 | * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
9 | * | ||
10 | * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and | ||
11 | * David Howells for suggestions and improvements. | ||
12 | * | ||
13 | * Also see Documentation/mutex-design.txt. | ||
14 | */ | ||
15 | #include <linux/mutex.h> | ||
16 | #include <linux/sched.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/spinlock.h> | ||
19 | #include <linux/interrupt.h> | ||
20 | |||
21 | /* | ||
22 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, | ||
23 | * which forces all calls into the slowpath: | ||
24 | */ | ||
25 | #ifdef CONFIG_DEBUG_MUTEXES | ||
26 | # include "mutex-debug.h" | ||
27 | # include <asm-generic/mutex-null.h> | ||
28 | #else | ||
29 | # include "mutex.h" | ||
30 | # include <asm/mutex.h> | ||
31 | #endif | ||
32 | |||
33 | /*** | ||
34 | * mutex_init - initialize the mutex | ||
35 | * @lock: the mutex to be initialized | ||
36 | * | ||
37 | * Initialize the mutex to unlocked state. | ||
38 | * | ||
39 | * It is not allowed to initialize an already locked mutex. | ||
40 | */ | ||
41 | void fastcall __mutex_init(struct mutex *lock, const char *name) | ||
42 | { | ||
43 | atomic_set(&lock->count, 1); | ||
44 | spin_lock_init(&lock->wait_lock); | ||
45 | INIT_LIST_HEAD(&lock->wait_list); | ||
46 | |||
47 | debug_mutex_init(lock, name); | ||
48 | } | ||
49 | |||
50 | EXPORT_SYMBOL(__mutex_init); | ||
51 | |||
52 | /* | ||
53 | * We split the mutex lock/unlock logic into separate fastpath and | ||
54 | * slowpath functions, to reduce the register pressure on the fastpath. | ||
55 | * We also put the fastpath first in the kernel image, to make sure the | ||
56 | * branch is predicted by the CPU as default-untaken. | ||
57 | */ | ||
58 | static void fastcall noinline __sched | ||
59 | __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); | ||
60 | |||
61 | /*** | ||
62 | * mutex_lock - acquire the mutex | ||
63 | * @lock: the mutex to be acquired | ||
64 | * | ||
65 | * Lock the mutex exclusively for this task. If the mutex is not | ||
66 | * available right now, it will sleep until it can get it. | ||
67 | * | ||
68 | * The mutex must later on be released by the same task that | ||
69 | * acquired it. Recursive locking is not allowed. The task | ||
70 | * may not exit without first unlocking the mutex. Also, kernel | ||
71 | * memory where the mutex resides must not be freed with | ||
72 | * the mutex still locked. The mutex must first be initialized | ||
73 | * (or statically defined) before it can be locked. memset()-ing | ||
74 | * the mutex to 0 is not allowed. | ||
75 | * | ||
76 | * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging | ||
77 | * checks that will enforce the restrictions and will also do | ||
78 | * deadlock debugging. ) | ||
79 | * | ||
80 | * This function is similar to (but not equivalent to) down(). | ||
81 | */ | ||
82 | void fastcall __sched mutex_lock(struct mutex *lock) | ||
83 | { | ||
84 | might_sleep(); | ||
85 | /* | ||
86 | * The locking fastpath is the 1->0 transition from | ||
87 | * 'unlocked' into 'locked' state. | ||
88 | */ | ||
89 | __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); | ||
90 | } | ||
91 | |||
92 | EXPORT_SYMBOL(mutex_lock); | ||
93 | |||
94 | static void fastcall noinline __sched | ||
95 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__); | ||
96 | |||
97 | /*** | ||
98 | * mutex_unlock - release the mutex | ||
99 | * @lock: the mutex to be released | ||
100 | * | ||
101 | * Unlock a mutex that has been locked by this task previously. | ||
102 | * | ||
103 | * This function must not be used in interrupt context. Unlocking | ||
104 | * of a not locked mutex is not allowed. | ||
105 | * | ||
106 | * This function is similar to (but not equivalent to) up(). | ||
107 | */ | ||
108 | void fastcall __sched mutex_unlock(struct mutex *lock) | ||
109 | { | ||
110 | /* | ||
111 | * The unlocking fastpath is the 0->1 transition from 'locked' | ||
112 | * into 'unlocked' state: | ||
113 | */ | ||
114 | __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); | ||
115 | } | ||
116 | |||
117 | EXPORT_SYMBOL(mutex_unlock); | ||
118 | |||
119 | /* | ||
120 | * Lock a mutex (possibly interruptible), slowpath: | ||
121 | */ | ||
122 | static inline int __sched | ||
123 | __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | ||
124 | { | ||
125 | struct task_struct *task = current; | ||
126 | struct mutex_waiter waiter; | ||
127 | unsigned int old_val; | ||
128 | |||
129 | debug_mutex_init_waiter(&waiter); | ||
130 | |||
131 | spin_lock_mutex(&lock->wait_lock); | ||
132 | |||
133 | debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); | ||
134 | |||
135 | /* add waiting tasks to the end of the waitqueue (FIFO): */ | ||
136 | list_add_tail(&waiter.list, &lock->wait_list); | ||
137 | waiter.task = task; | ||
138 | |||
139 | for (;;) { | ||
140 | /* | ||
141 | * Let's try to take the lock again - this is needed even if | ||
142 | * we get here for the first time (shortly after failing to | ||
143 | * acquire the lock), to make sure that we get a wakeup once | ||
144 | * it's unlocked. Later on, if we sleep, this is the | ||
145 | * operation that gives us the lock. We xchg it to -1, so | ||
146 | * that when we release the lock, we properly wake up the | ||
147 | * other waiters: | ||
148 | */ | ||
149 | old_val = atomic_xchg(&lock->count, -1); | ||
150 | if (old_val == 1) | ||
151 | break; | ||
152 | |||
153 | /* | ||
154 | * got a signal? (This code gets eliminated in the | ||
155 | * TASK_UNINTERRUPTIBLE case.) | ||
156 | */ | ||
157 | if (unlikely(state == TASK_INTERRUPTIBLE && | ||
158 | signal_pending(task))) { | ||
159 | mutex_remove_waiter(lock, &waiter, task->thread_info); | ||
160 | spin_unlock_mutex(&lock->wait_lock); | ||
161 | |||
162 | debug_mutex_free_waiter(&waiter); | ||
163 | return -EINTR; | ||
164 | } | ||
165 | __set_task_state(task, state); | ||
166 | |||
167 | /* didn't get the lock, go to sleep: */ | ||
168 | spin_unlock_mutex(&lock->wait_lock); | ||
169 | schedule(); | ||
170 | spin_lock_mutex(&lock->wait_lock); | ||
171 | } | ||
172 | |||
173 | /* got the lock - rejoice! */ | ||
174 | mutex_remove_waiter(lock, &waiter, task->thread_info); | ||
175 | debug_mutex_set_owner(lock, task->thread_info __IP__); | ||
176 | |||
177 | /* set it to 0 if there are no waiters left: */ | ||
178 | if (likely(list_empty(&lock->wait_list))) | ||
179 | atomic_set(&lock->count, 0); | ||
180 | |||
181 | spin_unlock_mutex(&lock->wait_lock); | ||
182 | |||
183 | debug_mutex_free_waiter(&waiter); | ||
184 | |||
185 | DEBUG_WARN_ON(list_empty(&lock->held_list)); | ||
186 | DEBUG_WARN_ON(lock->owner != task->thread_info); | ||
187 | |||
188 | return 0; | ||
189 | } | ||
190 | |||
191 | static void fastcall noinline __sched | ||
192 | __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__) | ||
193 | { | ||
194 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
195 | |||
196 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__); | ||
197 | } | ||
198 | |||
199 | /* | ||
200 | * Release the lock, slowpath: | ||
201 | */ | ||
202 | static fastcall noinline void | ||
203 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | ||
204 | { | ||
205 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
206 | |||
207 | DEBUG_WARN_ON(lock->owner != current_thread_info()); | ||
208 | |||
209 | spin_lock_mutex(&lock->wait_lock); | ||
210 | |||
211 | /* | ||
212 | * Some architectures leave the lock unlocked in the fastpath failure | ||
213 | * case, others need to leave it locked. In the latter case we have to | ||
214 | * unlock it here. | ||
215 | */ | ||
216 | if (__mutex_slowpath_needs_to_unlock()) | ||
217 | atomic_set(&lock->count, 1); | ||
218 | |||
219 | debug_mutex_unlock(lock); | ||
220 | |||
221 | if (!list_empty(&lock->wait_list)) { | ||
222 | /* get the first entry from the wait-list: */ | ||
223 | struct mutex_waiter *waiter = | ||
224 | list_entry(lock->wait_list.next, | ||
225 | struct mutex_waiter, list); | ||
226 | |||
227 | debug_mutex_wake_waiter(lock, waiter); | ||
228 | |||
229 | wake_up_process(waiter->task); | ||
230 | } | ||
231 | |||
232 | debug_mutex_clear_owner(lock); | ||
233 | |||
234 | spin_unlock_mutex(&lock->wait_lock); | ||
235 | } | ||
236 | |||
237 | /* | ||
238 | * Here come the less common (and hence less performance-critical) APIs: | ||
239 | * mutex_lock_interruptible() and mutex_trylock(). | ||
240 | */ | ||
241 | static int fastcall noinline __sched | ||
242 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__); | ||
243 | |||
244 | /*** | ||
245 | * mutex_lock_interruptible - acquire the mutex, interruptible | ||
246 | * @lock: the mutex to be acquired | ||
247 | * | ||
248 | * Lock the mutex like mutex_lock(): sleep until the mutex becomes | ||
249 | * available and return 0 once it has been acquired. If a | ||
250 | * signal arrives while waiting for the lock then this function | ||
251 | * returns -EINTR. | ||
252 | * | ||
253 | * This function is similar to (but not equivalent to) down_interruptible(). | ||
254 | */ | ||
255 | int fastcall __sched mutex_lock_interruptible(struct mutex *lock) | ||
256 | { | ||
257 | might_sleep(); | ||
258 | return __mutex_fastpath_lock_retval | ||
259 | (&lock->count, __mutex_lock_interruptible_slowpath); | ||
260 | } | ||
261 | |||
262 | EXPORT_SYMBOL(mutex_lock_interruptible); | ||
263 | |||
264 | static int fastcall noinline __sched | ||
265 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) | ||
266 | { | ||
267 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
268 | |||
269 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__); | ||
270 | } | ||
271 | |||
272 | /* | ||
273 | * Spinlock based trylock, we take the spinlock and check whether we | ||
274 | * can get the lock: | ||
275 | */ | ||
276 | static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | ||
277 | { | ||
278 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
279 | int prev; | ||
280 | |||
281 | spin_lock_mutex(&lock->wait_lock); | ||
282 | |||
283 | prev = atomic_xchg(&lock->count, -1); | ||
284 | if (likely(prev == 1)) | ||
285 | debug_mutex_set_owner(lock, current_thread_info() __RET_IP__); | ||
286 | /* Set it back to 0 if there are no waiters: */ | ||
287 | if (likely(list_empty(&lock->wait_list))) | ||
288 | atomic_set(&lock->count, 0); | ||
289 | |||
290 | spin_unlock_mutex(&lock->wait_lock); | ||
291 | |||
292 | return prev == 1; | ||
293 | } | ||
294 | |||
295 | /*** | ||
296 | * mutex_trylock - try to acquire the mutex, without waiting | ||
297 | * @lock: the mutex to be acquired | ||
298 | * | ||
299 | * Try to acquire the mutex atomically. Returns 1 if the mutex | ||
300 | * has been acquired successfully, and 0 on contention. | ||
301 | * | ||
302 | * NOTE: this function follows the spin_trylock() convention, so | ||
303 | * its return value is the inverse of down_trylock()'s! Be careful | ||
304 | * about this when converting semaphore users to mutexes. | ||
305 | * | ||
306 | * This function must not be used in interrupt context. The | ||
307 | * mutex must be released by the same task that acquired it. | ||
308 | */ | ||
309 | int fastcall mutex_trylock(struct mutex *lock) | ||
310 | { | ||
311 | return __mutex_fastpath_trylock(&lock->count, | ||
312 | __mutex_trylock_slowpath); | ||
313 | } | ||
314 | |||
315 | EXPORT_SYMBOL(mutex_trylock); | ||
diff --git a/kernel/mutex.h b/kernel/mutex.h new file mode 100644 index 000000000000..00fe84e7b672 --- /dev/null +++ b/kernel/mutex.h | |||
@@ -0,0 +1,35 @@ | |||
1 | /* | ||
2 | * Mutexes: blocking mutual exclusion locks | ||
3 | * | ||
4 | * started by Ingo Molnar: | ||
5 | * | ||
6 | * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * | ||
8 | * This file contains mutex debugging related internal prototypes, for the | ||
9 | * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: | ||
10 | */ | ||
11 | |||
12 | #define spin_lock_mutex(lock) spin_lock(lock) | ||
13 | #define spin_unlock_mutex(lock) spin_unlock(lock) | ||
14 | #define mutex_remove_waiter(lock, waiter, ti) \ | ||
15 | __list_del((waiter)->list.prev, (waiter)->list.next) | ||
16 | |||
17 | #define DEBUG_WARN_ON(c) do { } while (0) | ||
18 | #define debug_mutex_set_owner(lock, new_owner) do { } while (0) | ||
19 | #define debug_mutex_clear_owner(lock) do { } while (0) | ||
20 | #define debug_mutex_init_waiter(waiter) do { } while (0) | ||
21 | #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) | ||
22 | #define debug_mutex_free_waiter(waiter) do { } while (0) | ||
23 | #define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0) | ||
24 | #define debug_mutex_unlock(lock) do { } while (0) | ||
25 | #define debug_mutex_init(lock, name) do { } while (0) | ||
26 | |||
27 | /* | ||
28 | * Return-address parameters/declarations. They are very useful for | ||
29 | * debugging, but add overhead in the !DEBUG case - so we go to the | ||
30 | * trouble of using this not-too-elegant but zero-cost solution: | ||
31 | */ | ||
32 | #define __IP_DECL__ | ||
33 | #define __IP__ | ||
34 | #define __RET_IP__ | ||
35 | |||
diff --git a/kernel/panic.c b/kernel/panic.c index aabc5f86fa3f..c5c4ab255834 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
60 | long i; | 60 | long i; |
61 | static char buf[1024]; | 61 | static char buf[1024]; |
62 | va_list args; | 62 | va_list args; |
63 | #if defined(CONFIG_ARCH_S390) | 63 | #if defined(CONFIG_S390) |
64 | unsigned long caller = (unsigned long) __builtin_return_address(0); | 64 | unsigned long caller = (unsigned long) __builtin_return_address(0); |
65 | #endif | 65 | #endif |
66 | 66 | ||
@@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
125 | printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); | 125 | printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); |
126 | } | 126 | } |
127 | #endif | 127 | #endif |
128 | #if defined(CONFIG_ARCH_S390) | 128 | #if defined(CONFIG_S390) |
129 | disabled_wait(caller); | 129 | disabled_wait(caller); |
130 | #endif | 130 | #endif |
131 | local_irq_enable(); | 131 | local_irq_enable(); |
diff --git a/kernel/pid.c b/kernel/pid.c index edba31c681ac..1acc07246991 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr) | |||
136 | struct hlist_node *elem; | 136 | struct hlist_node *elem; |
137 | struct pid *pid; | 137 | struct pid *pid; |
138 | 138 | ||
139 | hlist_for_each_entry(pid, elem, | 139 | hlist_for_each_entry_rcu(pid, elem, |
140 | &pid_hash[type][pid_hashfn(nr)], pid_chain) { | 140 | &pid_hash[type][pid_hashfn(nr)], pid_chain) { |
141 | if (pid->nr == nr) | 141 | if (pid->nr == nr) |
142 | return pid; | 142 | return pid; |
@@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) | |||
150 | 150 | ||
151 | task_pid = &task->pids[type]; | 151 | task_pid = &task->pids[type]; |
152 | pid = find_pid(type, nr); | 152 | pid = find_pid(type, nr); |
153 | task_pid->nr = nr; | ||
153 | if (pid == NULL) { | 154 | if (pid == NULL) { |
154 | hlist_add_head(&task_pid->pid_chain, | ||
155 | &pid_hash[type][pid_hashfn(nr)]); | ||
156 | INIT_LIST_HEAD(&task_pid->pid_list); | 155 | INIT_LIST_HEAD(&task_pid->pid_list); |
156 | hlist_add_head_rcu(&task_pid->pid_chain, | ||
157 | &pid_hash[type][pid_hashfn(nr)]); | ||
157 | } else { | 158 | } else { |
158 | INIT_HLIST_NODE(&task_pid->pid_chain); | 159 | INIT_HLIST_NODE(&task_pid->pid_chain); |
159 | list_add_tail(&task_pid->pid_list, &pid->pid_list); | 160 | list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list); |
160 | } | 161 | } |
161 | task_pid->nr = nr; | ||
162 | 162 | ||
163 | return 0; | 163 | return 0; |
164 | } | 164 | } |
@@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type) | |||
170 | 170 | ||
171 | pid = &task->pids[type]; | 171 | pid = &task->pids[type]; |
172 | if (!hlist_unhashed(&pid->pid_chain)) { | 172 | if (!hlist_unhashed(&pid->pid_chain)) { |
173 | hlist_del(&pid->pid_chain); | ||
174 | 173 | ||
175 | if (list_empty(&pid->pid_list)) | 174 | if (list_empty(&pid->pid_list)) { |
176 | nr = pid->nr; | 175 | nr = pid->nr; |
177 | else { | 176 | hlist_del_rcu(&pid->pid_chain); |
177 | } else { | ||
178 | pid_next = list_entry(pid->pid_list.next, | 178 | pid_next = list_entry(pid->pid_list.next, |
179 | struct pid, pid_list); | 179 | struct pid, pid_list); |
180 | /* insert next pid from pid_list to hash */ | 180 | /* insert next pid from pid_list to hash */ |
181 | hlist_add_head(&pid_next->pid_chain, | 181 | hlist_replace_rcu(&pid->pid_chain, |
182 | &pid_hash[type][pid_hashfn(pid_next->nr)]); | 182 | &pid_next->pid_chain); |
183 | } | 183 | } |
184 | } | 184 | } |
185 | 185 | ||
186 | list_del(&pid->pid_list); | 186 | list_del_rcu(&pid->pid_list); |
187 | pid->nr = 0; | 187 | pid->nr = 0; |
188 | 188 | ||
189 | return nr; | 189 | return nr; |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index cae4f5728997..520f6c59948d 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -7,7 +7,7 @@ | |||
7 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
8 | #include <linux/errno.h> | 8 | #include <linux/errno.h> |
9 | 9 | ||
10 | static int check_clock(clockid_t which_clock) | 10 | static int check_clock(const clockid_t which_clock) |
11 | { | 11 | { |
12 | int error = 0; | 12 | int error = 0; |
13 | struct task_struct *p; | 13 | struct task_struct *p; |
@@ -31,7 +31,7 @@ static int check_clock(clockid_t which_clock) | |||
31 | } | 31 | } |
32 | 32 | ||
33 | static inline union cpu_time_count | 33 | static inline union cpu_time_count |
34 | timespec_to_sample(clockid_t which_clock, const struct timespec *tp) | 34 | timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) |
35 | { | 35 | { |
36 | union cpu_time_count ret; | 36 | union cpu_time_count ret; |
37 | ret.sched = 0; /* high half always zero when .cpu used */ | 37 | ret.sched = 0; /* high half always zero when .cpu used */ |
@@ -43,7 +43,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp) | |||
43 | return ret; | 43 | return ret; |
44 | } | 44 | } |
45 | 45 | ||
46 | static void sample_to_timespec(clockid_t which_clock, | 46 | static void sample_to_timespec(const clockid_t which_clock, |
47 | union cpu_time_count cpu, | 47 | union cpu_time_count cpu, |
48 | struct timespec *tp) | 48 | struct timespec *tp) |
49 | { | 49 | { |
@@ -55,7 +55,7 @@ static void sample_to_timespec(clockid_t which_clock, | |||
55 | } | 55 | } |
56 | } | 56 | } |
57 | 57 | ||
58 | static inline int cpu_time_before(clockid_t which_clock, | 58 | static inline int cpu_time_before(const clockid_t which_clock, |
59 | union cpu_time_count now, | 59 | union cpu_time_count now, |
60 | union cpu_time_count then) | 60 | union cpu_time_count then) |
61 | { | 61 | { |
@@ -65,7 +65,7 @@ static inline int cpu_time_before(clockid_t which_clock, | |||
65 | return cputime_lt(now.cpu, then.cpu); | 65 | return cputime_lt(now.cpu, then.cpu); |
66 | } | 66 | } |
67 | } | 67 | } |
68 | static inline void cpu_time_add(clockid_t which_clock, | 68 | static inline void cpu_time_add(const clockid_t which_clock, |
69 | union cpu_time_count *acc, | 69 | union cpu_time_count *acc, |
70 | union cpu_time_count val) | 70 | union cpu_time_count val) |
71 | { | 71 | { |
@@ -75,7 +75,7 @@ static inline void cpu_time_add(clockid_t which_clock, | |||
75 | acc->cpu = cputime_add(acc->cpu, val.cpu); | 75 | acc->cpu = cputime_add(acc->cpu, val.cpu); |
76 | } | 76 | } |
77 | } | 77 | } |
78 | static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, | 78 | static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, |
79 | union cpu_time_count a, | 79 | union cpu_time_count a, |
80 | union cpu_time_count b) | 80 | union cpu_time_count b) |
81 | { | 81 | { |
@@ -151,7 +151,7 @@ static inline unsigned long long sched_ns(struct task_struct *p) | |||
151 | return (p == current) ? current_sched_time(p) : p->sched_time; | 151 | return (p == current) ? current_sched_time(p) : p->sched_time; |
152 | } | 152 | } |
153 | 153 | ||
154 | int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | 154 | int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) |
155 | { | 155 | { |
156 | int error = check_clock(which_clock); | 156 | int error = check_clock(which_clock); |
157 | if (!error) { | 157 | if (!error) { |
@@ -169,7 +169,7 @@ int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | |||
169 | return error; | 169 | return error; |
170 | } | 170 | } |
171 | 171 | ||
172 | int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) | 172 | int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) |
173 | { | 173 | { |
174 | /* | 174 | /* |
175 | * You can never reset a CPU clock, but we check for other errors | 175 | * You can never reset a CPU clock, but we check for other errors |
@@ -186,7 +186,7 @@ int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp) | |||
186 | /* | 186 | /* |
187 | * Sample a per-thread clock for the given task. | 187 | * Sample a per-thread clock for the given task. |
188 | */ | 188 | */ |
189 | static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p, | 189 | static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, |
190 | union cpu_time_count *cpu) | 190 | union cpu_time_count *cpu) |
191 | { | 191 | { |
192 | switch (CPUCLOCK_WHICH(which_clock)) { | 192 | switch (CPUCLOCK_WHICH(which_clock)) { |
@@ -238,18 +238,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, | |||
238 | while ((t = next_thread(t)) != p) { | 238 | while ((t = next_thread(t)) != p) { |
239 | cpu->sched += t->sched_time; | 239 | cpu->sched += t->sched_time; |
240 | } | 240 | } |
241 | if (p->tgid == current->tgid) { | 241 | cpu->sched += sched_ns(p); |
242 | /* | ||
243 | * We're sampling ourselves, so include the | ||
244 | * cycles not yet banked. We still omit | ||
245 | * other threads running on other CPUs, | ||
246 | * so the total can always be behind as | ||
247 | * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). | ||
248 | */ | ||
249 | cpu->sched += current_sched_time(current); | ||
250 | } else { | ||
251 | cpu->sched += p->sched_time; | ||
252 | } | ||
253 | break; | 242 | break; |
254 | } | 243 | } |
255 | return 0; | 244 | return 0; |
@@ -259,7 +248,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, | |||
259 | * Sample a process (thread group) clock for the given group_leader task. | 248 | * Sample a process (thread group) clock for the given group_leader task. |
260 | * Must be called with tasklist_lock held for reading. | 249 | * Must be called with tasklist_lock held for reading. |
261 | */ | 250 | */ |
262 | static int cpu_clock_sample_group(clockid_t which_clock, | 251 | static int cpu_clock_sample_group(const clockid_t which_clock, |
263 | struct task_struct *p, | 252 | struct task_struct *p, |
264 | union cpu_time_count *cpu) | 253 | union cpu_time_count *cpu) |
265 | { | 254 | { |
@@ -273,7 +262,7 @@ static int cpu_clock_sample_group(clockid_t which_clock, | |||
273 | } | 262 | } |
274 | 263 | ||
275 | 264 | ||
276 | int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp) | 265 | int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) |
277 | { | 266 | { |
278 | const pid_t pid = CPUCLOCK_PID(which_clock); | 267 | const pid_t pid = CPUCLOCK_PID(which_clock); |
279 | int error = -EINVAL; | 268 | int error = -EINVAL; |
@@ -1410,8 +1399,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1410 | 1399 | ||
1411 | static long posix_cpu_clock_nanosleep_restart(struct restart_block *); | 1400 | static long posix_cpu_clock_nanosleep_restart(struct restart_block *); |
1412 | 1401 | ||
1413 | int posix_cpu_nsleep(clockid_t which_clock, int flags, | 1402 | int posix_cpu_nsleep(const clockid_t which_clock, int flags, |
1414 | struct timespec *rqtp) | 1403 | struct timespec *rqtp, struct timespec __user *rmtp) |
1415 | { | 1404 | { |
1416 | struct restart_block *restart_block = | 1405 | struct restart_block *restart_block = |
1417 | &current_thread_info()->restart_block; | 1406 | &current_thread_info()->restart_block; |
@@ -1436,7 +1425,6 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags, | |||
1436 | error = posix_cpu_timer_create(&timer); | 1425 | error = posix_cpu_timer_create(&timer); |
1437 | timer.it_process = current; | 1426 | timer.it_process = current; |
1438 | if (!error) { | 1427 | if (!error) { |
1439 | struct timespec __user *rmtp; | ||
1440 | static struct itimerspec zero_it; | 1428 | static struct itimerspec zero_it; |
1441 | struct itimerspec it = { .it_value = *rqtp, | 1429 | struct itimerspec it = { .it_value = *rqtp, |
1442 | .it_interval = {} }; | 1430 | .it_interval = {} }; |
@@ -1483,7 +1471,6 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags, | |||
1483 | /* | 1471 | /* |
1484 | * Report back to the user the time still remaining. | 1472 | * Report back to the user the time still remaining. |
1485 | */ | 1473 | */ |
1486 | rmtp = (struct timespec __user *) restart_block->arg1; | ||
1487 | if (rmtp != NULL && !(flags & TIMER_ABSTIME) && | 1474 | if (rmtp != NULL && !(flags & TIMER_ABSTIME) && |
1488 | copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) | 1475 | copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) |
1489 | return -EFAULT; | 1476 | return -EFAULT; |
@@ -1491,6 +1478,7 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags, | |||
1491 | restart_block->fn = posix_cpu_clock_nanosleep_restart; | 1478 | restart_block->fn = posix_cpu_clock_nanosleep_restart; |
1492 | /* Caller already set restart_block->arg1 */ | 1479 | /* Caller already set restart_block->arg1 */ |
1493 | restart_block->arg0 = which_clock; | 1480 | restart_block->arg0 = which_clock; |
1481 | restart_block->arg1 = (unsigned long) rmtp; | ||
1494 | restart_block->arg2 = rqtp->tv_sec; | 1482 | restart_block->arg2 = rqtp->tv_sec; |
1495 | restart_block->arg3 = rqtp->tv_nsec; | 1483 | restart_block->arg3 = rqtp->tv_nsec; |
1496 | 1484 | ||
@@ -1504,21 +1492,28 @@ static long | |||
1504 | posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) | 1492 | posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block) |
1505 | { | 1493 | { |
1506 | clockid_t which_clock = restart_block->arg0; | 1494 | clockid_t which_clock = restart_block->arg0; |
1507 | struct timespec t = { .tv_sec = restart_block->arg2, | 1495 | struct timespec __user *rmtp; |
1508 | .tv_nsec = restart_block->arg3 }; | 1496 | struct timespec t; |
1497 | |||
1498 | rmtp = (struct timespec __user *) restart_block->arg1; | ||
1499 | t.tv_sec = restart_block->arg2; | ||
1500 | t.tv_nsec = restart_block->arg3; | ||
1501 | |||
1509 | restart_block->fn = do_no_restart_syscall; | 1502 | restart_block->fn = do_no_restart_syscall; |
1510 | return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t); | 1503 | return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); |
1511 | } | 1504 | } |
1512 | 1505 | ||
1513 | 1506 | ||
1514 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) | 1507 | #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) |
1515 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) | 1508 | #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) |
1516 | 1509 | ||
1517 | static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | 1510 | static int process_cpu_clock_getres(const clockid_t which_clock, |
1511 | struct timespec *tp) | ||
1518 | { | 1512 | { |
1519 | return posix_cpu_clock_getres(PROCESS_CLOCK, tp); | 1513 | return posix_cpu_clock_getres(PROCESS_CLOCK, tp); |
1520 | } | 1514 | } |
1521 | static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp) | 1515 | static int process_cpu_clock_get(const clockid_t which_clock, |
1516 | struct timespec *tp) | ||
1522 | { | 1517 | { |
1523 | return posix_cpu_clock_get(PROCESS_CLOCK, tp); | 1518 | return posix_cpu_clock_get(PROCESS_CLOCK, tp); |
1524 | } | 1519 | } |
@@ -1527,16 +1522,19 @@ static int process_cpu_timer_create(struct k_itimer *timer) | |||
1527 | timer->it_clock = PROCESS_CLOCK; | 1522 | timer->it_clock = PROCESS_CLOCK; |
1528 | return posix_cpu_timer_create(timer); | 1523 | return posix_cpu_timer_create(timer); |
1529 | } | 1524 | } |
1530 | static int process_cpu_nsleep(clockid_t which_clock, int flags, | 1525 | static int process_cpu_nsleep(const clockid_t which_clock, int flags, |
1531 | struct timespec *rqtp) | 1526 | struct timespec *rqtp, |
1527 | struct timespec __user *rmtp) | ||
1532 | { | 1528 | { |
1533 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp); | 1529 | return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); |
1534 | } | 1530 | } |
1535 | static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp) | 1531 | static int thread_cpu_clock_getres(const clockid_t which_clock, |
1532 | struct timespec *tp) | ||
1536 | { | 1533 | { |
1537 | return posix_cpu_clock_getres(THREAD_CLOCK, tp); | 1534 | return posix_cpu_clock_getres(THREAD_CLOCK, tp); |
1538 | } | 1535 | } |
1539 | static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp) | 1536 | static int thread_cpu_clock_get(const clockid_t which_clock, |
1537 | struct timespec *tp) | ||
1540 | { | 1538 | { |
1541 | return posix_cpu_clock_get(THREAD_CLOCK, tp); | 1539 | return posix_cpu_clock_get(THREAD_CLOCK, tp); |
1542 | } | 1540 | } |
@@ -1545,8 +1543,8 @@ static int thread_cpu_timer_create(struct k_itimer *timer) | |||
1545 | timer->it_clock = THREAD_CLOCK; | 1543 | timer->it_clock = THREAD_CLOCK; |
1546 | return posix_cpu_timer_create(timer); | 1544 | return posix_cpu_timer_create(timer); |
1547 | } | 1545 | } |
1548 | static int thread_cpu_nsleep(clockid_t which_clock, int flags, | 1546 | static int thread_cpu_nsleep(const clockid_t which_clock, int flags, |
1549 | struct timespec *rqtp) | 1547 | struct timespec *rqtp, struct timespec __user *rmtp) |
1550 | { | 1548 | { |
1551 | return -EINVAL; | 1549 | return -EINVAL; |
1552 | } | 1550 | } |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5870efb3e200..197208b3aa2a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -48,21 +48,6 @@ | |||
48 | #include <linux/workqueue.h> | 48 | #include <linux/workqueue.h> |
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | 50 | ||
51 | #ifndef div_long_long_rem | ||
52 | #include <asm/div64.h> | ||
53 | |||
54 | #define div_long_long_rem(dividend,divisor,remainder) ({ \ | ||
55 | u64 result = dividend; \ | ||
56 | *remainder = do_div(result,divisor); \ | ||
57 | result; }) | ||
58 | |||
59 | #endif | ||
60 | #define CLOCK_REALTIME_RES TICK_NSEC /* In nano seconds. */ | ||
61 | |||
62 | static inline u64 mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2) | ||
63 | { | ||
64 | return (u64)mpy1 * mpy2; | ||
65 | } | ||
66 | /* | 51 | /* |
67 | * Management arrays for POSIX timers. Timers are kept in slab memory | 52 | * Management arrays for POSIX timers. Timers are kept in slab memory |
68 | * Timer ids are allocated by an external routine that keeps track of the | 53 | * Timer ids are allocated by an external routine that keeps track of the |
@@ -148,18 +133,18 @@ static DEFINE_SPINLOCK(idr_lock); | |||
148 | */ | 133 | */ |
149 | 134 | ||
150 | static struct k_clock posix_clocks[MAX_CLOCKS]; | 135 | static struct k_clock posix_clocks[MAX_CLOCKS]; |
136 | |||
151 | /* | 137 | /* |
152 | * We only have one real clock that can be set so we need only one abs list, | 138 | * These ones are defined below. |
153 | * even if we should want to have several clocks with differing resolutions. | ||
154 | */ | 139 | */ |
155 | static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list), | 140 | static int common_nsleep(const clockid_t, int flags, struct timespec *t, |
156 | .lock = SPIN_LOCK_UNLOCKED}; | 141 | struct timespec __user *rmtp); |
142 | static void common_timer_get(struct k_itimer *, struct itimerspec *); | ||
143 | static int common_timer_set(struct k_itimer *, int, | ||
144 | struct itimerspec *, struct itimerspec *); | ||
145 | static int common_timer_del(struct k_itimer *timer); | ||
157 | 146 | ||
158 | static void posix_timer_fn(unsigned long); | 147 | static int posix_timer_fn(void *data); |
159 | static u64 do_posix_clock_monotonic_gettime_parts( | ||
160 | struct timespec *tp, struct timespec *mo); | ||
161 | int do_posix_clock_monotonic_gettime(struct timespec *tp); | ||
162 | static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp); | ||
163 | 148 | ||
164 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); | 149 | static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); |
165 | 150 | ||
@@ -184,7 +169,7 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) | |||
184 | * the function pointer CALL in struct k_clock. | 169 | * the function pointer CALL in struct k_clock. |
185 | */ | 170 | */ |
186 | 171 | ||
187 | static inline int common_clock_getres(clockid_t which_clock, | 172 | static inline int common_clock_getres(const clockid_t which_clock, |
188 | struct timespec *tp) | 173 | struct timespec *tp) |
189 | { | 174 | { |
190 | tp->tv_sec = 0; | 175 | tp->tv_sec = 0; |
@@ -192,39 +177,33 @@ static inline int common_clock_getres(clockid_t which_clock, | |||
192 | return 0; | 177 | return 0; |
193 | } | 178 | } |
194 | 179 | ||
195 | static inline int common_clock_get(clockid_t which_clock, struct timespec *tp) | 180 | /* |
181 | * Get real time for posix timers | ||
182 | */ | ||
183 | static int common_clock_get(clockid_t which_clock, struct timespec *tp) | ||
196 | { | 184 | { |
197 | getnstimeofday(tp); | 185 | ktime_get_real_ts(tp); |
198 | return 0; | 186 | return 0; |
199 | } | 187 | } |
200 | 188 | ||
201 | static inline int common_clock_set(clockid_t which_clock, struct timespec *tp) | 189 | static inline int common_clock_set(const clockid_t which_clock, |
190 | struct timespec *tp) | ||
202 | { | 191 | { |
203 | return do_sys_settimeofday(tp, NULL); | 192 | return do_sys_settimeofday(tp, NULL); |
204 | } | 193 | } |
205 | 194 | ||
206 | static inline int common_timer_create(struct k_itimer *new_timer) | 195 | static int common_timer_create(struct k_itimer *new_timer) |
207 | { | 196 | { |
208 | INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry); | 197 | hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock); |
209 | init_timer(&new_timer->it.real.timer); | 198 | new_timer->it.real.timer.data = new_timer; |
210 | new_timer->it.real.timer.data = (unsigned long) new_timer; | ||
211 | new_timer->it.real.timer.function = posix_timer_fn; | 199 | new_timer->it.real.timer.function = posix_timer_fn; |
212 | return 0; | 200 | return 0; |
213 | } | 201 | } |
214 | 202 | ||
215 | /* | 203 | /* |
216 | * These ones are defined below. | 204 | * Return nonzero if we know a priori this clockid_t value is bogus. |
217 | */ | ||
218 | static int common_nsleep(clockid_t, int flags, struct timespec *t); | ||
219 | static void common_timer_get(struct k_itimer *, struct itimerspec *); | ||
220 | static int common_timer_set(struct k_itimer *, int, | ||
221 | struct itimerspec *, struct itimerspec *); | ||
222 | static int common_timer_del(struct k_itimer *timer); | ||
223 | |||
224 | /* | ||
225 | * Return nonzero iff we know a priori this clockid_t value is bogus. | ||
226 | */ | 205 | */ |
227 | static inline int invalid_clockid(clockid_t which_clock) | 206 | static inline int invalid_clockid(const clockid_t which_clock) |
228 | { | 207 | { |
229 | if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ | 208 | if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ |
230 | return 0; | 209 | return 0; |
@@ -232,26 +211,32 @@ static inline int invalid_clockid(clockid_t which_clock) | |||
232 | return 1; | 211 | return 1; |
233 | if (posix_clocks[which_clock].clock_getres != NULL) | 212 | if (posix_clocks[which_clock].clock_getres != NULL) |
234 | return 0; | 213 | return 0; |
235 | #ifndef CLOCK_DISPATCH_DIRECT | ||
236 | if (posix_clocks[which_clock].res != 0) | 214 | if (posix_clocks[which_clock].res != 0) |
237 | return 0; | 215 | return 0; |
238 | #endif | ||
239 | return 1; | 216 | return 1; |
240 | } | 217 | } |
241 | 218 | ||
219 | /* | ||
220 | * Get monotonic time for posix timers | ||
221 | */ | ||
222 | static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) | ||
223 | { | ||
224 | ktime_get_ts(tp); | ||
225 | return 0; | ||
226 | } | ||
242 | 227 | ||
243 | /* | 228 | /* |
244 | * Initialize everything, well, just everything in Posix clocks/timers ;) | 229 | * Initialize everything, well, just everything in Posix clocks/timers ;) |
245 | */ | 230 | */ |
246 | static __init int init_posix_timers(void) | 231 | static __init int init_posix_timers(void) |
247 | { | 232 | { |
248 | struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES, | 233 | struct k_clock clock_realtime = { |
249 | .abs_struct = &abs_list | 234 | .clock_getres = hrtimer_get_res, |
250 | }; | 235 | }; |
251 | struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES, | 236 | struct k_clock clock_monotonic = { |
252 | .abs_struct = NULL, | 237 | .clock_getres = hrtimer_get_res, |
253 | .clock_get = do_posix_clock_monotonic_get, | 238 | .clock_get = posix_ktime_get_ts, |
254 | .clock_set = do_posix_clock_nosettime | 239 | .clock_set = do_posix_clock_nosettime, |
255 | }; | 240 | }; |
256 | 241 | ||
257 | register_posix_clock(CLOCK_REALTIME, &clock_realtime); | 242 | register_posix_clock(CLOCK_REALTIME, &clock_realtime); |
@@ -265,117 +250,17 @@ static __init int init_posix_timers(void) | |||
265 | 250 | ||
266 | __initcall(init_posix_timers); | 251 | __initcall(init_posix_timers); |
267 | 252 | ||
268 | static void tstojiffie(struct timespec *tp, int res, u64 *jiff) | ||
269 | { | ||
270 | long sec = tp->tv_sec; | ||
271 | long nsec = tp->tv_nsec + res - 1; | ||
272 | |||
273 | if (nsec >= NSEC_PER_SEC) { | ||
274 | sec++; | ||
275 | nsec -= NSEC_PER_SEC; | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * The scaling constants are defined in <linux/time.h> | ||
280 | * The difference between there and here is that we do the | ||
281 | * res rounding and compute a 64-bit result (well so does that | ||
282 | * but it then throws away the high bits). | ||
283 | */ | ||
284 | *jiff = (mpy_l_X_l_ll(sec, SEC_CONVERSION) + | ||
285 | (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >> | ||
286 | (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; | ||
287 | } | ||
288 | |||
289 | /* | ||
290 | * This function adjusts the timer as needed as a result of the clock | ||
291 | * being set. It should only be called for absolute timers, and then | ||
292 | * under the abs_list lock. It computes the time difference and sets | ||
293 | * the new jiffies value in the timer. It also updates the timers | ||
294 | * reference wall_to_monotonic value. It is complicated by the fact | ||
295 | * that tstojiffies() only handles positive times and it needs to work | ||
296 | * with both positive and negative times. Also, for negative offsets, | ||
297 | * we need to defeat the res round up. | ||
298 | * | ||
299 | * Return is true if there is a new time, else false. | ||
300 | */ | ||
301 | static long add_clockset_delta(struct k_itimer *timr, | ||
302 | struct timespec *new_wall_to) | ||
303 | { | ||
304 | struct timespec delta; | ||
305 | int sign = 0; | ||
306 | u64 exp; | ||
307 | |||
308 | set_normalized_timespec(&delta, | ||
309 | new_wall_to->tv_sec - | ||
310 | timr->it.real.wall_to_prev.tv_sec, | ||
311 | new_wall_to->tv_nsec - | ||
312 | timr->it.real.wall_to_prev.tv_nsec); | ||
313 | if (likely(!(delta.tv_sec | delta.tv_nsec))) | ||
314 | return 0; | ||
315 | if (delta.tv_sec < 0) { | ||
316 | set_normalized_timespec(&delta, | ||
317 | -delta.tv_sec, | ||
318 | 1 - delta.tv_nsec - | ||
319 | posix_clocks[timr->it_clock].res); | ||
320 | sign++; | ||
321 | } | ||
322 | tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp); | ||
323 | timr->it.real.wall_to_prev = *new_wall_to; | ||
324 | timr->it.real.timer.expires += (sign ? -exp : exp); | ||
325 | return 1; | ||
326 | } | ||
327 | |||
328 | static void remove_from_abslist(struct k_itimer *timr) | ||
329 | { | ||
330 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | ||
331 | spin_lock(&abs_list.lock); | ||
332 | list_del_init(&timr->it.real.abs_timer_entry); | ||
333 | spin_unlock(&abs_list.lock); | ||
334 | } | ||
335 | } | ||
336 | |||
337 | static void schedule_next_timer(struct k_itimer *timr) | 253 | static void schedule_next_timer(struct k_itimer *timr) |
338 | { | 254 | { |
339 | struct timespec new_wall_to; | 255 | if (timr->it.real.interval.tv64 == 0) |
340 | struct now_struct now; | ||
341 | unsigned long seq; | ||
342 | |||
343 | /* | ||
344 | * Set up the timer for the next interval (if there is one). | ||
345 | * Note: this code uses the abs_timer_lock to protect | ||
346 | * it.real.wall_to_prev and must hold it until exp is set, not exactly | ||
347 | * obvious... | ||
348 | |||
349 | * This function is used for CLOCK_REALTIME* and | ||
350 | * CLOCK_MONOTONIC* timers. If we ever want to handle other | ||
351 | * CLOCKs, the calling code (do_schedule_next_timer) would need | ||
352 | * to pull the "clock" info from the timer and dispatch the | ||
353 | * "other" CLOCKs "next timer" code (which, I suppose should | ||
354 | * also be added to the k_clock structure). | ||
355 | */ | ||
356 | if (!timr->it.real.incr) | ||
357 | return; | 256 | return; |
358 | 257 | ||
359 | do { | 258 | timr->it_overrun += hrtimer_forward(&timr->it.real.timer, |
360 | seq = read_seqbegin(&xtime_lock); | 259 | timr->it.real.interval); |
361 | new_wall_to = wall_to_monotonic; | ||
362 | posix_get_now(&now); | ||
363 | } while (read_seqretry(&xtime_lock, seq)); | ||
364 | |||
365 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | ||
366 | spin_lock(&abs_list.lock); | ||
367 | add_clockset_delta(timr, &new_wall_to); | ||
368 | |||
369 | posix_bump_timer(timr, now); | ||
370 | |||
371 | spin_unlock(&abs_list.lock); | ||
372 | } else { | ||
373 | posix_bump_timer(timr, now); | ||
374 | } | ||
375 | timr->it_overrun_last = timr->it_overrun; | 260 | timr->it_overrun_last = timr->it_overrun; |
376 | timr->it_overrun = -1; | 261 | timr->it_overrun = -1; |
377 | ++timr->it_requeue_pending; | 262 | ++timr->it_requeue_pending; |
378 | add_timer(&timr->it.real.timer); | 263 | hrtimer_restart(&timr->it.real.timer); |
379 | } | 264 | } |
380 | 265 | ||
381 | /* | 266 | /* |
@@ -396,31 +281,23 @@ void do_schedule_next_timer(struct siginfo *info) | |||
396 | 281 | ||
397 | timr = lock_timer(info->si_tid, &flags); | 282 | timr = lock_timer(info->si_tid, &flags); |
398 | 283 | ||
399 | if (!timr || timr->it_requeue_pending != info->si_sys_private) | 284 | if (timr && timr->it_requeue_pending == info->si_sys_private) { |
400 | goto exit; | 285 | if (timr->it_clock < 0) |
286 | posix_cpu_timer_schedule(timr); | ||
287 | else | ||
288 | schedule_next_timer(timr); | ||
401 | 289 | ||
402 | if (timr->it_clock < 0) /* CPU clock */ | 290 | info->si_overrun = timr->it_overrun_last; |
403 | posix_cpu_timer_schedule(timr); | 291 | } |
404 | else | 292 | |
405 | schedule_next_timer(timr); | 293 | unlock_timer(timr, flags); |
406 | info->si_overrun = timr->it_overrun_last; | ||
407 | exit: | ||
408 | if (timr) | ||
409 | unlock_timer(timr, flags); | ||
410 | } | 294 | } |
411 | 295 | ||
412 | int posix_timer_event(struct k_itimer *timr,int si_private) | 296 | int posix_timer_event(struct k_itimer *timr,int si_private) |
413 | { | 297 | { |
414 | memset(&timr->sigq->info, 0, sizeof(siginfo_t)); | 298 | memset(&timr->sigq->info, 0, sizeof(siginfo_t)); |
415 | timr->sigq->info.si_sys_private = si_private; | 299 | timr->sigq->info.si_sys_private = si_private; |
416 | /* | 300 | /* Send signal to the process that owns this timer.*/ |
417 | * Send signal to the process that owns this timer. | ||
418 | |||
419 | * This code assumes that all the possible abs_lists share the | ||
420 | * same lock (there is only one list at this time). If this is | ||
421 | * not the case, the CLOCK info would need to be used to find | ||
422 | * the proper abs list lock. | ||
423 | */ | ||
424 | 301 | ||
425 | timr->sigq->info.si_signo = timr->it_sigev_signo; | 302 | timr->sigq->info.si_signo = timr->it_sigev_signo; |
426 | timr->sigq->info.si_errno = 0; | 303 | timr->sigq->info.si_errno = 0; |
@@ -454,66 +331,37 @@ EXPORT_SYMBOL_GPL(posix_timer_event); | |||
454 | 331 | ||
455 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. | 332 | * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. |
456 | */ | 333 | */ |
457 | static void posix_timer_fn(unsigned long __data) | 334 | static int posix_timer_fn(void *data) |
458 | { | 335 | { |
459 | struct k_itimer *timr = (struct k_itimer *) __data; | 336 | struct k_itimer *timr = data; |
460 | unsigned long flags; | 337 | unsigned long flags; |
461 | unsigned long seq; | 338 | int si_private = 0; |
462 | struct timespec delta, new_wall_to; | 339 | int ret = HRTIMER_NORESTART; |
463 | u64 exp = 0; | ||
464 | int do_notify = 1; | ||
465 | 340 | ||
466 | spin_lock_irqsave(&timr->it_lock, flags); | 341 | spin_lock_irqsave(&timr->it_lock, flags); |
467 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | ||
468 | spin_lock(&abs_list.lock); | ||
469 | do { | ||
470 | seq = read_seqbegin(&xtime_lock); | ||
471 | new_wall_to = wall_to_monotonic; | ||
472 | } while (read_seqretry(&xtime_lock, seq)); | ||
473 | set_normalized_timespec(&delta, | ||
474 | new_wall_to.tv_sec - | ||
475 | timr->it.real.wall_to_prev.tv_sec, | ||
476 | new_wall_to.tv_nsec - | ||
477 | timr->it.real.wall_to_prev.tv_nsec); | ||
478 | if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) { | ||
479 | /* do nothing, timer is on time */ | ||
480 | } else if (delta.tv_sec < 0) { | ||
481 | /* do nothing, timer is already late */ | ||
482 | } else { | ||
483 | /* timer is early due to a clock set */ | ||
484 | tstojiffie(&delta, | ||
485 | posix_clocks[timr->it_clock].res, | ||
486 | &exp); | ||
487 | timr->it.real.wall_to_prev = new_wall_to; | ||
488 | timr->it.real.timer.expires += exp; | ||
489 | add_timer(&timr->it.real.timer); | ||
490 | do_notify = 0; | ||
491 | } | ||
492 | spin_unlock(&abs_list.lock); | ||
493 | 342 | ||
494 | } | 343 | if (timr->it.real.interval.tv64 != 0) |
495 | if (do_notify) { | 344 | si_private = ++timr->it_requeue_pending; |
496 | int si_private=0; | ||
497 | 345 | ||
498 | if (timr->it.real.incr) | 346 | if (posix_timer_event(timr, si_private)) { |
499 | si_private = ++timr->it_requeue_pending; | 347 | /* |
500 | else { | 348 | * signal was not sent because of sig_ignor |
501 | remove_from_abslist(timr); | 349 | * we will not get a call back to restart it AND |
350 | * it should be restarted. | ||
351 | */ | ||
352 | if (timr->it.real.interval.tv64 != 0) { | ||
353 | timr->it_overrun += | ||
354 | hrtimer_forward(&timr->it.real.timer, | ||
355 | timr->it.real.interval); | ||
356 | ret = HRTIMER_RESTART; | ||
502 | } | 357 | } |
503 | |||
504 | if (posix_timer_event(timr, si_private)) | ||
505 | /* | ||
506 | * signal was not sent because of sig_ignor | ||
507 | * we will not get a call back to restart it AND | ||
508 | * it should be restarted. | ||
509 | */ | ||
510 | schedule_next_timer(timr); | ||
511 | } | 358 | } |
512 | unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */ | ||
513 | } | ||
514 | 359 | ||
360 | unlock_timer(timr, flags); | ||
361 | return ret; | ||
362 | } | ||
515 | 363 | ||
516 | static inline struct task_struct * good_sigevent(sigevent_t * event) | 364 | static struct task_struct * good_sigevent(sigevent_t * event) |
517 | { | 365 | { |
518 | struct task_struct *rtn = current->group_leader; | 366 | struct task_struct *rtn = current->group_leader; |
519 | 367 | ||
@@ -530,7 +378,7 @@ static inline struct task_struct * good_sigevent(sigevent_t * event) | |||
530 | return rtn; | 378 | return rtn; |
531 | } | 379 | } |
532 | 380 | ||
533 | void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock) | 381 | void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) |
534 | { | 382 | { |
535 | if ((unsigned) clock_id >= MAX_CLOCKS) { | 383 | if ((unsigned) clock_id >= MAX_CLOCKS) { |
536 | printk("POSIX clock register failed for clock_id %d\n", | 384 | printk("POSIX clock register failed for clock_id %d\n", |
@@ -576,7 +424,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | |||
576 | /* Create a POSIX.1b interval timer. */ | 424 | /* Create a POSIX.1b interval timer. */ |
577 | 425 | ||
578 | asmlinkage long | 426 | asmlinkage long |
579 | sys_timer_create(clockid_t which_clock, | 427 | sys_timer_create(const clockid_t which_clock, |
580 | struct sigevent __user *timer_event_spec, | 428 | struct sigevent __user *timer_event_spec, |
581 | timer_t __user * created_timer_id) | 429 | timer_t __user * created_timer_id) |
582 | { | 430 | { |
@@ -602,8 +450,7 @@ sys_timer_create(clockid_t which_clock, | |||
602 | goto out; | 450 | goto out; |
603 | } | 451 | } |
604 | spin_lock_irq(&idr_lock); | 452 | spin_lock_irq(&idr_lock); |
605 | error = idr_get_new(&posix_timers_id, | 453 | error = idr_get_new(&posix_timers_id, (void *) new_timer, |
606 | (void *) new_timer, | ||
607 | &new_timer_id); | 454 | &new_timer_id); |
608 | spin_unlock_irq(&idr_lock); | 455 | spin_unlock_irq(&idr_lock); |
609 | if (error == -EAGAIN) | 456 | if (error == -EAGAIN) |
@@ -704,27 +551,6 @@ out: | |||
704 | } | 551 | } |
705 | 552 | ||
706 | /* | 553 | /* |
707 | * good_timespec | ||
708 | * | ||
709 | * This function checks the elements of a timespec structure. | ||
710 | * | ||
711 | * Arguments: | ||
712 | * ts : Pointer to the timespec structure to check | ||
713 | * | ||
714 | * Return value: | ||
715 | * If a NULL pointer was passed in, or the tv_nsec field was less than 0 | ||
716 | * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0, | ||
717 | * this function returns 0. Otherwise it returns 1. | ||
718 | */ | ||
719 | static int good_timespec(const struct timespec *ts) | ||
720 | { | ||
721 | if ((!ts) || (ts->tv_sec < 0) || | ||
722 | ((unsigned) ts->tv_nsec >= NSEC_PER_SEC)) | ||
723 | return 0; | ||
724 | return 1; | ||
725 | } | ||
726 | |||
727 | /* | ||
728 | * Locking issues: We need to protect the result of the id look up until | 554 | * Locking issues: We need to protect the result of the id look up until |
729 | * we get the timer locked down so it is not deleted under us. The | 555 | * we get the timer locked down so it is not deleted under us. The |
730 | * removal is done under the idr spinlock so we use that here to bridge | 556 | * removal is done under the idr spinlock so we use that here to bridge |
@@ -776,39 +602,39 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) | |||
776 | static void | 602 | static void |
777 | common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) | 603 | common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) |
778 | { | 604 | { |
779 | unsigned long expires; | 605 | ktime_t remaining; |
780 | struct now_struct now; | 606 | struct hrtimer *timer = &timr->it.real.timer; |
781 | 607 | ||
782 | do | 608 | memset(cur_setting, 0, sizeof(struct itimerspec)); |
783 | expires = timr->it.real.timer.expires; | 609 | remaining = hrtimer_get_remaining(timer); |
784 | while ((volatile long) (timr->it.real.timer.expires) != expires); | ||
785 | |||
786 | posix_get_now(&now); | ||
787 | |||
788 | if (expires && | ||
789 | ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) && | ||
790 | !timr->it.real.incr && | ||
791 | posix_time_before(&timr->it.real.timer, &now)) | ||
792 | timr->it.real.timer.expires = expires = 0; | ||
793 | if (expires) { | ||
794 | if (timr->it_requeue_pending & REQUEUE_PENDING || | ||
795 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { | ||
796 | posix_bump_timer(timr, now); | ||
797 | expires = timr->it.real.timer.expires; | ||
798 | } | ||
799 | else | ||
800 | if (!timer_pending(&timr->it.real.timer)) | ||
801 | expires = 0; | ||
802 | if (expires) | ||
803 | expires -= now.jiffies; | ||
804 | } | ||
805 | jiffies_to_timespec(expires, &cur_setting->it_value); | ||
806 | jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval); | ||
807 | 610 | ||
808 | if (cur_setting->it_value.tv_sec < 0) { | 611 | /* Time left ? or timer pending */ |
612 | if (remaining.tv64 > 0 || hrtimer_active(timer)) | ||
613 | goto calci; | ||
614 | /* interval timer ? */ | ||
615 | if (timr->it.real.interval.tv64 == 0) | ||
616 | return; | ||
617 | /* | ||
618 | * When a requeue is pending or this is a SIGEV_NONE timer | ||
619 | * move the expiry time forward by intervals, so expiry is > | ||
620 | * now. | ||
621 | */ | ||
622 | if (timr->it_requeue_pending & REQUEUE_PENDING || | ||
623 | (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { | ||
624 | timr->it_overrun += | ||
625 | hrtimer_forward(timer, timr->it.real.interval); | ||
626 | remaining = hrtimer_get_remaining(timer); | ||
627 | } | ||
628 | calci: | ||
629 | /* interval timer ? */ | ||
630 | if (timr->it.real.interval.tv64 != 0) | ||
631 | cur_setting->it_interval = | ||
632 | ktime_to_timespec(timr->it.real.interval); | ||
633 | /* Return 0 only, when the timer is expired and not pending */ | ||
634 | if (remaining.tv64 <= 0) | ||
809 | cur_setting->it_value.tv_nsec = 1; | 635 | cur_setting->it_value.tv_nsec = 1; |
810 | cur_setting->it_value.tv_sec = 0; | 636 | else |
811 | } | 637 | cur_setting->it_value = ktime_to_timespec(remaining); |
812 | } | 638 | } |
813 | 639 | ||
814 | /* Get the time remaining on a POSIX.1b interval timer. */ | 640 | /* Get the time remaining on a POSIX.1b interval timer. */ |
@@ -832,6 +658,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) | |||
832 | 658 | ||
833 | return 0; | 659 | return 0; |
834 | } | 660 | } |
661 | |||
835 | /* | 662 | /* |
836 | * Get the number of overruns of a POSIX.1b interval timer. This is to | 663 | * Get the number of overruns of a POSIX.1b interval timer. This is to |
837 | * be the overrun of the timer last delivered. At the same time we are | 664 | * be the overrun of the timer last delivered. At the same time we are |
@@ -841,7 +668,6 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) | |||
841 | * the call back to do_schedule_next_timer(). So all we need to do is | 668 | * the call back to do_schedule_next_timer(). So all we need to do is |
842 | * to pick up the frozen overrun. | 669 | * to pick up the frozen overrun. |
843 | */ | 670 | */ |
844 | |||
845 | asmlinkage long | 671 | asmlinkage long |
846 | sys_timer_getoverrun(timer_t timer_id) | 672 | sys_timer_getoverrun(timer_t timer_id) |
847 | { | 673 | { |
@@ -858,153 +684,55 @@ sys_timer_getoverrun(timer_t timer_id) | |||
858 | 684 | ||
859 | return overrun; | 685 | return overrun; |
860 | } | 686 | } |
861 | /* | ||
862 | * Adjust for absolute time | ||
863 | * | ||
864 | * If absolute time is given and it is not CLOCK_MONOTONIC, we need to | ||
865 | * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and | ||
866 | * what ever clock he is using. | ||
867 | * | ||
868 | * If it is relative time, we need to add the current (CLOCK_MONOTONIC) | ||
869 | * time to it to get the proper time for the timer. | ||
870 | */ | ||
871 | static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, | ||
872 | int abs, u64 *exp, struct timespec *wall_to) | ||
873 | { | ||
874 | struct timespec now; | ||
875 | struct timespec oc = *tp; | ||
876 | u64 jiffies_64_f; | ||
877 | int rtn =0; | ||
878 | |||
879 | if (abs) { | ||
880 | /* | ||
881 | * The mask pick up the 4 basic clocks | ||
882 | */ | ||
883 | if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) { | ||
884 | jiffies_64_f = do_posix_clock_monotonic_gettime_parts( | ||
885 | &now, wall_to); | ||
886 | /* | ||
887 | * If we are doing a MONOTONIC clock | ||
888 | */ | ||
889 | if((clock - &posix_clocks[0]) & CLOCKS_MONO){ | ||
890 | now.tv_sec += wall_to->tv_sec; | ||
891 | now.tv_nsec += wall_to->tv_nsec; | ||
892 | } | ||
893 | } else { | ||
894 | /* | ||
895 | * Not one of the basic clocks | ||
896 | */ | ||
897 | clock->clock_get(clock - posix_clocks, &now); | ||
898 | jiffies_64_f = get_jiffies_64(); | ||
899 | } | ||
900 | /* | ||
901 | * Take away now to get delta and normalize | ||
902 | */ | ||
903 | set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec, | ||
904 | oc.tv_nsec - now.tv_nsec); | ||
905 | }else{ | ||
906 | jiffies_64_f = get_jiffies_64(); | ||
907 | } | ||
908 | /* | ||
909 | * Check if the requested time is prior to now (if so set now) | ||
910 | */ | ||
911 | if (oc.tv_sec < 0) | ||
912 | oc.tv_sec = oc.tv_nsec = 0; | ||
913 | |||
914 | if (oc.tv_sec | oc.tv_nsec) | ||
915 | set_normalized_timespec(&oc, oc.tv_sec, | ||
916 | oc.tv_nsec + clock->res); | ||
917 | tstojiffie(&oc, clock->res, exp); | ||
918 | |||
919 | /* | ||
920 | * Check if the requested time is more than the timer code | ||
921 | * can handle (if so we error out but return the value too). | ||
922 | */ | ||
923 | if (*exp > ((u64)MAX_JIFFY_OFFSET)) | ||
924 | /* | ||
925 | * This is a considered response, not exactly in | ||
926 | * line with the standard (in fact it is silent on | ||
927 | * possible overflows). We assume such a large | ||
928 | * value is ALMOST always a programming error and | ||
929 | * try not to compound it by setting a really dumb | ||
930 | * value. | ||
931 | */ | ||
932 | rtn = -EINVAL; | ||
933 | /* | ||
934 | * return the actual jiffies expire time, full 64 bits | ||
935 | */ | ||
936 | *exp += jiffies_64_f; | ||
937 | return rtn; | ||
938 | } | ||
939 | 687 | ||
940 | /* Set a POSIX.1b interval timer. */ | 688 | /* Set a POSIX.1b interval timer. */ |
941 | /* timr->it_lock is taken. */ | 689 | /* timr->it_lock is taken. */ |
942 | static inline int | 690 | static int |
943 | common_timer_set(struct k_itimer *timr, int flags, | 691 | common_timer_set(struct k_itimer *timr, int flags, |
944 | struct itimerspec *new_setting, struct itimerspec *old_setting) | 692 | struct itimerspec *new_setting, struct itimerspec *old_setting) |
945 | { | 693 | { |
946 | struct k_clock *clock = &posix_clocks[timr->it_clock]; | 694 | struct hrtimer *timer = &timr->it.real.timer; |
947 | u64 expire_64; | ||
948 | 695 | ||
949 | if (old_setting) | 696 | if (old_setting) |
950 | common_timer_get(timr, old_setting); | 697 | common_timer_get(timr, old_setting); |
951 | 698 | ||
952 | /* disable the timer */ | 699 | /* disable the timer */ |
953 | timr->it.real.incr = 0; | 700 | timr->it.real.interval.tv64 = 0; |
954 | /* | 701 | /* |
955 | * careful here. If smp we could be in the "fire" routine which will | 702 | * careful here. If smp we could be in the "fire" routine which will |
956 | * be spinning as we hold the lock. But this is ONLY an SMP issue. | 703 | * be spinning as we hold the lock. But this is ONLY an SMP issue. |
957 | */ | 704 | */ |
958 | if (try_to_del_timer_sync(&timr->it.real.timer) < 0) { | 705 | if (hrtimer_try_to_cancel(timer) < 0) |
959 | #ifdef CONFIG_SMP | ||
960 | /* | ||
961 | * It can only be active if on an other cpu. Since | ||
962 | * we have cleared the interval stuff above, it should | ||
963 | * clear once we release the spin lock. Of course once | ||
964 | * we do that anything could happen, including the | ||
965 | * complete melt down of the timer. So return with | ||
966 | * a "retry" exit status. | ||
967 | */ | ||
968 | return TIMER_RETRY; | 706 | return TIMER_RETRY; |
969 | #endif | ||
970 | } | ||
971 | |||
972 | remove_from_abslist(timr); | ||
973 | 707 | ||
974 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & | 708 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & |
975 | ~REQUEUE_PENDING; | 709 | ~REQUEUE_PENDING; |
976 | timr->it_overrun_last = 0; | 710 | timr->it_overrun_last = 0; |
977 | timr->it_overrun = -1; | ||
978 | /* | ||
979 | *switch off the timer when it_value is zero | ||
980 | */ | ||
981 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) { | ||
982 | timr->it.real.timer.expires = 0; | ||
983 | return 0; | ||
984 | } | ||
985 | 711 | ||
986 | if (adjust_abs_time(clock, | 712 | /* switch off the timer when it_value is zero */ |
987 | &new_setting->it_value, flags & TIMER_ABSTIME, | 713 | if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) |
988 | &expire_64, &(timr->it.real.wall_to_prev))) { | 714 | return 0; |
989 | return -EINVAL; | ||
990 | } | ||
991 | timr->it.real.timer.expires = (unsigned long)expire_64; | ||
992 | tstojiffie(&new_setting->it_interval, clock->res, &expire_64); | ||
993 | timr->it.real.incr = (unsigned long)expire_64; | ||
994 | 715 | ||
995 | /* | 716 | /* Posix madness. Only absolute CLOCK_REALTIME timers |
996 | * We do not even queue SIGEV_NONE timers! But we do put them | 717 | * are affected by clock sets. So we must reiniatilize |
997 | * in the abs list so we can do that right. | 718 | * the timer. |
998 | */ | 719 | */ |
999 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)) | 720 | if (timr->it_clock == CLOCK_REALTIME && (flags & TIMER_ABSTIME)) |
1000 | add_timer(&timr->it.real.timer); | 721 | hrtimer_rebase(timer, CLOCK_REALTIME); |
1001 | 722 | else | |
1002 | if (flags & TIMER_ABSTIME && clock->abs_struct) { | 723 | hrtimer_rebase(timer, CLOCK_MONOTONIC); |
1003 | spin_lock(&clock->abs_struct->lock); | 724 | |
1004 | list_add_tail(&(timr->it.real.abs_timer_entry), | 725 | timer->expires = timespec_to_ktime(new_setting->it_value); |
1005 | &(clock->abs_struct->list)); | 726 | |
1006 | spin_unlock(&clock->abs_struct->lock); | 727 | /* Convert interval */ |
1007 | } | 728 | timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); |
729 | |||
730 | /* SIGEV_NONE timers are not queued ! See common_timer_get */ | ||
731 | if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) | ||
732 | return 0; | ||
733 | |||
734 | hrtimer_start(timer, timer->expires, (flags & TIMER_ABSTIME) ? | ||
735 | HRTIMER_ABS : HRTIMER_REL); | ||
1008 | return 0; | 736 | return 0; |
1009 | } | 737 | } |
1010 | 738 | ||
@@ -1026,8 +754,8 @@ sys_timer_settime(timer_t timer_id, int flags, | |||
1026 | if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) | 754 | if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) |
1027 | return -EFAULT; | 755 | return -EFAULT; |
1028 | 756 | ||
1029 | if ((!good_timespec(&new_spec.it_interval)) || | 757 | if (!timespec_valid(&new_spec.it_interval) || |
1030 | (!good_timespec(&new_spec.it_value))) | 758 | !timespec_valid(&new_spec.it_value)) |
1031 | return -EINVAL; | 759 | return -EINVAL; |
1032 | retry: | 760 | retry: |
1033 | timr = lock_timer(timer_id, &flag); | 761 | timr = lock_timer(timer_id, &flag); |
@@ -1043,8 +771,8 @@ retry: | |||
1043 | goto retry; | 771 | goto retry; |
1044 | } | 772 | } |
1045 | 773 | ||
1046 | if (old_setting && !error && copy_to_user(old_setting, | 774 | if (old_setting && !error && |
1047 | &old_spec, sizeof (old_spec))) | 775 | copy_to_user(old_setting, &old_spec, sizeof (old_spec))) |
1048 | error = -EFAULT; | 776 | error = -EFAULT; |
1049 | 777 | ||
1050 | return error; | 778 | return error; |
@@ -1052,24 +780,10 @@ retry: | |||
1052 | 780 | ||
1053 | static inline int common_timer_del(struct k_itimer *timer) | 781 | static inline int common_timer_del(struct k_itimer *timer) |
1054 | { | 782 | { |
1055 | timer->it.real.incr = 0; | 783 | timer->it.real.interval.tv64 = 0; |
1056 | 784 | ||
1057 | if (try_to_del_timer_sync(&timer->it.real.timer) < 0) { | 785 | if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) |
1058 | #ifdef CONFIG_SMP | ||
1059 | /* | ||
1060 | * It can only be active if on an other cpu. Since | ||
1061 | * we have cleared the interval stuff above, it should | ||
1062 | * clear once we release the spin lock. Of course once | ||
1063 | * we do that anything could happen, including the | ||
1064 | * complete melt down of the timer. So return with | ||
1065 | * a "retry" exit status. | ||
1066 | */ | ||
1067 | return TIMER_RETRY; | 786 | return TIMER_RETRY; |
1068 | #endif | ||
1069 | } | ||
1070 | |||
1071 | remove_from_abslist(timer); | ||
1072 | |||
1073 | return 0; | 787 | return 0; |
1074 | } | 788 | } |
1075 | 789 | ||
@@ -1085,24 +799,16 @@ sys_timer_delete(timer_t timer_id) | |||
1085 | struct k_itimer *timer; | 799 | struct k_itimer *timer; |
1086 | long flags; | 800 | long flags; |
1087 | 801 | ||
1088 | #ifdef CONFIG_SMP | ||
1089 | int error; | ||
1090 | retry_delete: | 802 | retry_delete: |
1091 | #endif | ||
1092 | timer = lock_timer(timer_id, &flags); | 803 | timer = lock_timer(timer_id, &flags); |
1093 | if (!timer) | 804 | if (!timer) |
1094 | return -EINVAL; | 805 | return -EINVAL; |
1095 | 806 | ||
1096 | #ifdef CONFIG_SMP | 807 | if (timer_delete_hook(timer) == TIMER_RETRY) { |
1097 | error = timer_delete_hook(timer); | ||
1098 | |||
1099 | if (error == TIMER_RETRY) { | ||
1100 | unlock_timer(timer, flags); | 808 | unlock_timer(timer, flags); |
1101 | goto retry_delete; | 809 | goto retry_delete; |
1102 | } | 810 | } |
1103 | #else | 811 | |
1104 | timer_delete_hook(timer); | ||
1105 | #endif | ||
1106 | spin_lock(&current->sighand->siglock); | 812 | spin_lock(&current->sighand->siglock); |
1107 | list_del(&timer->list); | 813 | list_del(&timer->list); |
1108 | spin_unlock(&current->sighand->siglock); | 814 | spin_unlock(&current->sighand->siglock); |
@@ -1119,29 +825,21 @@ retry_delete: | |||
1119 | release_posix_timer(timer, IT_ID_SET); | 825 | release_posix_timer(timer, IT_ID_SET); |
1120 | return 0; | 826 | return 0; |
1121 | } | 827 | } |
828 | |||
1122 | /* | 829 | /* |
1123 | * return timer owned by the process, used by exit_itimers | 830 | * return timer owned by the process, used by exit_itimers |
1124 | */ | 831 | */ |
1125 | static inline void itimer_delete(struct k_itimer *timer) | 832 | static void itimer_delete(struct k_itimer *timer) |
1126 | { | 833 | { |
1127 | unsigned long flags; | 834 | unsigned long flags; |
1128 | 835 | ||
1129 | #ifdef CONFIG_SMP | ||
1130 | int error; | ||
1131 | retry_delete: | 836 | retry_delete: |
1132 | #endif | ||
1133 | spin_lock_irqsave(&timer->it_lock, flags); | 837 | spin_lock_irqsave(&timer->it_lock, flags); |
1134 | 838 | ||
1135 | #ifdef CONFIG_SMP | 839 | if (timer_delete_hook(timer) == TIMER_RETRY) { |
1136 | error = timer_delete_hook(timer); | ||
1137 | |||
1138 | if (error == TIMER_RETRY) { | ||
1139 | unlock_timer(timer, flags); | 840 | unlock_timer(timer, flags); |
1140 | goto retry_delete; | 841 | goto retry_delete; |
1141 | } | 842 | } |
1142 | #else | ||
1143 | timer_delete_hook(timer); | ||
1144 | #endif | ||
1145 | list_del(&timer->list); | 843 | list_del(&timer->list); |
1146 | /* | 844 | /* |
1147 | * This keeps any tasks waiting on the spin lock from thinking | 845 | * This keeps any tasks waiting on the spin lock from thinking |
@@ -1170,57 +868,8 @@ void exit_itimers(struct signal_struct *sig) | |||
1170 | } | 868 | } |
1171 | } | 869 | } |
1172 | 870 | ||
1173 | /* | 871 | /* Not available / possible... functions */ |
1174 | * And now for the "clock" calls | 872 | int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) |
1175 | * | ||
1176 | * These functions are called both from timer functions (with the timer | ||
1177 | * spin_lock_irq() held and from clock calls with no locking. They must | ||
1178 | * use the save flags versions of locks. | ||
1179 | */ | ||
1180 | |||
1181 | /* | ||
1182 | * We do ticks here to avoid the irq lock ( they take sooo long). | ||
1183 | * The seqlock is great here. Since we a reader, we don't really care | ||
1184 | * if we are interrupted since we don't take lock that will stall us or | ||
1185 | * any other cpu. Voila, no irq lock is needed. | ||
1186 | * | ||
1187 | */ | ||
1188 | |||
1189 | static u64 do_posix_clock_monotonic_gettime_parts( | ||
1190 | struct timespec *tp, struct timespec *mo) | ||
1191 | { | ||
1192 | u64 jiff; | ||
1193 | unsigned int seq; | ||
1194 | |||
1195 | do { | ||
1196 | seq = read_seqbegin(&xtime_lock); | ||
1197 | getnstimeofday(tp); | ||
1198 | *mo = wall_to_monotonic; | ||
1199 | jiff = jiffies_64; | ||
1200 | |||
1201 | } while(read_seqretry(&xtime_lock, seq)); | ||
1202 | |||
1203 | return jiff; | ||
1204 | } | ||
1205 | |||
1206 | static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp) | ||
1207 | { | ||
1208 | struct timespec wall_to_mono; | ||
1209 | |||
1210 | do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); | ||
1211 | |||
1212 | set_normalized_timespec(tp, tp->tv_sec + wall_to_mono.tv_sec, | ||
1213 | tp->tv_nsec + wall_to_mono.tv_nsec); | ||
1214 | |||
1215 | return 0; | ||
1216 | } | ||
1217 | |||
1218 | int do_posix_clock_monotonic_gettime(struct timespec *tp) | ||
1219 | { | ||
1220 | return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp); | ||
1221 | } | ||
1222 | |||
1223 | int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp) | ||
1224 | { | 873 | { |
1225 | return -EINVAL; | 874 | return -EINVAL; |
1226 | } | 875 | } |
@@ -1232,7 +881,8 @@ int do_posix_clock_notimer_create(struct k_itimer *timer) | |||
1232 | } | 881 | } |
1233 | EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); | 882 | EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); |
1234 | 883 | ||
1235 | int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) | 884 | int do_posix_clock_nonanosleep(const clockid_t clock, int flags, |
885 | struct timespec *t, struct timespec __user *r) | ||
1236 | { | 886 | { |
1237 | #ifndef ENOTSUP | 887 | #ifndef ENOTSUP |
1238 | return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ | 888 | return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ |
@@ -1242,8 +892,8 @@ int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) | |||
1242 | } | 892 | } |
1243 | EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); | 893 | EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); |
1244 | 894 | ||
1245 | asmlinkage long | 895 | asmlinkage long sys_clock_settime(const clockid_t which_clock, |
1246 | sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) | 896 | const struct timespec __user *tp) |
1247 | { | 897 | { |
1248 | struct timespec new_tp; | 898 | struct timespec new_tp; |
1249 | 899 | ||
@@ -1256,7 +906,7 @@ sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) | |||
1256 | } | 906 | } |
1257 | 907 | ||
1258 | asmlinkage long | 908 | asmlinkage long |
1259 | sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) | 909 | sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp) |
1260 | { | 910 | { |
1261 | struct timespec kernel_tp; | 911 | struct timespec kernel_tp; |
1262 | int error; | 912 | int error; |
@@ -1273,7 +923,7 @@ sys_clock_gettime(clockid_t which_clock, struct timespec __user *tp) | |||
1273 | } | 923 | } |
1274 | 924 | ||
1275 | asmlinkage long | 925 | asmlinkage long |
1276 | sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) | 926 | sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) |
1277 | { | 927 | { |
1278 | struct timespec rtn_tp; | 928 | struct timespec rtn_tp; |
1279 | int error; | 929 | int error; |
@@ -1292,117 +942,34 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) | |||
1292 | } | 942 | } |
1293 | 943 | ||
1294 | /* | 944 | /* |
1295 | * The standard says that an absolute nanosleep call MUST wake up at | 945 | * nanosleep for monotonic and realtime clocks |
1296 | * the requested time in spite of clock settings. Here is what we do: | ||
1297 | * For each nanosleep call that needs it (only absolute and not on | ||
1298 | * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure | ||
1299 | * into the "nanosleep_abs_list". All we need is the task_struct pointer. | ||
1300 | * When ever the clock is set we just wake up all those tasks. The rest | ||
1301 | * is done by the while loop in clock_nanosleep(). | ||
1302 | * | ||
1303 | * On locking, clock_was_set() is called from update_wall_clock which | ||
1304 | * holds (or has held for it) a write_lock_irq( xtime_lock) and is | ||
1305 | * called from the timer bh code. Thus we need the irq save locks. | ||
1306 | * | ||
1307 | * Also, on the call from update_wall_clock, that is done as part of a | ||
1308 | * softirq thing. We don't want to delay the system that much (possibly | ||
1309 | * long list of timers to fix), so we defer that work to keventd. | ||
1310 | */ | 946 | */ |
1311 | 947 | static int common_nsleep(const clockid_t which_clock, int flags, | |
1312 | static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue); | 948 | struct timespec *tsave, struct timespec __user *rmtp) |
1313 | static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL); | 949 | { |
1314 | 950 | int mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL; | |
1315 | static DECLARE_MUTEX(clock_was_set_lock); | 951 | int clockid = which_clock; |
1316 | 952 | ||
1317 | void clock_was_set(void) | 953 | switch (which_clock) { |
1318 | { | 954 | case CLOCK_REALTIME: |
1319 | struct k_itimer *timr; | 955 | /* Posix madness. Only absolute timers on clock realtime |
1320 | struct timespec new_wall_to; | 956 | are affected by clock set. */ |
1321 | LIST_HEAD(cws_list); | 957 | if (mode != HRTIMER_ABS) |
1322 | unsigned long seq; | 958 | clockid = CLOCK_MONOTONIC; |
1323 | 959 | case CLOCK_MONOTONIC: | |
1324 | 960 | break; | |
1325 | if (unlikely(in_interrupt())) { | 961 | default: |
1326 | schedule_work(&clock_was_set_work); | 962 | return -EINVAL; |
1327 | return; | ||
1328 | } | 963 | } |
1329 | wake_up_all(&nanosleep_abs_wqueue); | 964 | return hrtimer_nanosleep(tsave, rmtp, mode, clockid); |
1330 | |||
1331 | /* | ||
1332 | * Check if there exist TIMER_ABSTIME timers to correct. | ||
1333 | * | ||
1334 | * Notes on locking: This code is run in task context with irq | ||
1335 | * on. We CAN be interrupted! All other usage of the abs list | ||
1336 | * lock is under the timer lock which holds the irq lock as | ||
1337 | * well. We REALLY don't want to scan the whole list with the | ||
1338 | * interrupt system off, AND we would like a sequence lock on | ||
1339 | * this code as well. Since we assume that the clock will not | ||
1340 | * be set often, it seems ok to take and release the irq lock | ||
1341 | * for each timer. In fact add_timer will do this, so this is | ||
1342 | * not an issue. So we know when we are done, we will move the | ||
1343 | * whole list to a new location. Then as we process each entry, | ||
1344 | * we will move it to the actual list again. This way, when our | ||
1345 | * copy is empty, we are done. We are not all that concerned | ||
1346 | * about preemption so we will use a semaphore lock to protect | ||
1347 | * aginst reentry. This way we will not stall another | ||
1348 | * processor. It is possible that this may delay some timers | ||
1349 | * that should have expired, given the new clock, but even this | ||
1350 | * will be minimal as we will always update to the current time, | ||
1351 | * even if it was set by a task that is waiting for entry to | ||
1352 | * this code. Timers that expire too early will be caught by | ||
1353 | * the expire code and restarted. | ||
1354 | |||
1355 | * Absolute timers that repeat are left in the abs list while | ||
1356 | * waiting for the task to pick up the signal. This means we | ||
1357 | * may find timers that are not in the "add_timer" list, but are | ||
1358 | * in the abs list. We do the same thing for these, save | ||
1359 | * putting them back in the "add_timer" list. (Note, these are | ||
1360 | * left in the abs list mainly to indicate that they are | ||
1361 | * ABSOLUTE timers, a fact that is used by the re-arm code, and | ||
1362 | * for which we have no other flag.) | ||
1363 | |||
1364 | */ | ||
1365 | |||
1366 | down(&clock_was_set_lock); | ||
1367 | spin_lock_irq(&abs_list.lock); | ||
1368 | list_splice_init(&abs_list.list, &cws_list); | ||
1369 | spin_unlock_irq(&abs_list.lock); | ||
1370 | do { | ||
1371 | do { | ||
1372 | seq = read_seqbegin(&xtime_lock); | ||
1373 | new_wall_to = wall_to_monotonic; | ||
1374 | } while (read_seqretry(&xtime_lock, seq)); | ||
1375 | |||
1376 | spin_lock_irq(&abs_list.lock); | ||
1377 | if (list_empty(&cws_list)) { | ||
1378 | spin_unlock_irq(&abs_list.lock); | ||
1379 | break; | ||
1380 | } | ||
1381 | timr = list_entry(cws_list.next, struct k_itimer, | ||
1382 | it.real.abs_timer_entry); | ||
1383 | |||
1384 | list_del_init(&timr->it.real.abs_timer_entry); | ||
1385 | if (add_clockset_delta(timr, &new_wall_to) && | ||
1386 | del_timer(&timr->it.real.timer)) /* timer run yet? */ | ||
1387 | add_timer(&timr->it.real.timer); | ||
1388 | list_add(&timr->it.real.abs_timer_entry, &abs_list.list); | ||
1389 | spin_unlock_irq(&abs_list.lock); | ||
1390 | } while (1); | ||
1391 | |||
1392 | up(&clock_was_set_lock); | ||
1393 | } | 965 | } |
1394 | 966 | ||
1395 | long clock_nanosleep_restart(struct restart_block *restart_block); | ||
1396 | |||
1397 | asmlinkage long | 967 | asmlinkage long |
1398 | sys_clock_nanosleep(clockid_t which_clock, int flags, | 968 | sys_clock_nanosleep(const clockid_t which_clock, int flags, |
1399 | const struct timespec __user *rqtp, | 969 | const struct timespec __user *rqtp, |
1400 | struct timespec __user *rmtp) | 970 | struct timespec __user *rmtp) |
1401 | { | 971 | { |
1402 | struct timespec t; | 972 | struct timespec t; |
1403 | struct restart_block *restart_block = | ||
1404 | &(current_thread_info()->restart_block); | ||
1405 | int ret; | ||
1406 | 973 | ||
1407 | if (invalid_clockid(which_clock)) | 974 | if (invalid_clockid(which_clock)) |
1408 | return -EINVAL; | 975 | return -EINVAL; |
@@ -1410,125 +977,9 @@ sys_clock_nanosleep(clockid_t which_clock, int flags, | |||
1410 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) | 977 | if (copy_from_user(&t, rqtp, sizeof (struct timespec))) |
1411 | return -EFAULT; | 978 | return -EFAULT; |
1412 | 979 | ||
1413 | if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0) | 980 | if (!timespec_valid(&t)) |
1414 | return -EINVAL; | 981 | return -EINVAL; |
1415 | 982 | ||
1416 | /* | 983 | return CLOCK_DISPATCH(which_clock, nsleep, |
1417 | * Do this here as nsleep function does not have the real address. | 984 | (which_clock, flags, &t, rmtp)); |
1418 | */ | ||
1419 | restart_block->arg1 = (unsigned long)rmtp; | ||
1420 | |||
1421 | ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t)); | ||
1422 | |||
1423 | if ((ret == -ERESTART_RESTARTBLOCK) && rmtp && | ||
1424 | copy_to_user(rmtp, &t, sizeof (t))) | ||
1425 | return -EFAULT; | ||
1426 | return ret; | ||
1427 | } | ||
1428 | |||
1429 | |||
1430 | static int common_nsleep(clockid_t which_clock, | ||
1431 | int flags, struct timespec *tsave) | ||
1432 | { | ||
1433 | struct timespec t, dum; | ||
1434 | DECLARE_WAITQUEUE(abs_wqueue, current); | ||
1435 | u64 rq_time = (u64)0; | ||
1436 | s64 left; | ||
1437 | int abs; | ||
1438 | struct restart_block *restart_block = | ||
1439 | &current_thread_info()->restart_block; | |
1440 | |||
1441 | abs_wqueue.flags = 0; | ||
1442 | abs = flags & TIMER_ABSTIME; | ||
1443 | |||
1444 | if (restart_block->fn == clock_nanosleep_restart) { | ||
1445 | /* | ||
1446 | * Interrupted by a non-delivered signal, pick up remaining | ||
1447 | * time and continue. Remaining time is in arg2 & 3. | ||
1448 | */ | ||
1449 | restart_block->fn = do_no_restart_syscall; | ||
1450 | |||
1451 | rq_time = restart_block->arg3; | ||
1452 | rq_time = (rq_time << 32) + restart_block->arg2; | ||
1453 | if (!rq_time) | ||
1454 | return -EINTR; | ||
1455 | left = rq_time - get_jiffies_64(); | ||
1456 | if (left <= (s64)0) | ||
1457 | return 0; /* Already passed */ | ||
1458 | } | ||
1459 | |||
1460 | if (abs && (posix_clocks[which_clock].clock_get != | ||
1461 | posix_clocks[CLOCK_MONOTONIC].clock_get)) | ||
1462 | add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue); | ||
1463 | |||
1464 | do { | ||
1465 | t = *tsave; | ||
1466 | if (abs || !rq_time) { | ||
1467 | adjust_abs_time(&posix_clocks[which_clock], &t, abs, | ||
1468 | &rq_time, &dum); | ||
1469 | } | ||
1470 | |||
1471 | left = rq_time - get_jiffies_64(); | ||
1472 | if (left >= (s64)MAX_JIFFY_OFFSET) | ||
1473 | left = (s64)MAX_JIFFY_OFFSET; | ||
1474 | if (left < (s64)0) | ||
1475 | break; | ||
1476 | |||
1477 | schedule_timeout_interruptible(left); | ||
1478 | |||
1479 | left = rq_time - get_jiffies_64(); | ||
1480 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); | ||
1481 | |||
1482 | if (abs_wqueue.task_list.next) | ||
1483 | finish_wait(&nanosleep_abs_wqueue, &abs_wqueue); | ||
1484 | |||
1485 | if (left > (s64)0) { | ||
1486 | |||
1487 | /* | ||
1488 | * Always restart abs calls from scratch to pick up any | ||
1489 | * clock shifting that happened while we are away. | ||
1490 | */ | ||
1491 | if (abs) | ||
1492 | return -ERESTARTNOHAND; | ||
1493 | |||
1494 | left *= TICK_NSEC; | ||
1495 | tsave->tv_sec = div_long_long_rem(left, | ||
1496 | NSEC_PER_SEC, | ||
1497 | &tsave->tv_nsec); | ||
1498 | /* | ||
1499 | * Restart works by saving the time remaing in | ||
1500 | * arg2 & 3 (it is 64-bits of jiffies). The other | ||
1501 | * info we need is the clock_id (saved in arg0). | ||
1502 | * The sys_call interface needs the users | ||
1503 | * timespec return address which _it_ saves in arg1. | ||
1504 | * Since we have cast the nanosleep call to a clock_nanosleep | ||
1505 | * both can be restarted with the same code. | ||
1506 | */ | ||
1507 | restart_block->fn = clock_nanosleep_restart; | ||
1508 | restart_block->arg0 = which_clock; | ||
1509 | /* | ||
1510 | * Caller sets arg1 | ||
1511 | */ | ||
1512 | restart_block->arg2 = rq_time & 0xffffffffLL; | ||
1513 | restart_block->arg3 = rq_time >> 32; | ||
1514 | |||
1515 | return -ERESTART_RESTARTBLOCK; | ||
1516 | } | ||
1517 | |||
1518 | return 0; | ||
1519 | } | ||
1520 | /* | ||
1521 | * This will restart clock_nanosleep. | ||
1522 | */ | ||
1523 | long | ||
1524 | clock_nanosleep_restart(struct restart_block *restart_block) | ||
1525 | { | ||
1526 | struct timespec t; | ||
1527 | int ret = common_nsleep(restart_block->arg0, 0, &t); | ||
1528 | |||
1529 | if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 && | ||
1530 | copy_to_user((struct timespec __user *)(restart_block->arg1), &t, | ||
1531 | sizeof (t))) | ||
1532 | return -EFAULT; | ||
1533 | return ret; | ||
1534 | } | 985 | } |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5ec248cb7f4a..9fd8d4f03595 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -38,7 +38,7 @@ config PM_DEBUG | |||
38 | 38 | ||
39 | config SOFTWARE_SUSPEND | 39 | config SOFTWARE_SUSPEND |
40 | bool "Software Suspend" | 40 | bool "Software Suspend" |
41 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP) | 41 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) |
42 | ---help--- | 42 | ---help--- |
43 | Enable the possibility of suspending the machine. | 43 | Enable the possibility of suspending the machine. |
44 | It doesn't need APM. | 44 | It doesn't need APM. |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 027322a564f4..e03d85e55291 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -24,10 +24,11 @@ | |||
24 | 24 | ||
25 | extern suspend_disk_method_t pm_disk_mode; | 25 | extern suspend_disk_method_t pm_disk_mode; |
26 | 26 | ||
27 | extern int swsusp_shrink_memory(void); | ||
27 | extern int swsusp_suspend(void); | 28 | extern int swsusp_suspend(void); |
28 | extern int swsusp_write(void); | 29 | extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); |
29 | extern int swsusp_check(void); | 30 | extern int swsusp_check(void); |
30 | extern int swsusp_read(void); | 31 | extern int swsusp_read(struct pbe **pblist_ptr); |
31 | extern void swsusp_close(void); | 32 | extern void swsusp_close(void); |
32 | extern int swsusp_resume(void); | 33 | extern int swsusp_resume(void); |
33 | 34 | ||
@@ -52,7 +53,7 @@ static void power_down(suspend_disk_method_t mode) | |||
52 | 53 | ||
53 | switch(mode) { | 54 | switch(mode) { |
54 | case PM_DISK_PLATFORM: | 55 | case PM_DISK_PLATFORM: |
55 | kernel_power_off_prepare(); | 56 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); |
56 | error = pm_ops->enter(PM_SUSPEND_DISK); | 57 | error = pm_ops->enter(PM_SUSPEND_DISK); |
57 | break; | 58 | break; |
58 | case PM_DISK_SHUTDOWN: | 59 | case PM_DISK_SHUTDOWN: |
@@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode) | |||
73 | static int in_suspend __nosavedata = 0; | 74 | static int in_suspend __nosavedata = 0; |
74 | 75 | ||
75 | 76 | ||
76 | /** | ||
77 | * free_some_memory - Try to free as much memory as possible | ||
78 | * | ||
79 | * ... but do not OOM-kill anyone | ||
80 | * | ||
81 | * Notice: all userland should be stopped at this point, or | ||
82 | * livelock is possible. | ||
83 | */ | ||
84 | |||
85 | static void free_some_memory(void) | ||
86 | { | ||
87 | unsigned int i = 0; | ||
88 | unsigned int tmp; | ||
89 | unsigned long pages = 0; | ||
90 | char *p = "-\\|/"; | ||
91 | |||
92 | printk("Freeing memory... "); | ||
93 | while ((tmp = shrink_all_memory(10000))) { | ||
94 | pages += tmp; | ||
95 | printk("\b%c", p[i++ % 4]); | ||
96 | } | ||
97 | printk("\bdone (%li pages freed)\n", pages); | ||
98 | } | ||
99 | |||
100 | |||
101 | static inline void platform_finish(void) | 77 | static inline void platform_finish(void) |
102 | { | 78 | { |
103 | if (pm_disk_mode == PM_DISK_PLATFORM) { | 79 | if (pm_disk_mode == PM_DISK_PLATFORM) { |
@@ -119,16 +95,9 @@ static int prepare_processes(void) | |||
119 | goto thaw; | 95 | goto thaw; |
120 | } | 96 | } |
121 | 97 | ||
122 | if (pm_disk_mode == PM_DISK_PLATFORM) { | ||
123 | if (pm_ops && pm_ops->prepare) { | ||
124 | if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) | ||
125 | goto thaw; | ||
126 | } | ||
127 | } | ||
128 | |||
129 | /* Free memory before shutting down devices. */ | 98 | /* Free memory before shutting down devices. */ |
130 | free_some_memory(); | 99 | if (!(error = swsusp_shrink_memory())) |
131 | return 0; | 100 | return 0; |
132 | thaw: | 101 | thaw: |
133 | thaw_processes(); | 102 | thaw_processes(); |
134 | enable_nonboot_cpus(); | 103 | enable_nonboot_cpus(); |
@@ -176,7 +145,7 @@ int pm_suspend_disk(void) | |||
176 | if (in_suspend) { | 145 | if (in_suspend) { |
177 | device_resume(); | 146 | device_resume(); |
178 | pr_debug("PM: writing image.\n"); | 147 | pr_debug("PM: writing image.\n"); |
179 | error = swsusp_write(); | 148 | error = swsusp_write(pagedir_nosave, nr_copy_pages); |
180 | if (!error) | 149 | if (!error) |
181 | power_down(pm_disk_mode); | 150 | power_down(pm_disk_mode); |
182 | else { | 151 | else { |
@@ -247,7 +216,7 @@ static int software_resume(void) | |||
247 | 216 | ||
248 | pr_debug("PM: Reading swsusp image.\n"); | 217 | pr_debug("PM: Reading swsusp image.\n"); |
249 | 218 | ||
250 | if ((error = swsusp_read())) { | 219 | if ((error = swsusp_read(&pagedir_nosave))) { |
251 | swsusp_free(); | 220 | swsusp_free(); |
252 | goto Thaw; | 221 | goto Thaw; |
253 | } | 222 | } |
@@ -363,37 +332,55 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf) | |||
363 | MINOR(swsusp_resume_device)); | 332 | MINOR(swsusp_resume_device)); |
364 | } | 333 | } |
365 | 334 | ||
366 | static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) | 335 | static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) |
367 | { | 336 | { |
368 | int len; | ||
369 | char *p; | ||
370 | unsigned int maj, min; | 337 | unsigned int maj, min; |
371 | int error = -EINVAL; | ||
372 | dev_t res; | 338 | dev_t res; |
339 | int ret = -EINVAL; | ||
373 | 340 | ||
374 | p = memchr(buf, '\n', n); | 341 | if (sscanf(buf, "%u:%u", &maj, &min) != 2) |
375 | len = p ? p - buf : n; | 342 | goto out; |
376 | 343 | ||
377 | if (sscanf(buf, "%u:%u", &maj, &min) == 2) { | 344 | res = MKDEV(maj,min); |
378 | res = MKDEV(maj,min); | 345 | if (maj != MAJOR(res) || min != MINOR(res)) |
379 | if (maj == MAJOR(res) && min == MINOR(res)) { | 346 | goto out; |
380 | down(&pm_sem); | ||
381 | swsusp_resume_device = res; | ||
382 | up(&pm_sem); | ||
383 | printk("Attempting manual resume\n"); | ||
384 | noresume = 0; | ||
385 | software_resume(); | ||
386 | } | ||
387 | } | ||
388 | 347 | ||
389 | return error >= 0 ? n : error; | 348 | down(&pm_sem); |
349 | swsusp_resume_device = res; | ||
350 | up(&pm_sem); | ||
351 | printk("Attempting manual resume\n"); | ||
352 | noresume = 0; | ||
353 | software_resume(); | ||
354 | ret = n; | ||
355 | out: | ||
356 | return ret; | ||
390 | } | 357 | } |
391 | 358 | ||
392 | power_attr(resume); | 359 | power_attr(resume); |
393 | 360 | ||
361 | static ssize_t image_size_show(struct subsystem * subsys, char *buf) | ||
362 | { | ||
363 | return sprintf(buf, "%u\n", image_size); | ||
364 | } | ||
365 | |||
366 | static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) | ||
367 | { | ||
368 | unsigned int size; | ||
369 | |||
370 | if (sscanf(buf, "%u", &size) == 1) { | ||
371 | image_size = size; | ||
372 | return n; | ||
373 | } | ||
374 | |||
375 | return -EINVAL; | ||
376 | } | ||
377 | |||
378 | power_attr(image_size); | ||
379 | |||
394 | static struct attribute * g[] = { | 380 | static struct attribute * g[] = { |
395 | &disk_attr.attr, | 381 | &disk_attr.attr, |
396 | &resume_attr.attr, | 382 | &resume_attr.attr, |
383 | &image_size_attr.attr, | ||
397 | NULL, | 384 | NULL, |
398 | }; | 385 | }; |
399 | 386 | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index d253f3ae2fa5..9cb235cba4a9 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -133,10 +133,10 @@ static int suspend_enter(suspend_state_t state) | |||
133 | static void suspend_finish(suspend_state_t state) | 133 | static void suspend_finish(suspend_state_t state) |
134 | { | 134 | { |
135 | device_resume(); | 135 | device_resume(); |
136 | if (pm_ops && pm_ops->finish) | ||
137 | pm_ops->finish(state); | ||
138 | thaw_processes(); | 136 | thaw_processes(); |
139 | enable_nonboot_cpus(); | 137 | enable_nonboot_cpus(); |
138 | if (pm_ops && pm_ops->finish) | ||
139 | pm_ops->finish(state); | ||
140 | pm_restore_console(); | 140 | pm_restore_console(); |
141 | } | 141 | } |
142 | 142 | ||
diff --git a/kernel/power/power.h b/kernel/power/power.h index 6c042b5ee14b..7e8492fd1423 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -9,19 +9,13 @@ | |||
9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | 9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) |
10 | #endif | 10 | #endif |
11 | 11 | ||
12 | #define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ | ||
13 | - 4 - 3*sizeof(unsigned long) - sizeof(int) \ | ||
14 | - sizeof(void *)) / sizeof(swp_entry_t)) | ||
15 | |||
16 | struct swsusp_info { | 12 | struct swsusp_info { |
17 | struct new_utsname uts; | 13 | struct new_utsname uts; |
18 | u32 version_code; | 14 | u32 version_code; |
19 | unsigned long num_physpages; | 15 | unsigned long num_physpages; |
20 | int cpus; | 16 | int cpus; |
21 | unsigned long image_pages; | 17 | unsigned long image_pages; |
22 | unsigned long pagedir_pages; | 18 | unsigned long pages; |
23 | suspend_pagedir_t * suspend_pagedir; | ||
24 | swp_entry_t pagedir[MAX_PBES]; | ||
25 | } __attribute__((aligned(PAGE_SIZE))); | 19 | } __attribute__((aligned(PAGE_SIZE))); |
26 | 20 | ||
27 | 21 | ||
@@ -48,25 +42,27 @@ static struct subsys_attribute _name##_attr = { \ | |||
48 | 42 | ||
49 | extern struct subsystem power_subsys; | 43 | extern struct subsystem power_subsys; |
50 | 44 | ||
51 | extern int freeze_processes(void); | ||
52 | extern void thaw_processes(void); | ||
53 | |||
54 | extern int pm_prepare_console(void); | 45 | extern int pm_prepare_console(void); |
55 | extern void pm_restore_console(void); | 46 | extern void pm_restore_console(void); |
56 | 47 | ||
57 | |||
58 | /* References to section boundaries */ | 48 | /* References to section boundaries */ |
59 | extern const void __nosave_begin, __nosave_end; | 49 | extern const void __nosave_begin, __nosave_end; |
60 | 50 | ||
61 | extern unsigned int nr_copy_pages; | 51 | extern unsigned int nr_copy_pages; |
62 | extern suspend_pagedir_t *pagedir_nosave; | 52 | extern struct pbe *pagedir_nosave; |
63 | extern suspend_pagedir_t *pagedir_save; | 53 | |
54 | /* Preferred image size in MB (default 500) */ | ||
55 | extern unsigned int image_size; | ||
64 | 56 | ||
65 | extern asmlinkage int swsusp_arch_suspend(void); | 57 | extern asmlinkage int swsusp_arch_suspend(void); |
66 | extern asmlinkage int swsusp_arch_resume(void); | 58 | extern asmlinkage int swsusp_arch_resume(void); |
67 | 59 | ||
60 | extern unsigned int count_data_pages(void); | ||
68 | extern void free_pagedir(struct pbe *pblist); | 61 | extern void free_pagedir(struct pbe *pblist); |
62 | extern void release_eaten_pages(void); | ||
69 | extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); | 63 | extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); |
70 | extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); | ||
71 | extern void swsusp_free(void); | 64 | extern void swsusp_free(void); |
72 | extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); | 65 | extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); |
66 | extern unsigned int snapshot_nr_pages(void); | ||
67 | extern struct pbe *snapshot_pblist(void); | ||
68 | extern void snapshot_pblist_set(struct pbe *pblist); | ||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4a6dbcefd378..41f66365f0d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -33,7 +33,35 @@ | |||
33 | 33 | ||
34 | #include "power.h" | 34 | #include "power.h" |
35 | 35 | ||
36 | struct pbe *pagedir_nosave; | ||
37 | unsigned int nr_copy_pages; | ||
38 | |||
36 | #ifdef CONFIG_HIGHMEM | 39 | #ifdef CONFIG_HIGHMEM |
40 | unsigned int count_highmem_pages(void) | ||
41 | { | ||
42 | struct zone *zone; | ||
43 | unsigned long zone_pfn; | ||
44 | unsigned int n = 0; | ||
45 | |||
46 | for_each_zone (zone) | ||
47 | if (is_highmem(zone)) { | ||
48 | mark_free_pages(zone); | ||
49 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { | ||
50 | struct page *page; | ||
51 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
52 | if (!pfn_valid(pfn)) | ||
53 | continue; | ||
54 | page = pfn_to_page(pfn); | ||
55 | if (PageReserved(page)) | ||
56 | continue; | ||
57 | if (PageNosaveFree(page)) | ||
58 | continue; | ||
59 | n++; | ||
60 | } | ||
61 | } | ||
62 | return n; | ||
63 | } | ||
64 | |||
37 | struct highmem_page { | 65 | struct highmem_page { |
38 | char *data; | 66 | char *data; |
39 | struct page *page; | 67 | struct page *page; |
@@ -149,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) | |||
149 | BUG_ON(PageReserved(page) && PageNosave(page)); | 177 | BUG_ON(PageReserved(page) && PageNosave(page)); |
150 | if (PageNosave(page)) | 178 | if (PageNosave(page)) |
151 | return 0; | 179 | return 0; |
152 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | 180 | if (PageReserved(page) && pfn_is_nosave(pfn)) |
153 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
154 | return 0; | 181 | return 0; |
155 | } | ||
156 | if (PageNosaveFree(page)) | 182 | if (PageNosaveFree(page)) |
157 | return 0; | 183 | return 0; |
158 | 184 | ||
159 | return 1; | 185 | return 1; |
160 | } | 186 | } |
161 | 187 | ||
162 | static unsigned count_data_pages(void) | 188 | unsigned int count_data_pages(void) |
163 | { | 189 | { |
164 | struct zone *zone; | 190 | struct zone *zone; |
165 | unsigned long zone_pfn; | 191 | unsigned long zone_pfn; |
@@ -244,7 +270,7 @@ static inline void fill_pb_page(struct pbe *pbpage) | |||
244 | * of memory pages allocated with alloc_pagedir() | 270 | * of memory pages allocated with alloc_pagedir() |
245 | */ | 271 | */ |
246 | 272 | ||
247 | void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) | 273 | static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) |
248 | { | 274 | { |
249 | struct pbe *pbpage, *p; | 275 | struct pbe *pbpage, *p; |
250 | unsigned int num = PBES_PER_PAGE; | 276 | unsigned int num = PBES_PER_PAGE; |
@@ -261,7 +287,35 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) | |||
261 | p->next = p + 1; | 287 | p->next = p + 1; |
262 | p->next = NULL; | 288 | p->next = NULL; |
263 | } | 289 | } |
264 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | 290 | } |
291 | |||
292 | /** | ||
293 | * On resume it is necessary to trace and eventually free the unsafe | ||
294 | * pages that have been allocated, because they are needed for I/O | ||
295 | * (on x86-64 we likely will "eat" these pages once again while | ||
296 | * creating the temporary page translation tables) | ||
297 | */ | ||
298 | |||
299 | struct eaten_page { | ||
300 | struct eaten_page *next; | ||
301 | char padding[PAGE_SIZE - sizeof(void *)]; | ||
302 | }; | ||
303 | |||
304 | static struct eaten_page *eaten_pages = NULL; | ||
305 | |||
306 | void release_eaten_pages(void) | ||
307 | { | ||
308 | struct eaten_page *p, *q; | ||
309 | |||
310 | p = eaten_pages; | ||
311 | while (p) { | ||
312 | q = p->next; | ||
313 | /* We don't want swsusp_free() to free this page again */ | ||
314 | ClearPageNosave(virt_to_page(p)); | ||
315 | free_page((unsigned long)p); | ||
316 | p = q; | ||
317 | } | ||
318 | eaten_pages = NULL; | ||
265 | } | 319 | } |
266 | 320 | ||
267 | /** | 321 | /** |
@@ -282,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | |||
282 | if (safe_needed) | 336 | if (safe_needed) |
283 | do { | 337 | do { |
284 | res = (void *)get_zeroed_page(gfp_mask); | 338 | res = (void *)get_zeroed_page(gfp_mask); |
285 | if (res && PageNosaveFree(virt_to_page(res))) | 339 | if (res && PageNosaveFree(virt_to_page(res))) { |
286 | /* This is for swsusp_free() */ | 340 | /* This is for swsusp_free() */ |
287 | SetPageNosave(virt_to_page(res)); | 341 | SetPageNosave(virt_to_page(res)); |
342 | ((struct eaten_page *)res)->next = eaten_pages; | ||
343 | eaten_pages = res; | ||
344 | } | ||
288 | } while (res && PageNosaveFree(virt_to_page(res))); | 345 | } while (res && PageNosaveFree(virt_to_page(res))); |
289 | else | 346 | else |
290 | res = (void *)get_zeroed_page(gfp_mask); | 347 | res = (void *)get_zeroed_page(gfp_mask); |
@@ -332,7 +389,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed | |||
332 | if (!pbe) { /* get_zeroed_page() failed */ | 389 | if (!pbe) { /* get_zeroed_page() failed */ |
333 | free_pagedir(pblist); | 390 | free_pagedir(pblist); |
334 | pblist = NULL; | 391 | pblist = NULL; |
335 | } | 392 | } else |
393 | create_pbe_list(pblist, nr_pages); | ||
336 | return pblist; | 394 | return pblist; |
337 | } | 395 | } |
338 | 396 | ||
@@ -370,8 +428,14 @@ void swsusp_free(void) | |||
370 | 428 | ||
371 | static int enough_free_mem(unsigned int nr_pages) | 429 | static int enough_free_mem(unsigned int nr_pages) |
372 | { | 430 | { |
373 | pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); | 431 | struct zone *zone; |
374 | return nr_free_pages() > (nr_pages + PAGES_FOR_IO + | 432 | unsigned int n = 0; |
433 | |||
434 | for_each_zone (zone) | ||
435 | if (!is_highmem(zone)) | ||
436 | n += zone->free_pages; | ||
437 | pr_debug("swsusp: available memory: %u pages\n", n); | ||
438 | return n > (nr_pages + PAGES_FOR_IO + | ||
375 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | 439 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); |
376 | } | 440 | } |
377 | 441 | ||
@@ -395,7 +459,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages) | |||
395 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | 459 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); |
396 | return NULL; | 460 | return NULL; |
397 | } | 461 | } |
398 | create_pbe_list(pblist, nr_pages); | ||
399 | 462 | ||
400 | if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { | 463 | if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { |
401 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | 464 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); |
@@ -421,10 +484,6 @@ asmlinkage int swsusp_save(void) | |||
421 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, | 484 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, |
422 | PAGES_FOR_IO, nr_free_pages()); | 485 | PAGES_FOR_IO, nr_free_pages()); |
423 | 486 | ||
424 | /* This is needed because of the fixed size of swsusp_info */ | ||
425 | if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) | ||
426 | return -ENOSPC; | ||
427 | |||
428 | if (!enough_free_mem(nr_pages)) { | 487 | if (!enough_free_mem(nr_pages)) { |
429 | printk(KERN_ERR "swsusp: Not enough free memory\n"); | 488 | printk(KERN_ERR "swsusp: Not enough free memory\n"); |
430 | return -ENOMEM; | 489 | return -ENOMEM; |
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c05f46e7348f..55a18d26abed 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -30,8 +30,8 @@ | |||
30 | * Alex Badea <vampire@go.ro>: | 30 | * Alex Badea <vampire@go.ro>: |
31 | * Fixed runaway init | 31 | * Fixed runaway init |
32 | * | 32 | * |
33 | * Andreas Steinmetz <ast@domdv.de>: | 33 | * Rafael J. Wysocki <rjw@sisk.pl> |
34 | * Added encrypted suspend option | 34 | * Added the swap map data structure and reworked the handling of swap |
35 | * | 35 | * |
36 | * More state savers are welcome. Especially for the scsi layer... | 36 | * More state savers are welcome. Especially for the scsi layer... |
37 | * | 37 | * |
@@ -67,44 +67,33 @@ | |||
67 | #include <asm/tlbflush.h> | 67 | #include <asm/tlbflush.h> |
68 | #include <asm/io.h> | 68 | #include <asm/io.h> |
69 | 69 | ||
70 | #include <linux/random.h> | ||
71 | #include <linux/crypto.h> | ||
72 | #include <asm/scatterlist.h> | ||
73 | |||
74 | #include "power.h" | 70 | #include "power.h" |
75 | 71 | ||
72 | /* | ||
73 | * Preferred image size in MB (tunable via /sys/power/image_size). | ||
74 | * When it is set to N, swsusp will do its best to ensure the image | ||
75 | * size will not exceed N MB, but if that is impossible, it will | ||
76 | * try to create the smallest image possible. | ||
77 | */ | ||
78 | unsigned int image_size = 500; | ||
79 | |||
76 | #ifdef CONFIG_HIGHMEM | 80 | #ifdef CONFIG_HIGHMEM |
81 | unsigned int count_highmem_pages(void); | ||
77 | int save_highmem(void); | 82 | int save_highmem(void); |
78 | int restore_highmem(void); | 83 | int restore_highmem(void); |
79 | #else | 84 | #else |
80 | static int save_highmem(void) { return 0; } | 85 | static int save_highmem(void) { return 0; } |
81 | static int restore_highmem(void) { return 0; } | 86 | static int restore_highmem(void) { return 0; } |
87 | static unsigned int count_highmem_pages(void) { return 0; } | ||
82 | #endif | 88 | #endif |
83 | 89 | ||
84 | #define CIPHER "aes" | ||
85 | #define MAXKEY 32 | ||
86 | #define MAXIV 32 | ||
87 | |||
88 | extern char resume_file[]; | 90 | extern char resume_file[]; |
89 | 91 | ||
90 | /* Local variables that should not be affected by save */ | ||
91 | unsigned int nr_copy_pages __nosavedata = 0; | ||
92 | |||
93 | /* Suspend pagedir is allocated before final copy, therefore it | ||
94 | must be freed after resume | ||
95 | |||
96 | Warning: this is even more evil than it seems. Pagedirs this file | ||
97 | talks about are completely different from page directories used by | ||
98 | MMU hardware. | ||
99 | */ | ||
100 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; | ||
101 | |||
102 | #define SWSUSP_SIG "S1SUSPEND" | 92 | #define SWSUSP_SIG "S1SUSPEND" |
103 | 93 | ||
104 | static struct swsusp_header { | 94 | static struct swsusp_header { |
105 | char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; | 95 | char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; |
106 | u8 key_iv[MAXKEY+MAXIV]; | 96 | swp_entry_t image; |
107 | swp_entry_t swsusp_info; | ||
108 | char orig_sig[10]; | 97 | char orig_sig[10]; |
109 | char sig[10]; | 98 | char sig[10]; |
110 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; | 99 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; |
@@ -115,140 +104,9 @@ static struct swsusp_info swsusp_info; | |||
115 | * Saving part... | 104 | * Saving part... |
116 | */ | 105 | */ |
117 | 106 | ||
118 | /* We memorize in swapfile_used what swap devices are used for suspension */ | 107 | static unsigned short root_swap = 0xffff; |
119 | #define SWAPFILE_UNUSED 0 | ||
120 | #define SWAPFILE_SUSPEND 1 /* This is the suspending device */ | ||
121 | #define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */ | ||
122 | |||
123 | static unsigned short swapfile_used[MAX_SWAPFILES]; | ||
124 | static unsigned short root_swap; | ||
125 | |||
126 | static int write_page(unsigned long addr, swp_entry_t *loc); | ||
127 | static int bio_read_page(pgoff_t page_off, void *page); | ||
128 | |||
129 | static u8 key_iv[MAXKEY+MAXIV]; | ||
130 | |||
131 | #ifdef CONFIG_SWSUSP_ENCRYPT | ||
132 | |||
133 | static int crypto_init(int mode, void **mem) | ||
134 | { | ||
135 | int error = 0; | ||
136 | int len; | ||
137 | char *modemsg; | ||
138 | struct crypto_tfm *tfm; | ||
139 | |||
140 | modemsg = mode ? "suspend not possible" : "resume not possible"; | ||
141 | |||
142 | tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); | ||
143 | if(!tfm) { | ||
144 | printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); | ||
145 | error = -EINVAL; | ||
146 | goto out; | ||
147 | } | ||
148 | |||
149 | if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { | ||
150 | printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); | ||
151 | error = -ENOKEY; | ||
152 | goto fail; | ||
153 | } | ||
154 | |||
155 | if (mode) | ||
156 | get_random_bytes(key_iv, MAXKEY+MAXIV); | ||
157 | |||
158 | len = crypto_tfm_alg_max_keysize(tfm); | ||
159 | if (len > MAXKEY) | ||
160 | len = MAXKEY; | ||
161 | |||
162 | if (crypto_cipher_setkey(tfm, key_iv, len)) { | ||
163 | printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); | ||
164 | error = -EKEYREJECTED; | ||
165 | goto fail; | ||
166 | } | ||
167 | |||
168 | len = crypto_tfm_alg_ivsize(tfm); | ||
169 | |||
170 | if (MAXIV < len) { | ||
171 | printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); | ||
172 | error = -EOVERFLOW; | ||
173 | goto fail; | ||
174 | } | ||
175 | |||
176 | crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); | ||
177 | |||
178 | *mem=(void *)tfm; | ||
179 | |||
180 | goto out; | ||
181 | |||
182 | fail: crypto_free_tfm(tfm); | ||
183 | out: return error; | ||
184 | } | ||
185 | |||
186 | static __inline__ void crypto_exit(void *mem) | ||
187 | { | ||
188 | crypto_free_tfm((struct crypto_tfm *)mem); | ||
189 | } | ||
190 | |||
191 | static __inline__ int crypto_write(struct pbe *p, void *mem) | ||
192 | { | ||
193 | int error = 0; | ||
194 | struct scatterlist src, dst; | ||
195 | |||
196 | src.page = virt_to_page(p->address); | ||
197 | src.offset = 0; | ||
198 | src.length = PAGE_SIZE; | ||
199 | dst.page = virt_to_page((void *)&swsusp_header); | ||
200 | dst.offset = 0; | ||
201 | dst.length = PAGE_SIZE; | ||
202 | |||
203 | error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, | ||
204 | PAGE_SIZE); | ||
205 | |||
206 | if (!error) | ||
207 | error = write_page((unsigned long)&swsusp_header, | ||
208 | &(p->swap_address)); | ||
209 | return error; | ||
210 | } | ||
211 | |||
212 | static __inline__ int crypto_read(struct pbe *p, void *mem) | ||
213 | { | ||
214 | int error = 0; | ||
215 | struct scatterlist src, dst; | ||
216 | |||
217 | error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); | ||
218 | if (!error) { | ||
219 | src.offset = 0; | ||
220 | src.length = PAGE_SIZE; | ||
221 | dst.offset = 0; | ||
222 | dst.length = PAGE_SIZE; | ||
223 | src.page = dst.page = virt_to_page((void *)p->address); | ||
224 | |||
225 | error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, | ||
226 | &src, PAGE_SIZE); | ||
227 | } | ||
228 | return error; | ||
229 | } | ||
230 | #else | ||
231 | static __inline__ int crypto_init(int mode, void *mem) | ||
232 | { | ||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | static __inline__ void crypto_exit(void *mem) | ||
237 | { | ||
238 | } | ||
239 | |||
240 | static __inline__ int crypto_write(struct pbe *p, void *mem) | ||
241 | { | ||
242 | return write_page(p->address, &(p->swap_address)); | ||
243 | } | ||
244 | 108 | ||
245 | static __inline__ int crypto_read(struct pbe *p, void *mem) | 109 | static int mark_swapfiles(swp_entry_t start) |
246 | { | ||
247 | return bio_read_page(swp_offset(p->swap_address), (void *)p->address); | ||
248 | } | ||
249 | #endif | ||
250 | |||
251 | static int mark_swapfiles(swp_entry_t prev) | ||
252 | { | 110 | { |
253 | int error; | 111 | int error; |
254 | 112 | ||
@@ -259,8 +117,7 @@ static int mark_swapfiles(swp_entry_t prev) | |||
259 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | 117 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { |
260 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | 118 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); |
261 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | 119 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); |
262 | memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); | 120 | swsusp_header.image = start; |
263 | swsusp_header.swsusp_info = prev; | ||
264 | error = rw_swap_page_sync(WRITE, | 121 | error = rw_swap_page_sync(WRITE, |
265 | swp_entry(root_swap, 0), | 122 | swp_entry(root_swap, 0), |
266 | virt_to_page((unsigned long) | 123 | virt_to_page((unsigned long) |
@@ -283,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev) | |||
283 | * devfs, since the resume code can only recognize the form /dev/hda4, | 140 | * devfs, since the resume code can only recognize the form /dev/hda4, |
284 | * but the suspend code would see the long name.) | 141 | * but the suspend code would see the long name.) |
285 | */ | 142 | */ |
286 | static int is_resume_device(const struct swap_info_struct *swap_info) | 143 | static inline int is_resume_device(const struct swap_info_struct *swap_info) |
287 | { | 144 | { |
288 | struct file *file = swap_info->swap_file; | 145 | struct file *file = swap_info->swap_file; |
289 | struct inode *inode = file->f_dentry->d_inode; | 146 | struct inode *inode = file->f_dentry->d_inode; |
@@ -294,54 +151,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info) | |||
294 | 151 | ||
295 | static int swsusp_swap_check(void) /* This is called before saving image */ | 152 | static int swsusp_swap_check(void) /* This is called before saving image */ |
296 | { | 153 | { |
297 | int i, len; | ||
298 | |||
299 | len=strlen(resume_file); | ||
300 | root_swap = 0xFFFF; | ||
301 | |||
302 | spin_lock(&swap_lock); | ||
303 | for (i=0; i<MAX_SWAPFILES; i++) { | ||
304 | if (!(swap_info[i].flags & SWP_WRITEOK)) { | ||
305 | swapfile_used[i]=SWAPFILE_UNUSED; | ||
306 | } else { | ||
307 | if (!len) { | ||
308 | printk(KERN_WARNING "resume= option should be used to set suspend device" ); | ||
309 | if (root_swap == 0xFFFF) { | ||
310 | swapfile_used[i] = SWAPFILE_SUSPEND; | ||
311 | root_swap = i; | ||
312 | } else | ||
313 | swapfile_used[i] = SWAPFILE_IGNORED; | ||
314 | } else { | ||
315 | /* we ignore all swap devices that are not the resume_file */ | ||
316 | if (is_resume_device(&swap_info[i])) { | ||
317 | swapfile_used[i] = SWAPFILE_SUSPEND; | ||
318 | root_swap = i; | ||
319 | } else { | ||
320 | swapfile_used[i] = SWAPFILE_IGNORED; | ||
321 | } | ||
322 | } | ||
323 | } | ||
324 | } | ||
325 | spin_unlock(&swap_lock); | ||
326 | return (root_swap != 0xffff) ? 0 : -ENODEV; | ||
327 | } | ||
328 | |||
329 | /** | ||
330 | * This is called after saving image so modification | ||
331 | * will be lost after resume... and that's what we want. | ||
332 | * we make the device unusable. A new call to | ||
333 | * lock_swapdevices can unlock the devices. | ||
334 | */ | ||
335 | static void lock_swapdevices(void) | ||
336 | { | ||
337 | int i; | 154 | int i; |
338 | 155 | ||
156 | if (!swsusp_resume_device) | ||
157 | return -ENODEV; | ||
339 | spin_lock(&swap_lock); | 158 | spin_lock(&swap_lock); |
340 | for (i = 0; i< MAX_SWAPFILES; i++) | 159 | for (i = 0; i < MAX_SWAPFILES; i++) { |
341 | if (swapfile_used[i] == SWAPFILE_IGNORED) { | 160 | if (!(swap_info[i].flags & SWP_WRITEOK)) |
342 | swap_info[i].flags ^= SWP_WRITEOK; | 161 | continue; |
162 | if (is_resume_device(swap_info + i)) { | ||
163 | spin_unlock(&swap_lock); | ||
164 | root_swap = i; | ||
165 | return 0; | ||
343 | } | 166 | } |
167 | } | ||
344 | spin_unlock(&swap_lock); | 168 | spin_unlock(&swap_lock); |
169 | return -ENODEV; | ||
345 | } | 170 | } |
346 | 171 | ||
347 | /** | 172 | /** |
@@ -359,72 +184,217 @@ static void lock_swapdevices(void) | |||
359 | static int write_page(unsigned long addr, swp_entry_t *loc) | 184 | static int write_page(unsigned long addr, swp_entry_t *loc) |
360 | { | 185 | { |
361 | swp_entry_t entry; | 186 | swp_entry_t entry; |
362 | int error = 0; | 187 | int error = -ENOSPC; |
363 | 188 | ||
364 | entry = get_swap_page(); | 189 | entry = get_swap_page_of_type(root_swap); |
365 | if (swp_offset(entry) && | 190 | if (swp_offset(entry)) { |
366 | swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { | 191 | error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); |
367 | error = rw_swap_page_sync(WRITE, entry, | 192 | if (!error || error == -EIO) |
368 | virt_to_page(addr)); | ||
369 | if (error == -EIO) | ||
370 | error = 0; | ||
371 | if (!error) | ||
372 | *loc = entry; | 193 | *loc = entry; |
373 | } else | 194 | } |
374 | error = -ENOSPC; | ||
375 | return error; | 195 | return error; |
376 | } | 196 | } |
377 | 197 | ||
378 | /** | 198 | /** |
379 | * data_free - Free the swap entries used by the saved image. | 199 | * Swap map-handling functions |
200 | * | ||
201 | * The swap map is a data structure used for keeping track of each page | ||
202 | * written to the swap. It consists of many swap_map_page structures | ||
203 | * that contain each an array of MAP_PAGE_SIZE swap entries. | ||
204 | * These structures are linked together with the help of either the | ||
205 | * .next (in memory) or the .next_swap (in swap) member. | ||
380 | * | 206 | * |
381 | * Walk the list of used swap entries and free each one. | 207 | * The swap map is created during suspend. At that time we need to keep |
382 | * This is only used for cleanup when suspend fails. | 208 | * it in memory, because we have to free all of the allocated swap |
209 | * entries if an error occurs. The memory needed is preallocated | ||
210 | * so that we know in advance if there's enough of it. | ||
211 | * | ||
212 | * The first swap_map_page structure is filled with the swap entries that | ||
213 | * correspond to the first MAP_PAGE_SIZE data pages written to swap and | ||
214 | * so on. After all of the data pages have been written, the order | |
215 | * of the swap_map_page structures in the map is reversed so that they | ||
216 | * can be read from swap in the original order. This causes the data | ||
217 | * pages to be loaded in exactly the same order in which they have been | ||
218 | * saved. | ||
219 | * | ||
220 | * During resume we only need to use one swap_map_page structure | ||
221 | * at a time, which means that we only need to use two memory pages for | ||
222 | * reading the image - one for reading the swap_map_page structures | ||
223 | * and the second for reading the data pages from swap. | ||
383 | */ | 224 | */ |
384 | static void data_free(void) | 225 | |
226 | #define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ | ||
227 | / sizeof(swp_entry_t)) | ||
228 | |||
229 | struct swap_map_page { | ||
230 | swp_entry_t entries[MAP_PAGE_SIZE]; | ||
231 | swp_entry_t next_swap; | ||
232 | struct swap_map_page *next; | ||
233 | }; | ||
234 | |||
235 | static inline void free_swap_map(struct swap_map_page *swap_map) | ||
385 | { | 236 | { |
386 | swp_entry_t entry; | 237 | struct swap_map_page *swp; |
387 | struct pbe *p; | ||
388 | 238 | ||
389 | for_each_pbe (p, pagedir_nosave) { | 239 | while (swap_map) { |
390 | entry = p->swap_address; | 240 | swp = swap_map->next; |
391 | if (entry.val) | 241 | free_page((unsigned long)swap_map); |
392 | swap_free(entry); | 242 | swap_map = swp; |
393 | else | ||
394 | break; | ||
395 | } | 243 | } |
396 | } | 244 | } |
397 | 245 | ||
246 | static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) | ||
247 | { | ||
248 | struct swap_map_page *swap_map, *swp; | ||
249 | unsigned n = 0; | ||
250 | |||
251 | if (!nr_pages) | ||
252 | return NULL; | ||
253 | |||
254 | pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); | ||
255 | swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | ||
256 | swp = swap_map; | ||
257 | for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { | ||
258 | swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | ||
259 | swp = swp->next; | ||
260 | if (!swp) { | ||
261 | free_swap_map(swap_map); | ||
262 | return NULL; | ||
263 | } | ||
264 | } | ||
265 | return swap_map; | ||
266 | } | ||
267 | |||
398 | /** | 268 | /** |
399 | * data_write - Write saved image to swap. | 269 | * reverse_swap_map - reverse the order of pages in the swap map |
400 | * | 270 | * @swap_map |
401 | * Walk the list of pages in the image and sync each one to swap. | ||
402 | */ | 271 | */ |
403 | static int data_write(void) | 272 | |
273 | static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) | ||
404 | { | 274 | { |
405 | int error = 0, i = 0; | 275 | struct swap_map_page *prev, *next; |
406 | unsigned int mod = nr_copy_pages / 100; | 276 | |
407 | struct pbe *p; | 277 | prev = NULL; |
408 | void *tfm; | 278 | while (swap_map) { |
279 | next = swap_map->next; | ||
280 | swap_map->next = prev; | ||
281 | prev = swap_map; | ||
282 | swap_map = next; | ||
283 | } | ||
284 | return prev; | ||
285 | } | ||
409 | 286 | ||
410 | if ((error = crypto_init(1, &tfm))) | 287 | /** |
411 | return error; | 288 | * free_swap_map_entries - free the swap entries allocated to store |
289 | * the swap map @swap_map (this is only called in case of an error) | ||
290 | */ | ||
291 | static inline void free_swap_map_entries(struct swap_map_page *swap_map) | ||
292 | { | ||
293 | while (swap_map) { | ||
294 | if (swap_map->next_swap.val) | ||
295 | swap_free(swap_map->next_swap); | ||
296 | swap_map = swap_map->next; | ||
297 | } | ||
298 | } | ||
412 | 299 | ||
413 | if (!mod) | 300 | /** |
414 | mod = 1; | 301 | * save_swap_map - save the swap map used for tracing the data pages |
302 | * stored in the swap | ||
303 | */ | ||
415 | 304 | ||
416 | printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); | 305 | static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) |
417 | for_each_pbe (p, pagedir_nosave) { | 306 | { |
418 | if (!(i%mod)) | 307 | swp_entry_t entry = (swp_entry_t){0}; |
419 | printk( "\b\b\b\b%3d%%", i / mod ); | 308 | int error; |
420 | if ((error = crypto_write(p, tfm))) { | 309 | |
421 | crypto_exit(tfm); | 310 | while (swap_map) { |
311 | swap_map->next_swap = entry; | ||
312 | if ((error = write_page((unsigned long)swap_map, &entry))) | ||
422 | return error; | 313 | return error; |
423 | } | 314 | swap_map = swap_map->next; |
424 | i++; | ||
425 | } | 315 | } |
426 | printk("\b\b\b\bdone\n"); | 316 | *start = entry; |
427 | crypto_exit(tfm); | 317 | return 0; |
318 | } | ||
319 | |||
320 | /** | ||
321 | * free_image_entries - free the swap entries allocated to store | ||
322 | * the image data pages (this is only called in case of an error) | ||
323 | */ | ||
324 | |||
325 | static inline void free_image_entries(struct swap_map_page *swp) | ||
326 | { | ||
327 | unsigned k; | ||
328 | |||
329 | while (swp) { | ||
330 | for (k = 0; k < MAP_PAGE_SIZE; k++) | ||
331 | if (swp->entries[k].val) | ||
332 | swap_free(swp->entries[k]); | ||
333 | swp = swp->next; | ||
334 | } | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * The swap_map_handle structure is used for handling the swap map in | ||
339 | * a file-like way | |
340 | */ | ||
341 | |||
342 | struct swap_map_handle { | ||
343 | struct swap_map_page *cur; | ||
344 | unsigned int k; | ||
345 | }; | ||
346 | |||
347 | static inline void init_swap_map_handle(struct swap_map_handle *handle, | ||
348 | struct swap_map_page *map) | ||
349 | { | ||
350 | handle->cur = map; | ||
351 | handle->k = 0; | ||
352 | } | ||
353 | |||
354 | static inline int swap_map_write_page(struct swap_map_handle *handle, | ||
355 | unsigned long addr) | ||
356 | { | ||
357 | int error; | ||
358 | |||
359 | error = write_page(addr, handle->cur->entries + handle->k); | ||
360 | if (error) | ||
361 | return error; | ||
362 | if (++handle->k >= MAP_PAGE_SIZE) { | ||
363 | handle->cur = handle->cur->next; | ||
364 | handle->k = 0; | ||
365 | } | ||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | /** | ||
370 | * save_image_data - save the data pages pointed to by the PBEs | ||
371 | * from the list @pblist using the swap map handle @handle | ||
372 | * (assume there are @nr_pages data pages to save) | ||
373 | */ | ||
374 | |||
375 | static int save_image_data(struct pbe *pblist, | ||
376 | struct swap_map_handle *handle, | ||
377 | unsigned int nr_pages) | ||
378 | { | ||
379 | unsigned int m; | ||
380 | struct pbe *p; | ||
381 | int error = 0; | ||
382 | |||
383 | printk("Saving image data pages (%u pages) ... ", nr_pages); | ||
384 | m = nr_pages / 100; | ||
385 | if (!m) | ||
386 | m = 1; | ||
387 | nr_pages = 0; | ||
388 | for_each_pbe (p, pblist) { | ||
389 | error = swap_map_write_page(handle, p->address); | ||
390 | if (error) | ||
391 | break; | ||
392 | if (!(nr_pages % m)) | ||
393 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
394 | nr_pages++; | ||
395 | } | ||
396 | if (!error) | ||
397 | printk("\b\b\b\bdone\n"); | ||
428 | return error; | 398 | return error; |
429 | } | 399 | } |
430 | 400 | ||
@@ -440,70 +410,70 @@ static void dump_info(void) | |||
440 | pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); | 410 | pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); |
441 | pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); | 411 | pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); |
442 | pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); | 412 | pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); |
443 | pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); | 413 | pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); |
444 | } | 414 | } |
445 | 415 | ||
446 | static void init_header(void) | 416 | static void init_header(unsigned int nr_pages) |
447 | { | 417 | { |
448 | memset(&swsusp_info, 0, sizeof(swsusp_info)); | 418 | memset(&swsusp_info, 0, sizeof(swsusp_info)); |
449 | swsusp_info.version_code = LINUX_VERSION_CODE; | 419 | swsusp_info.version_code = LINUX_VERSION_CODE; |
450 | swsusp_info.num_physpages = num_physpages; | 420 | swsusp_info.num_physpages = num_physpages; |
451 | memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); | 421 | memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); |
452 | 422 | ||
453 | swsusp_info.suspend_pagedir = pagedir_nosave; | ||
454 | swsusp_info.cpus = num_online_cpus(); | 423 | swsusp_info.cpus = num_online_cpus(); |
455 | swsusp_info.image_pages = nr_copy_pages; | 424 | swsusp_info.image_pages = nr_pages; |
456 | } | 425 | swsusp_info.pages = nr_pages + |
457 | 426 | ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; | |
458 | static int close_swap(void) | ||
459 | { | ||
460 | swp_entry_t entry; | ||
461 | int error; | ||
462 | |||
463 | dump_info(); | ||
464 | error = write_page((unsigned long)&swsusp_info, &entry); | ||
465 | if (!error) { | ||
466 | printk( "S" ); | ||
467 | error = mark_swapfiles(entry); | ||
468 | printk( "|\n" ); | ||
469 | } | ||
470 | return error; | ||
471 | } | 427 | } |
472 | 428 | ||
473 | /** | 429 | /** |
474 | * free_pagedir_entries - Free pages used by the page directory. | 430 | * pack_orig_addresses - the .orig_address fields of the PBEs from the |
475 | * | 431 | * list starting at @pbe are stored in the array @buf[] (1 page) |
476 | * This is used during suspend for error recovery. | ||
477 | */ | 432 | */ |
478 | 433 | ||
479 | static void free_pagedir_entries(void) | 434 | static inline struct pbe *pack_orig_addresses(unsigned long *buf, |
435 | struct pbe *pbe) | ||
480 | { | 436 | { |
481 | int i; | 437 | int j; |
482 | 438 | ||
483 | for (i = 0; i < swsusp_info.pagedir_pages; i++) | 439 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { |
484 | swap_free(swsusp_info.pagedir[i]); | 440 | buf[j] = pbe->orig_address; |
441 | pbe = pbe->next; | ||
442 | } | ||
443 | if (!pbe) | ||
444 | for (; j < PAGE_SIZE / sizeof(long); j++) | ||
445 | buf[j] = 0; | ||
446 | return pbe; | ||
485 | } | 447 | } |
486 | 448 | ||
487 | |||
488 | /** | 449 | /** |
489 | * write_pagedir - Write the array of pages holding the page directory. | 450 | * save_image_metadata - save the .orig_address fields of the PBEs |
490 | * @last: Last swap entry we write (needed for header). | 451 | * from the list @pblist using the swap map handle @handle |
491 | */ | 452 | */ |
492 | 453 | ||
493 | static int write_pagedir(void) | 454 | static int save_image_metadata(struct pbe *pblist, |
455 | struct swap_map_handle *handle) | ||
494 | { | 456 | { |
495 | int error = 0; | 457 | unsigned long *buf; |
496 | unsigned int n = 0; | 458 | unsigned int n = 0; |
497 | struct pbe *pbe; | 459 | struct pbe *p; |
460 | int error = 0; | ||
498 | 461 | ||
499 | printk( "Writing pagedir..."); | 462 | printk("Saving image metadata ... "); |
500 | for_each_pb_page (pbe, pagedir_nosave) { | 463 | buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); |
501 | if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) | 464 | if (!buf) |
502 | return error; | 465 | return -ENOMEM; |
466 | p = pblist; | ||
467 | while (p) { | ||
468 | p = pack_orig_addresses(buf, p); | ||
469 | error = swap_map_write_page(handle, (unsigned long)buf); | ||
470 | if (error) | ||
471 | break; | ||
472 | n++; | ||
503 | } | 473 | } |
504 | 474 | free_page((unsigned long)buf); | |
505 | swsusp_info.pagedir_pages = n; | 475 | if (!error) |
506 | printk("done (%u pages)\n", n); | 476 | printk("done (%u pages saved)\n", n); |
507 | return error; | 477 | return error; |
508 | } | 478 | } |
509 | 479 | ||
@@ -511,75 +481,125 @@ static int write_pagedir(void) | |||
511 | * enough_swap - Make sure we have enough swap to save the image. | 481 | * enough_swap - Make sure we have enough swap to save the image. |
512 | * | 482 | * |
513 | * Returns TRUE or FALSE after checking the total amount of swap | 483 | * Returns TRUE or FALSE after checking the total amount of swap |
514 | * space avaiable. | 484 | * space avaiable from the resume partition. |
515 | * | ||
516 | * FIXME: si_swapinfo(&i) returns all swap devices information. | ||
517 | * We should only consider resume_device. | ||
518 | */ | 485 | */ |
519 | 486 | ||
520 | static int enough_swap(unsigned int nr_pages) | 487 | static int enough_swap(unsigned int nr_pages) |
521 | { | 488 | { |
522 | struct sysinfo i; | 489 | unsigned int free_swap = swap_info[root_swap].pages - |
490 | swap_info[root_swap].inuse_pages; | ||
523 | 491 | ||
524 | si_swapinfo(&i); | 492 | pr_debug("swsusp: free swap pages: %u\n", free_swap); |
525 | pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); | 493 | return free_swap > (nr_pages + PAGES_FOR_IO + |
526 | return i.freeswap > (nr_pages + PAGES_FOR_IO + | ||
527 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | 494 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); |
528 | } | 495 | } |
529 | 496 | ||
530 | /** | 497 | /** |
531 | * write_suspend_image - Write entire image and metadata. | 498 | * swsusp_write - Write entire image and metadata. |
532 | * | 499 | * |
500 | * It is important _NOT_ to umount filesystems at this point. We want | ||
501 | * them synced (in case something goes wrong) but we DO not want to mark | ||
502 | * filesystem clean: it is not. (And it does not matter, if we resume | ||
503 | * correctly, we'll mark system clean, anyway.) | ||
533 | */ | 504 | */ |
534 | static int write_suspend_image(void) | 505 | |
506 | int swsusp_write(struct pbe *pblist, unsigned int nr_pages) | ||
535 | { | 507 | { |
508 | struct swap_map_page *swap_map; | ||
509 | struct swap_map_handle handle; | ||
510 | swp_entry_t start; | ||
536 | int error; | 511 | int error; |
537 | 512 | ||
538 | if (!enough_swap(nr_copy_pages)) { | 513 | if ((error = swsusp_swap_check())) { |
514 | printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); | ||
515 | return error; | ||
516 | } | ||
517 | if (!enough_swap(nr_pages)) { | ||
539 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | 518 | printk(KERN_ERR "swsusp: Not enough free swap\n"); |
540 | return -ENOSPC; | 519 | return -ENOSPC; |
541 | } | 520 | } |
542 | 521 | ||
543 | init_header(); | 522 | init_header(nr_pages); |
544 | if ((error = data_write())) | 523 | swap_map = alloc_swap_map(swsusp_info.pages); |
545 | goto FreeData; | 524 | if (!swap_map) |
525 | return -ENOMEM; | ||
526 | init_swap_map_handle(&handle, swap_map); | ||
527 | |||
528 | error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); | ||
529 | if (!error) | ||
530 | error = save_image_metadata(pblist, &handle); | ||
531 | if (!error) | ||
532 | error = save_image_data(pblist, &handle, nr_pages); | ||
533 | if (error) | ||
534 | goto Free_image_entries; | ||
546 | 535 | ||
547 | if ((error = write_pagedir())) | 536 | swap_map = reverse_swap_map(swap_map); |
548 | goto FreePagedir; | 537 | error = save_swap_map(swap_map, &start); |
538 | if (error) | ||
539 | goto Free_map_entries; | ||
549 | 540 | ||
550 | if ((error = close_swap())) | 541 | dump_info(); |
551 | goto FreePagedir; | 542 | printk( "S" ); |
552 | Done: | 543 | error = mark_swapfiles(start); |
553 | memset(key_iv, 0, MAXKEY+MAXIV); | 544 | printk( "|\n" ); |
545 | if (error) | ||
546 | goto Free_map_entries; | ||
547 | |||
548 | Free_swap_map: | ||
549 | free_swap_map(swap_map); | ||
554 | return error; | 550 | return error; |
555 | FreePagedir: | 551 | |
556 | free_pagedir_entries(); | 552 | Free_map_entries: |
557 | FreeData: | 553 | free_swap_map_entries(swap_map); |
558 | data_free(); | 554 | Free_image_entries: |
559 | goto Done; | 555 | free_image_entries(swap_map); |
556 | goto Free_swap_map; | ||
560 | } | 557 | } |
561 | 558 | ||
562 | /* It is important _NOT_ to umount filesystems at this point. We want | 559 | /** |
563 | * them synced (in case something goes wrong) but we DO not want to mark | 560 | * swsusp_shrink_memory - Try to free as much memory as needed |
564 | * filesystem clean: it is not. (And it does not matter, if we resume | 561 | * |
565 | * correctly, we'll mark system clean, anyway.) | 562 | * ... but do not OOM-kill anyone |
563 | * | ||
564 | * Notice: all userland should be stopped before it is called, or | ||
565 | * livelock is possible. | ||
566 | */ | 566 | */ |
567 | int swsusp_write(void) | ||
568 | { | ||
569 | int error; | ||
570 | 567 | ||
571 | if ((error = swsusp_swap_check())) { | 568 | #define SHRINK_BITE 10000 |
572 | printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); | ||
573 | return error; | ||
574 | } | ||
575 | lock_swapdevices(); | ||
576 | error = write_suspend_image(); | ||
577 | /* This will unlock ignored swap devices since writing is finished */ | ||
578 | lock_swapdevices(); | ||
579 | return error; | ||
580 | } | ||
581 | 569 | ||
570 | int swsusp_shrink_memory(void) | ||
571 | { | ||
572 | long size, tmp; | ||
573 | struct zone *zone; | ||
574 | unsigned long pages = 0; | ||
575 | unsigned int i = 0; | ||
576 | char *p = "-\\|/"; | ||
577 | |||
578 | printk("Shrinking memory... "); | ||
579 | do { | ||
580 | size = 2 * count_highmem_pages(); | ||
581 | size += size / 50 + count_data_pages(); | ||
582 | size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + | ||
583 | PAGES_FOR_IO; | ||
584 | tmp = size; | ||
585 | for_each_zone (zone) | ||
586 | if (!is_highmem(zone)) | ||
587 | tmp -= zone->free_pages; | ||
588 | if (tmp > 0) { | ||
589 | tmp = shrink_all_memory(SHRINK_BITE); | ||
590 | if (!tmp) | ||
591 | return -ENOMEM; | ||
592 | pages += tmp; | ||
593 | } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) { | ||
594 | tmp = shrink_all_memory(SHRINK_BITE); | ||
595 | pages += tmp; | ||
596 | } | ||
597 | printk("\b%c", p[i++%4]); | ||
598 | } while (tmp > 0); | ||
599 | printk("\bdone (%lu pages freed)\n", pages); | ||
582 | 600 | ||
601 | return 0; | ||
602 | } | ||
583 | 603 | ||
584 | int swsusp_suspend(void) | 604 | int swsusp_suspend(void) |
585 | { | 605 | { |
@@ -677,7 +697,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src) | |||
677 | /* We assume both lists contain the same number of elements */ | 697 | /* We assume both lists contain the same number of elements */ |
678 | while (src) { | 698 | while (src) { |
679 | dst->orig_address = src->orig_address; | 699 | dst->orig_address = src->orig_address; |
680 | dst->swap_address = src->swap_address; | ||
681 | dst = dst->next; | 700 | dst = dst->next; |
682 | src = src->next; | 701 | src = src->next; |
683 | } | 702 | } |
@@ -757,198 +776,224 @@ static int bio_write_page(pgoff_t page_off, void *page) | |||
757 | return submit(WRITE, page_off, page); | 776 | return submit(WRITE, page_off, page); |
758 | } | 777 | } |
759 | 778 | ||
760 | /* | 779 | /** |
761 | * Sanity check if this image makes sense with this kernel/swap context | 780 | * The following functions allow us to read data using a swap map |
762 | * I really don't think that it's foolproof but more than nothing.. | 781 | * in a file-alike way |
763 | */ | 782 | */ |
764 | 783 | ||
765 | static const char *sanity_check(void) | 784 | static inline void release_swap_map_reader(struct swap_map_handle *handle) |
766 | { | 785 | { |
767 | dump_info(); | 786 | if (handle->cur) |
768 | if (swsusp_info.version_code != LINUX_VERSION_CODE) | 787 | free_page((unsigned long)handle->cur); |
769 | return "kernel version"; | 788 | handle->cur = NULL; |
770 | if (swsusp_info.num_physpages != num_physpages) | ||
771 | return "memory size"; | ||
772 | if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) | ||
773 | return "system type"; | ||
774 | if (strcmp(swsusp_info.uts.release,system_utsname.release)) | ||
775 | return "kernel release"; | ||
776 | if (strcmp(swsusp_info.uts.version,system_utsname.version)) | ||
777 | return "version"; | ||
778 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) | ||
779 | return "machine"; | ||
780 | #if 0 | ||
781 | /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */ | ||
782 | if (swsusp_info.cpus != num_possible_cpus()) | ||
783 | return "number of cpus"; | ||
784 | #endif | ||
785 | return NULL; | ||
786 | } | 789 | } |
787 | 790 | ||
788 | 791 | static inline int get_swap_map_reader(struct swap_map_handle *handle, | |
789 | static int check_header(void) | 792 | swp_entry_t start) |
790 | { | 793 | { |
791 | const char *reason = NULL; | ||
792 | int error; | 794 | int error; |
793 | 795 | ||
794 | if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) | 796 | if (!swp_offset(start)) |
797 | return -EINVAL; | ||
798 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | ||
799 | if (!handle->cur) | ||
800 | return -ENOMEM; | ||
801 | error = bio_read_page(swp_offset(start), handle->cur); | ||
802 | if (error) { | ||
803 | release_swap_map_reader(handle); | ||
795 | return error; | 804 | return error; |
796 | |||
797 | /* Is this same machine? */ | ||
798 | if ((reason = sanity_check())) { | ||
799 | printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason); | ||
800 | return -EPERM; | ||
801 | } | 805 | } |
802 | nr_copy_pages = swsusp_info.image_pages; | 806 | handle->k = 0; |
803 | return error; | 807 | return 0; |
804 | } | 808 | } |
805 | 809 | ||
806 | static int check_sig(void) | 810 | static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) |
807 | { | 811 | { |
812 | unsigned long offset; | ||
808 | int error; | 813 | int error; |
809 | 814 | ||
810 | memset(&swsusp_header, 0, sizeof(swsusp_header)); | 815 | if (!handle->cur) |
811 | if ((error = bio_read_page(0, &swsusp_header))) | 816 | return -EINVAL; |
812 | return error; | 817 | offset = swp_offset(handle->cur->entries[handle->k]); |
813 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | 818 | if (!offset) |
814 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | ||
815 | memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); | ||
816 | memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); | ||
817 | |||
818 | /* | ||
819 | * Reset swap signature now. | ||
820 | */ | ||
821 | error = bio_write_page(0, &swsusp_header); | ||
822 | } else { | ||
823 | return -EINVAL; | 819 | return -EINVAL; |
820 | error = bio_read_page(offset, buf); | ||
821 | if (error) | ||
822 | return error; | ||
823 | if (++handle->k >= MAP_PAGE_SIZE) { | ||
824 | handle->k = 0; | ||
825 | offset = swp_offset(handle->cur->next_swap); | ||
826 | if (!offset) | ||
827 | release_swap_map_reader(handle); | ||
828 | else | ||
829 | error = bio_read_page(offset, handle->cur); | ||
824 | } | 830 | } |
825 | if (!error) | ||
826 | pr_debug("swsusp: Signature found, resuming\n"); | ||
827 | return error; | 831 | return error; |
828 | } | 832 | } |
829 | 833 | ||
830 | /** | 834 | static int check_header(void) |
831 | * data_read - Read image pages from swap. | ||
832 | * | ||
833 | * You do not need to check for overlaps, check_pagedir() | ||
834 | * already did that. | ||
835 | */ | ||
836 | |||
837 | static int data_read(struct pbe *pblist) | ||
838 | { | 835 | { |
839 | struct pbe *p; | 836 | char *reason = NULL; |
840 | int error = 0; | ||
841 | int i = 0; | ||
842 | int mod = swsusp_info.image_pages / 100; | ||
843 | void *tfm; | ||
844 | |||
845 | if ((error = crypto_init(0, &tfm))) | ||
846 | return error; | ||
847 | |||
848 | if (!mod) | ||
849 | mod = 1; | ||
850 | |||
851 | printk("swsusp: Reading image data (%lu pages): ", | ||
852 | swsusp_info.image_pages); | ||
853 | |||
854 | for_each_pbe (p, pblist) { | ||
855 | if (!(i % mod)) | ||
856 | printk("\b\b\b\b%3d%%", i / mod); | ||
857 | 837 | ||
858 | if ((error = crypto_read(p, tfm))) { | 838 | dump_info(); |
859 | crypto_exit(tfm); | 839 | if (swsusp_info.version_code != LINUX_VERSION_CODE) |
860 | return error; | 840 | reason = "kernel version"; |
861 | } | 841 | if (swsusp_info.num_physpages != num_physpages) |
862 | 842 | reason = "memory size"; | |
863 | i++; | 843 | if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) |
844 | reason = "system type"; | ||
845 | if (strcmp(swsusp_info.uts.release,system_utsname.release)) | ||
846 | reason = "kernel release"; | ||
847 | if (strcmp(swsusp_info.uts.version,system_utsname.version)) | ||
848 | reason = "version"; | ||
849 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) | ||
850 | reason = "machine"; | ||
851 | if (reason) { | ||
852 | printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); | ||
853 | return -EPERM; | ||
864 | } | 854 | } |
865 | printk("\b\b\b\bdone\n"); | 855 | return 0; |
866 | crypto_exit(tfm); | ||
867 | return error; | ||
868 | } | 856 | } |
869 | 857 | ||
870 | /** | 858 | /** |
871 | * read_pagedir - Read page backup list pages from swap | 859 | * load_image_data - load the image data using the swap map handle |
860 | * @handle and store them using the page backup list @pblist | ||
861 | * (assume there are @nr_pages pages to load) | ||
872 | */ | 862 | */ |
873 | 863 | ||
874 | static int read_pagedir(struct pbe *pblist) | 864 | static int load_image_data(struct pbe *pblist, |
865 | struct swap_map_handle *handle, | ||
866 | unsigned int nr_pages) | ||
875 | { | 867 | { |
876 | struct pbe *pbpage, *p; | ||
877 | unsigned int i = 0; | ||
878 | int error; | 868 | int error; |
869 | unsigned int m; | ||
870 | struct pbe *p; | ||
879 | 871 | ||
880 | if (!pblist) | 872 | if (!pblist) |
881 | return -EFAULT; | 873 | return -EINVAL; |
882 | 874 | printk("Loading image data pages (%u pages) ... ", nr_pages); | |
883 | printk("swsusp: Reading pagedir (%lu pages)\n", | 875 | m = nr_pages / 100; |
884 | swsusp_info.pagedir_pages); | 876 | if (!m) |
885 | 877 | m = 1; | |
886 | for_each_pb_page (pbpage, pblist) { | 878 | nr_pages = 0; |
887 | unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); | 879 | p = pblist; |
888 | 880 | while (p) { | |
889 | error = -EFAULT; | 881 | error = swap_map_read_page(handle, (void *)p->address); |
890 | if (offset) { | ||
891 | p = (pbpage + PB_PAGE_SKIP)->next; | ||
892 | error = bio_read_page(offset, (void *)pbpage); | ||
893 | (pbpage + PB_PAGE_SKIP)->next = p; | ||
894 | } | ||
895 | if (error) | 882 | if (error) |
896 | break; | 883 | break; |
884 | p = p->next; | ||
885 | if (!(nr_pages % m)) | ||
886 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
887 | nr_pages++; | ||
897 | } | 888 | } |
898 | |||
899 | if (!error) | 889 | if (!error) |
900 | BUG_ON(i != swsusp_info.pagedir_pages); | 890 | printk("\b\b\b\bdone\n"); |
901 | |||
902 | return error; | 891 | return error; |
903 | } | 892 | } |
904 | 893 | ||
894 | /** | ||
895 | * unpack_orig_addresses - copy the elements of @buf[] (1 page) to | ||
896 | * the PBEs in the list starting at @pbe | ||
897 | */ | ||
905 | 898 | ||
906 | static int check_suspend_image(void) | 899 | static inline struct pbe *unpack_orig_addresses(unsigned long *buf, |
900 | struct pbe *pbe) | ||
907 | { | 901 | { |
908 | int error = 0; | 902 | int j; |
909 | 903 | ||
910 | if ((error = check_sig())) | 904 | for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { |
911 | return error; | 905 | pbe->orig_address = buf[j]; |
912 | 906 | pbe = pbe->next; | |
913 | if ((error = check_header())) | 907 | } |
914 | return error; | 908 | return pbe; |
915 | |||
916 | return 0; | ||
917 | } | 909 | } |
918 | 910 | ||
919 | static int read_suspend_image(void) | 911 | /** |
912 | * load_image_metadata - load the image metadata using the swap map | ||
913 | * handle @handle and put them into the PBEs in the list @pblist | ||
914 | */ | ||
915 | |||
916 | static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) | ||
920 | { | 917 | { |
921 | int error = 0; | ||
922 | struct pbe *p; | 918 | struct pbe *p; |
919 | unsigned long *buf; | ||
920 | unsigned int n = 0; | ||
921 | int error = 0; | ||
923 | 922 | ||
924 | if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) | 923 | printk("Loading image metadata ... "); |
924 | buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); | ||
925 | if (!buf) | ||
925 | return -ENOMEM; | 926 | return -ENOMEM; |
926 | 927 | p = pblist; | |
927 | if ((error = read_pagedir(p))) | 928 | while (p) { |
928 | return error; | 929 | error = swap_map_read_page(handle, buf); |
929 | create_pbe_list(p, nr_copy_pages); | 930 | if (error) |
930 | mark_unsafe_pages(p); | 931 | break; |
931 | pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); | 932 | p = unpack_orig_addresses(buf, p); |
932 | if (pagedir_nosave) { | 933 | n++; |
933 | create_pbe_list(pagedir_nosave, nr_copy_pages); | ||
934 | copy_page_backup_list(pagedir_nosave, p); | ||
935 | } | 934 | } |
936 | free_pagedir(p); | 935 | free_page((unsigned long)buf); |
937 | if (!pagedir_nosave) | 936 | if (!error) |
938 | return -ENOMEM; | 937 | printk("done (%u pages loaded)\n", n); |
938 | return error; | ||
939 | } | ||
939 | 940 | ||
940 | /* Allocate memory for the image and read the data from swap */ | 941 | int swsusp_read(struct pbe **pblist_ptr) |
942 | { | ||
943 | int error; | ||
944 | struct pbe *p, *pblist; | ||
945 | struct swap_map_handle handle; | ||
946 | unsigned int nr_pages; | ||
941 | 947 | ||
942 | error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); | 948 | if (IS_ERR(resume_bdev)) { |
949 | pr_debug("swsusp: block device not initialised\n"); | ||
950 | return PTR_ERR(resume_bdev); | ||
951 | } | ||
943 | 952 | ||
953 | error = get_swap_map_reader(&handle, swsusp_header.image); | ||
944 | if (!error) | 954 | if (!error) |
945 | error = data_read(pagedir_nosave); | 955 | error = swap_map_read_page(&handle, &swsusp_info); |
956 | if (!error) | ||
957 | error = check_header(); | ||
958 | if (error) | ||
959 | return error; | ||
960 | nr_pages = swsusp_info.image_pages; | ||
961 | p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); | ||
962 | if (!p) | ||
963 | return -ENOMEM; | ||
964 | error = load_image_metadata(p, &handle); | ||
965 | if (!error) { | ||
966 | mark_unsafe_pages(p); | ||
967 | pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); | ||
968 | if (pblist) | ||
969 | copy_page_backup_list(pblist, p); | ||
970 | free_pagedir(p); | ||
971 | if (!pblist) | ||
972 | error = -ENOMEM; | ||
973 | |||
974 | /* Allocate memory for the image and read the data from swap */ | ||
975 | if (!error) | ||
976 | error = alloc_data_pages(pblist, GFP_ATOMIC, 1); | ||
977 | if (!error) { | ||
978 | release_eaten_pages(); | ||
979 | error = load_image_data(pblist, &handle, nr_pages); | ||
980 | } | ||
981 | if (!error) | ||
982 | *pblist_ptr = pblist; | ||
983 | } | ||
984 | release_swap_map_reader(&handle); | ||
946 | 985 | ||
986 | blkdev_put(resume_bdev); | ||
987 | |||
988 | if (!error) | ||
989 | pr_debug("swsusp: Reading resume file was successful\n"); | ||
990 | else | ||
991 | pr_debug("swsusp: Error %d resuming\n", error); | ||
947 | return error; | 992 | return error; |
948 | } | 993 | } |
949 | 994 | ||
950 | /** | 995 | /** |
951 | * swsusp_check - Check for saved image in swap | 996 | * swsusp_check - Check for swsusp signature in the resume device |
952 | */ | 997 | */ |
953 | 998 | ||
954 | int swsusp_check(void) | 999 | int swsusp_check(void) |
@@ -958,40 +1003,27 @@ int swsusp_check(void) | |||
958 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | 1003 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); |
959 | if (!IS_ERR(resume_bdev)) { | 1004 | if (!IS_ERR(resume_bdev)) { |
960 | set_blocksize(resume_bdev, PAGE_SIZE); | 1005 | set_blocksize(resume_bdev, PAGE_SIZE); |
961 | error = check_suspend_image(); | 1006 | memset(&swsusp_header, 0, sizeof(swsusp_header)); |
1007 | if ((error = bio_read_page(0, &swsusp_header))) | ||
1008 | return error; | ||
1009 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | ||
1010 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | ||
1011 | /* Reset swap signature now */ | ||
1012 | error = bio_write_page(0, &swsusp_header); | ||
1013 | } else { | ||
1014 | return -EINVAL; | ||
1015 | } | ||
962 | if (error) | 1016 | if (error) |
963 | blkdev_put(resume_bdev); | 1017 | blkdev_put(resume_bdev); |
964 | } else | 1018 | else |
1019 | pr_debug("swsusp: Signature found, resuming\n"); | ||
1020 | } else { | ||
965 | error = PTR_ERR(resume_bdev); | 1021 | error = PTR_ERR(resume_bdev); |
966 | |||
967 | if (!error) | ||
968 | pr_debug("swsusp: resume file found\n"); | ||
969 | else | ||
970 | pr_debug("swsusp: Error %d check for resume file\n", error); | ||
971 | return error; | ||
972 | } | ||
973 | |||
974 | /** | ||
975 | * swsusp_read - Read saved image from swap. | ||
976 | */ | ||
977 | |||
978 | int swsusp_read(void) | ||
979 | { | ||
980 | int error; | ||
981 | |||
982 | if (IS_ERR(resume_bdev)) { | ||
983 | pr_debug("swsusp: block device not initialised\n"); | ||
984 | return PTR_ERR(resume_bdev); | ||
985 | } | 1022 | } |
986 | 1023 | ||
987 | error = read_suspend_image(); | 1024 | if (error) |
988 | blkdev_put(resume_bdev); | 1025 | pr_debug("swsusp: Error %d check for resume file\n", error); |
989 | memset(key_iv, 0, MAXKEY+MAXIV); | ||
990 | 1026 | ||
991 | if (!error) | ||
992 | pr_debug("swsusp: Reading resume file was successful\n"); | ||
993 | else | ||
994 | pr_debug("swsusp: Error %d resuming\n", error); | ||
995 | return error; | 1027 | return error; |
996 | } | 1028 | } |
997 | 1029 | ||
diff --git a/kernel/printk.c b/kernel/printk.c index 5287be83e3e7..13ced0f7828f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -11,7 +11,7 @@ | |||
11 | * Ted Ts'o, 2/11/93. | 11 | * Ted Ts'o, 2/11/93. |
12 | * Modified for sysctl support, 1/8/97, Chris Horn. | 12 | * Modified for sysctl support, 1/8/97, Chris Horn. |
13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul | 13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul |
14 | * manfreds@colorfullife.com | 14 | * manfred@colorfullife.com |
15 | * Rewrote bits to get rid of console_lock | 15 | * Rewrote bits to get rid of console_lock |
16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> | 16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> |
17 | */ | 17 | */ |
@@ -569,7 +569,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
569 | p[1] <= '7' && p[2] == '>') { | 569 | p[1] <= '7' && p[2] == '>') { |
570 | loglev_char = p[1]; | 570 | loglev_char = p[1]; |
571 | p += 3; | 571 | p += 3; |
572 | printed_len += 3; | 572 | printed_len -= 3; |
573 | } else { | 573 | } else { |
574 | loglev_char = default_message_loglevel | 574 | loglev_char = default_message_loglevel |
575 | + '0'; | 575 | + '0'; |
@@ -584,7 +584,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
584 | 584 | ||
585 | for (tp = tbuf; tp < tbuf + tlen; tp++) | 585 | for (tp = tbuf; tp < tbuf + tlen; tp++) |
586 | emit_log_char(*tp); | 586 | emit_log_char(*tp); |
587 | printed_len += tlen - 3; | 587 | printed_len += tlen; |
588 | } else { | 588 | } else { |
589 | if (p[0] != '<' || p[1] < '0' || | 589 | if (p[0] != '<' || p[1] < '0' || |
590 | p[1] > '7' || p[2] != '>') { | 590 | p[1] > '7' || p[2] != '>') { |
@@ -592,8 +592,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
592 | emit_log_char(default_message_loglevel | 592 | emit_log_char(default_message_loglevel |
593 | + '0'); | 593 | + '0'); |
594 | emit_log_char('>'); | 594 | emit_log_char('>'); |
595 | printed_len += 3; | ||
595 | } | 596 | } |
596 | printed_len += 3; | ||
597 | } | 597 | } |
598 | log_level_unknown = 0; | 598 | log_level_unknown = 0; |
599 | if (!*p) | 599 | if (!*p) |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 656476eedb1b..5f33cdb6fff5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -7,6 +7,7 @@ | |||
7 | * to continually duplicate across every architecture. | 7 | * to continually duplicate across every architecture. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/capability.h> | ||
10 | #include <linux/module.h> | 11 | #include <linux/module.h> |
11 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
12 | #include <linux/errno.h> | 13 | #include <linux/errno.h> |
@@ -408,54 +409,62 @@ int ptrace_request(struct task_struct *child, long request, | |||
408 | return ret; | 409 | return ret; |
409 | } | 410 | } |
410 | 411 | ||
411 | #ifndef __ARCH_SYS_PTRACE | 412 | /** |
412 | static int ptrace_get_task_struct(long request, long pid, | 413 | * ptrace_traceme -- helper for PTRACE_TRACEME |
413 | struct task_struct **childp) | 414 | * |
415 | * Performs checks and sets PT_PTRACED. | ||
416 | * Should be used by all ptrace implementations for PTRACE_TRACEME. | ||
417 | */ | ||
418 | int ptrace_traceme(void) | ||
414 | { | 419 | { |
415 | struct task_struct *child; | ||
416 | int ret; | 420 | int ret; |
417 | 421 | ||
418 | /* | 422 | /* |
419 | * Callers use child == NULL as an indication to exit early even | 423 | * Are we already being traced? |
420 | * when the return value is 0, so make sure it is non-NULL here. | 424 | */ |
425 | if (current->ptrace & PT_PTRACED) | ||
426 | return -EPERM; | ||
427 | ret = security_ptrace(current->parent, current); | ||
428 | if (ret) | ||
429 | return -EPERM; | ||
430 | /* | ||
431 | * Set the ptrace bit in the process ptrace flags. | ||
421 | */ | 432 | */ |
422 | *childp = NULL; | 433 | current->ptrace |= PT_PTRACED; |
434 | return 0; | ||
435 | } | ||
423 | 436 | ||
424 | if (request == PTRACE_TRACEME) { | 437 | /** |
425 | /* | 438 | * ptrace_get_task_struct -- grab a task struct reference for ptrace |
426 | * Are we already being traced? | 439 | * @pid: process id to grab a task_struct reference of |
427 | */ | 440 | * |
428 | if (current->ptrace & PT_PTRACED) | 441 | * This function is a helper for ptrace implementations. It checks |
429 | return -EPERM; | 442 | * permissions and then grabs a task struct for use of the actual |
430 | ret = security_ptrace(current->parent, current); | 443 | * ptrace implementation. |
431 | if (ret) | 444 | * |
432 | return -EPERM; | 445 | * Returns the task_struct for @pid or an ERR_PTR() on failure. |
433 | /* | 446 | */ |
434 | * Set the ptrace bit in the process ptrace flags. | 447 | struct task_struct *ptrace_get_task_struct(pid_t pid) |
435 | */ | 448 | { |
436 | current->ptrace |= PT_PTRACED; | 449 | struct task_struct *child; |
437 | return 0; | ||
438 | } | ||
439 | 450 | ||
440 | /* | 451 | /* |
441 | * You may not mess with init | 452 | * Tracing init is not allowed. |
442 | */ | 453 | */ |
443 | if (pid == 1) | 454 | if (pid == 1) |
444 | return -EPERM; | 455 | return ERR_PTR(-EPERM); |
445 | 456 | ||
446 | ret = -ESRCH; | ||
447 | read_lock(&tasklist_lock); | 457 | read_lock(&tasklist_lock); |
448 | child = find_task_by_pid(pid); | 458 | child = find_task_by_pid(pid); |
449 | if (child) | 459 | if (child) |
450 | get_task_struct(child); | 460 | get_task_struct(child); |
451 | read_unlock(&tasklist_lock); | 461 | read_unlock(&tasklist_lock); |
452 | if (!child) | 462 | if (!child) |
453 | return -ESRCH; | 463 | return ERR_PTR(-ESRCH); |
454 | 464 | return child; | |
455 | *childp = child; | ||
456 | return 0; | ||
457 | } | 465 | } |
458 | 466 | ||
467 | #ifndef __ARCH_SYS_PTRACE | ||
459 | asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | 468 | asmlinkage long sys_ptrace(long request, long pid, long addr, long data) |
460 | { | 469 | { |
461 | struct task_struct *child; | 470 | struct task_struct *child; |
@@ -465,9 +474,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | |||
465 | * This lock_kernel fixes a subtle race with suid exec | 474 | * This lock_kernel fixes a subtle race with suid exec |
466 | */ | 475 | */ |
467 | lock_kernel(); | 476 | lock_kernel(); |
468 | ret = ptrace_get_task_struct(request, pid, &child); | 477 | if (request == PTRACE_TRACEME) { |
469 | if (!child) | 478 | ret = ptrace_traceme(); |
470 | goto out; | 479 | goto out; |
480 | } | ||
481 | |||
482 | child = ptrace_get_task_struct(pid); | ||
483 | if (IS_ERR(child)) { | ||
484 | ret = PTR_ERR(child); | ||
485 | goto out; | ||
486 | } | ||
471 | 487 | ||
472 | if (request == PTRACE_ATTACH) { | 488 | if (request == PTRACE_ATTACH) { |
473 | ret = ptrace_attach(child); | 489 | ret = ptrace_attach(child); |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48d3bce465b8..0cf8146bd585 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/spinlock.h> | 36 | #include <linux/spinlock.h> |
37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
38 | #include <linux/rcupdate.h> | ||
38 | #include <linux/interrupt.h> | 39 | #include <linux/interrupt.h> |
39 | #include <linux/sched.h> | 40 | #include <linux/sched.h> |
40 | #include <asm/atomic.h> | 41 | #include <asm/atomic.h> |
@@ -45,26 +46,21 @@ | |||
45 | #include <linux/percpu.h> | 46 | #include <linux/percpu.h> |
46 | #include <linux/notifier.h> | 47 | #include <linux/notifier.h> |
47 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
48 | #include <linux/rcuref.h> | ||
49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
50 | 50 | ||
51 | /* Definition for rcupdate control block. */ | 51 | /* Definition for rcupdate control block. */ |
52 | struct rcu_ctrlblk rcu_ctrlblk = | 52 | struct rcu_ctrlblk rcu_ctrlblk = { |
53 | { .cur = -300, .completed = -300 }; | 53 | .cur = -300, |
54 | struct rcu_ctrlblk rcu_bh_ctrlblk = | 54 | .completed = -300, |
55 | { .cur = -300, .completed = -300 }; | 55 | .lock = SPIN_LOCK_UNLOCKED, |
56 | 56 | .cpumask = CPU_MASK_NONE, | |
57 | /* Bookkeeping of the progress of the grace period */ | 57 | }; |
58 | struct rcu_state { | 58 | struct rcu_ctrlblk rcu_bh_ctrlblk = { |
59 | spinlock_t lock; /* Guard this struct and writes to rcu_ctrlblk */ | 59 | .cur = -300, |
60 | cpumask_t cpumask; /* CPUs that need to switch in order */ | 60 | .completed = -300, |
61 | /* for current batch to proceed. */ | 61 | .lock = SPIN_LOCK_UNLOCKED, |
62 | .cpumask = CPU_MASK_NONE, | ||
62 | }; | 63 | }; |
63 | |||
64 | static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = | ||
65 | {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; | ||
66 | static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = | ||
67 | {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; | ||
68 | 64 | ||
69 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | 65 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; |
70 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | 66 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; |
@@ -73,19 +69,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | |||
73 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | 69 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; |
74 | static int maxbatch = 10000; | 70 | static int maxbatch = 10000; |
75 | 71 | ||
76 | #ifndef __HAVE_ARCH_CMPXCHG | ||
77 | /* | ||
78 | * We use an array of spinlocks for the rcurefs -- similar to ones in sparc | ||
79 | * 32 bit atomic_t implementations, and a hash function similar to that | ||
80 | * for our refcounting needs. | ||
81 | * Can't help multiprocessors which donot have cmpxchg :( | ||
82 | */ | ||
83 | |||
84 | spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { | ||
85 | [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED | ||
86 | }; | ||
87 | #endif | ||
88 | |||
89 | /** | 72 | /** |
90 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 73 | * call_rcu - Queue an RCU callback for invocation after a grace period. |
91 | * @head: structure to be used for queueing the RCU updates. | 74 | * @head: structure to be used for queueing the RCU updates. |
@@ -233,13 +216,13 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
233 | * This is done by rcu_start_batch. The start is not broadcasted to | 216 | * This is done by rcu_start_batch. The start is not broadcasted to |
234 | * all cpus, they must pick this up by comparing rcp->cur with | 217 | * all cpus, they must pick this up by comparing rcp->cur with |
235 | * rdp->quiescbatch. All cpus are recorded in the | 218 | * rdp->quiescbatch. All cpus are recorded in the |
236 | * rcu_state.cpumask bitmap. | 219 | * rcu_ctrlblk.cpumask bitmap. |
237 | * - All cpus must go through a quiescent state. | 220 | * - All cpus must go through a quiescent state. |
238 | * Since the start of the grace period is not broadcasted, at least two | 221 | * Since the start of the grace period is not broadcasted, at least two |
239 | * calls to rcu_check_quiescent_state are required: | 222 | * calls to rcu_check_quiescent_state are required: |
240 | * The first call just notices that a new grace period is running. The | 223 | * The first call just notices that a new grace period is running. The |
241 | * following calls check if there was a quiescent state since the beginning | 224 | * following calls check if there was a quiescent state since the beginning |
242 | * of the grace period. If so, it updates rcu_state.cpumask. If | 225 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If |
243 | * the bitmap is empty, then the grace period is completed. | 226 | * the bitmap is empty, then the grace period is completed. |
244 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace | 227 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace |
245 | * period (if necessary). | 228 | * period (if necessary). |
@@ -247,14 +230,10 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
247 | /* | 230 | /* |
248 | * Register a new batch of callbacks, and start it up if there is currently no | 231 | * Register a new batch of callbacks, and start it up if there is currently no |
249 | * active batch and the batch to be registered has not already occurred. | 232 | * active batch and the batch to be registered has not already occurred. |
250 | * Caller must hold rcu_state.lock. | 233 | * Caller must hold rcu_ctrlblk.lock. |
251 | */ | 234 | */ |
252 | static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, | 235 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) |
253 | int next_pending) | ||
254 | { | 236 | { |
255 | if (next_pending) | ||
256 | rcp->next_pending = 1; | ||
257 | |||
258 | if (rcp->next_pending && | 237 | if (rcp->next_pending && |
259 | rcp->completed == rcp->cur) { | 238 | rcp->completed == rcp->cur) { |
260 | rcp->next_pending = 0; | 239 | rcp->next_pending = 0; |
@@ -268,11 +247,11 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, | |||
268 | /* | 247 | /* |
269 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | 248 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a |
270 | * Barrier Otherwise it can cause tickless idle CPUs to be | 249 | * Barrier Otherwise it can cause tickless idle CPUs to be |
271 | * included in rsp->cpumask, which will extend graceperiods | 250 | * included in rcp->cpumask, which will extend graceperiods |
272 | * unnecessarily. | 251 | * unnecessarily. |
273 | */ | 252 | */ |
274 | smp_mb(); | 253 | smp_mb(); |
275 | cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); | 254 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); |
276 | 255 | ||
277 | } | 256 | } |
278 | } | 257 | } |
@@ -282,13 +261,13 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, | |||
282 | * Clear it from the cpu mask and complete the grace period if it was the last | 261 | * Clear it from the cpu mask and complete the grace period if it was the last |
283 | * cpu. Start another grace period if someone has further entries pending | 262 | * cpu. Start another grace period if someone has further entries pending |
284 | */ | 263 | */ |
285 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) | 264 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) |
286 | { | 265 | { |
287 | cpu_clear(cpu, rsp->cpumask); | 266 | cpu_clear(cpu, rcp->cpumask); |
288 | if (cpus_empty(rsp->cpumask)) { | 267 | if (cpus_empty(rcp->cpumask)) { |
289 | /* batch completed ! */ | 268 | /* batch completed ! */ |
290 | rcp->completed = rcp->cur; | 269 | rcp->completed = rcp->cur; |
291 | rcu_start_batch(rcp, rsp, 0); | 270 | rcu_start_batch(rcp); |
292 | } | 271 | } |
293 | } | 272 | } |
294 | 273 | ||
@@ -298,7 +277,7 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) | |||
298 | * quiescent cycle, then indicate that it has done so. | 277 | * quiescent cycle, then indicate that it has done so. |
299 | */ | 278 | */ |
300 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | 279 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, |
301 | struct rcu_state *rsp, struct rcu_data *rdp) | 280 | struct rcu_data *rdp) |
302 | { | 281 | { |
303 | if (rdp->quiescbatch != rcp->cur) { | 282 | if (rdp->quiescbatch != rcp->cur) { |
304 | /* start new grace period: */ | 283 | /* start new grace period: */ |
@@ -323,15 +302,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | |||
323 | return; | 302 | return; |
324 | rdp->qs_pending = 0; | 303 | rdp->qs_pending = 0; |
325 | 304 | ||
326 | spin_lock(&rsp->lock); | 305 | spin_lock(&rcp->lock); |
327 | /* | 306 | /* |
328 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | 307 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync |
329 | * during cpu startup. Ignore the quiescent state. | 308 | * during cpu startup. Ignore the quiescent state. |
330 | */ | 309 | */ |
331 | if (likely(rdp->quiescbatch == rcp->cur)) | 310 | if (likely(rdp->quiescbatch == rcp->cur)) |
332 | cpu_quiet(rdp->cpu, rcp, rsp); | 311 | cpu_quiet(rdp->cpu, rcp); |
333 | 312 | ||
334 | spin_unlock(&rsp->lock); | 313 | spin_unlock(&rcp->lock); |
335 | } | 314 | } |
336 | 315 | ||
337 | 316 | ||
@@ -352,28 +331,29 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | |||
352 | } | 331 | } |
353 | 332 | ||
354 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | 333 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, |
355 | struct rcu_ctrlblk *rcp, struct rcu_state *rsp, struct rcu_data *rdp) | 334 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) |
356 | { | 335 | { |
357 | /* if the cpu going offline owns the grace period | 336 | /* if the cpu going offline owns the grace period |
358 | * we can block indefinitely waiting for it, so flush | 337 | * we can block indefinitely waiting for it, so flush |
359 | * it here | 338 | * it here |
360 | */ | 339 | */ |
361 | spin_lock_bh(&rsp->lock); | 340 | spin_lock_bh(&rcp->lock); |
362 | if (rcp->cur != rcp->completed) | 341 | if (rcp->cur != rcp->completed) |
363 | cpu_quiet(rdp->cpu, rcp, rsp); | 342 | cpu_quiet(rdp->cpu, rcp); |
364 | spin_unlock_bh(&rsp->lock); | 343 | spin_unlock_bh(&rcp->lock); |
365 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | 344 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); |
366 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | 345 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); |
367 | 346 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | |
368 | } | 347 | } |
348 | |||
369 | static void rcu_offline_cpu(int cpu) | 349 | static void rcu_offline_cpu(int cpu) |
370 | { | 350 | { |
371 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | 351 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); |
372 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | 352 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); |
373 | 353 | ||
374 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, &rcu_state, | 354 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, |
375 | &per_cpu(rcu_data, cpu)); | 355 | &per_cpu(rcu_data, cpu)); |
376 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, &rcu_bh_state, | 356 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, |
377 | &per_cpu(rcu_bh_data, cpu)); | 357 | &per_cpu(rcu_bh_data, cpu)); |
378 | put_cpu_var(rcu_data); | 358 | put_cpu_var(rcu_data); |
379 | put_cpu_var(rcu_bh_data); | 359 | put_cpu_var(rcu_bh_data); |
@@ -392,7 +372,7 @@ static void rcu_offline_cpu(int cpu) | |||
392 | * This does the RCU processing work from tasklet context. | 372 | * This does the RCU processing work from tasklet context. |
393 | */ | 373 | */ |
394 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | 374 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, |
395 | struct rcu_state *rsp, struct rcu_data *rdp) | 375 | struct rcu_data *rdp) |
396 | { | 376 | { |
397 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | 377 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { |
398 | *rdp->donetail = rdp->curlist; | 378 | *rdp->donetail = rdp->curlist; |
@@ -422,24 +402,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | |||
422 | 402 | ||
423 | if (!rcp->next_pending) { | 403 | if (!rcp->next_pending) { |
424 | /* and start it/schedule start if it's a new batch */ | 404 | /* and start it/schedule start if it's a new batch */ |
425 | spin_lock(&rsp->lock); | 405 | spin_lock(&rcp->lock); |
426 | rcu_start_batch(rcp, rsp, 1); | 406 | rcp->next_pending = 1; |
427 | spin_unlock(&rsp->lock); | 407 | rcu_start_batch(rcp); |
408 | spin_unlock(&rcp->lock); | ||
428 | } | 409 | } |
429 | } else { | 410 | } else { |
430 | local_irq_enable(); | 411 | local_irq_enable(); |
431 | } | 412 | } |
432 | rcu_check_quiescent_state(rcp, rsp, rdp); | 413 | rcu_check_quiescent_state(rcp, rdp); |
433 | if (rdp->donelist) | 414 | if (rdp->donelist) |
434 | rcu_do_batch(rdp); | 415 | rcu_do_batch(rdp); |
435 | } | 416 | } |
436 | 417 | ||
437 | static void rcu_process_callbacks(unsigned long unused) | 418 | static void rcu_process_callbacks(unsigned long unused) |
438 | { | 419 | { |
439 | __rcu_process_callbacks(&rcu_ctrlblk, &rcu_state, | 420 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); |
440 | &__get_cpu_var(rcu_data)); | 421 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); |
441 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &rcu_bh_state, | 422 | } |
442 | &__get_cpu_var(rcu_bh_data)); | 423 | |
424 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
425 | { | ||
426 | /* This cpu has pending rcu entries and the grace period | ||
427 | * for them has completed. | ||
428 | */ | ||
429 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
430 | return 1; | ||
431 | |||
432 | /* This cpu has no pending entries, but there are new entries */ | ||
433 | if (!rdp->curlist && rdp->nxtlist) | ||
434 | return 1; | ||
435 | |||
436 | /* This cpu has finished callbacks to invoke */ | ||
437 | if (rdp->donelist) | ||
438 | return 1; | ||
439 | |||
440 | /* The rcu core waits for a quiescent state from the cpu */ | ||
441 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
442 | return 1; | ||
443 | |||
444 | /* nothing to do */ | ||
445 | return 0; | ||
446 | } | ||
447 | |||
448 | int rcu_pending(int cpu) | ||
449 | { | ||
450 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
451 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
443 | } | 452 | } |
444 | 453 | ||
445 | void rcu_check_callbacks(int cpu, int user) | 454 | void rcu_check_callbacks(int cpu, int user) |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 49fbbeff201c..773219907dd8 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -39,7 +39,6 @@ | |||
39 | #include <linux/moduleparam.h> | 39 | #include <linux/moduleparam.h> |
40 | #include <linux/percpu.h> | 40 | #include <linux/percpu.h> |
41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
42 | #include <linux/rcuref.h> | ||
43 | #include <linux/cpu.h> | 42 | #include <linux/cpu.h> |
44 | #include <linux/random.h> | 43 | #include <linux/random.h> |
45 | #include <linux/delay.h> | 44 | #include <linux/delay.h> |
@@ -49,9 +48,11 @@ | |||
49 | MODULE_LICENSE("GPL"); | 48 | MODULE_LICENSE("GPL"); |
50 | 49 | ||
51 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ | 50 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ |
52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ | 51 | static int stat_interval; /* Interval between stats, in seconds. */ |
53 | /* Defaults to "only at end of test". */ | 52 | /* Defaults to "only at end of test". */ |
54 | static int verbose = 0; /* Print more debug info. */ | 53 | static int verbose; /* Print more debug info. */ |
54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | ||
55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ | ||
55 | 56 | ||
56 | MODULE_PARM(nreaders, "i"); | 57 | MODULE_PARM(nreaders, "i"); |
57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 58 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
@@ -59,6 +60,10 @@ MODULE_PARM(stat_interval, "i"); | |||
59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | 60 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); |
60 | MODULE_PARM(verbose, "i"); | 61 | MODULE_PARM(verbose, "i"); |
61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | 62 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); |
63 | MODULE_PARM(test_no_idle_hz, "i"); | ||
64 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
65 | MODULE_PARM(shuffle_interval, "i"); | ||
66 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
62 | #define TORTURE_FLAG "rcutorture: " | 67 | #define TORTURE_FLAG "rcutorture: " |
63 | #define PRINTK_STRING(s) \ | 68 | #define PRINTK_STRING(s) \ |
64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 69 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) |
@@ -73,6 +78,7 @@ static int nrealreaders; | |||
73 | static struct task_struct *writer_task; | 78 | static struct task_struct *writer_task; |
74 | static struct task_struct **reader_tasks; | 79 | static struct task_struct **reader_tasks; |
75 | static struct task_struct *stats_task; | 80 | static struct task_struct *stats_task; |
81 | static struct task_struct *shuffler_task; | ||
76 | 82 | ||
77 | #define RCU_TORTURE_PIPE_LEN 10 | 83 | #define RCU_TORTURE_PIPE_LEN 10 |
78 | 84 | ||
@@ -103,7 +109,7 @@ atomic_t n_rcu_torture_error; | |||
103 | /* | 109 | /* |
104 | * Allocate an element from the rcu_tortures pool. | 110 | * Allocate an element from the rcu_tortures pool. |
105 | */ | 111 | */ |
106 | struct rcu_torture * | 112 | static struct rcu_torture * |
107 | rcu_torture_alloc(void) | 113 | rcu_torture_alloc(void) |
108 | { | 114 | { |
109 | struct list_head *p; | 115 | struct list_head *p; |
@@ -376,12 +382,77 @@ rcu_torture_stats(void *arg) | |||
376 | return 0; | 382 | return 0; |
377 | } | 383 | } |
378 | 384 | ||
385 | static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ | ||
386 | |||
387 | /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case | ||
388 | * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. | ||
389 | */ | ||
390 | void rcu_torture_shuffle_tasks(void) | ||
391 | { | ||
392 | cpumask_t tmp_mask = CPU_MASK_ALL; | ||
393 | int i; | ||
394 | |||
395 | lock_cpu_hotplug(); | ||
396 | |||
397 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | ||
398 | if (num_online_cpus() == 1) { | ||
399 | unlock_cpu_hotplug(); | ||
400 | return; | ||
401 | } | ||
402 | |||
403 | if (rcu_idle_cpu != -1) | ||
404 | cpu_clear(rcu_idle_cpu, tmp_mask); | ||
405 | |||
406 | set_cpus_allowed(current, tmp_mask); | ||
407 | |||
408 | if (reader_tasks != NULL) { | ||
409 | for (i = 0; i < nrealreaders; i++) | ||
410 | if (reader_tasks[i]) | ||
411 | set_cpus_allowed(reader_tasks[i], tmp_mask); | ||
412 | } | ||
413 | |||
414 | if (writer_task) | ||
415 | set_cpus_allowed(writer_task, tmp_mask); | ||
416 | |||
417 | if (stats_task) | ||
418 | set_cpus_allowed(stats_task, tmp_mask); | ||
419 | |||
420 | if (rcu_idle_cpu == -1) | ||
421 | rcu_idle_cpu = num_online_cpus() - 1; | ||
422 | else | ||
423 | rcu_idle_cpu--; | ||
424 | |||
425 | unlock_cpu_hotplug(); | ||
426 | } | ||
427 | |||
428 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the | ||
429 | * system to become idle at a time and cut off its timer ticks. This is meant | ||
430 | * to test the support for such tickless idle CPU in RCU. | ||
431 | */ | ||
432 | static int | ||
433 | rcu_torture_shuffle(void *arg) | ||
434 | { | ||
435 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); | ||
436 | do { | ||
437 | schedule_timeout_interruptible(shuffle_interval * HZ); | ||
438 | rcu_torture_shuffle_tasks(); | ||
439 | } while (!kthread_should_stop()); | ||
440 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); | ||
441 | return 0; | ||
442 | } | ||
443 | |||
379 | static void | 444 | static void |
380 | rcu_torture_cleanup(void) | 445 | rcu_torture_cleanup(void) |
381 | { | 446 | { |
382 | int i; | 447 | int i; |
383 | 448 | ||
384 | fullstop = 1; | 449 | fullstop = 1; |
450 | if (shuffler_task != NULL) { | ||
451 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); | ||
452 | kthread_stop(shuffler_task); | ||
453 | } | ||
454 | shuffler_task = NULL; | ||
455 | |||
385 | if (writer_task != NULL) { | 456 | if (writer_task != NULL) { |
386 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | 457 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); |
387 | kthread_stop(writer_task); | 458 | kthread_stop(writer_task); |
@@ -430,9 +501,11 @@ rcu_torture_init(void) | |||
430 | nrealreaders = nreaders; | 501 | nrealreaders = nreaders; |
431 | else | 502 | else |
432 | nrealreaders = 2 * num_online_cpus(); | 503 | nrealreaders = 2 * num_online_cpus(); |
433 | printk(KERN_ALERT TORTURE_FLAG | 504 | printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " |
434 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", | 505 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
435 | nrealreaders, stat_interval, verbose); | 506 | "shuffle_interval = %d\n", |
507 | nrealreaders, stat_interval, verbose, test_no_idle_hz, | ||
508 | shuffle_interval); | ||
436 | fullstop = 0; | 509 | fullstop = 0; |
437 | 510 | ||
438 | /* Set up the freelist. */ | 511 | /* Set up the freelist. */ |
@@ -502,6 +575,18 @@ rcu_torture_init(void) | |||
502 | goto unwind; | 575 | goto unwind; |
503 | } | 576 | } |
504 | } | 577 | } |
578 | if (test_no_idle_hz) { | ||
579 | rcu_idle_cpu = num_online_cpus() - 1; | ||
580 | /* Create the shuffler thread */ | ||
581 | shuffler_task = kthread_run(rcu_torture_shuffle, NULL, | ||
582 | "rcu_torture_shuffle"); | ||
583 | if (IS_ERR(shuffler_task)) { | ||
584 | firsterr = PTR_ERR(shuffler_task); | ||
585 | VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); | ||
586 | shuffler_task = NULL; | ||
587 | goto unwind; | ||
588 | } | ||
589 | } | ||
505 | return 0; | 590 | return 0; |
506 | 591 | ||
507 | unwind: | 592 | unwind: |
diff --git a/kernel/resource.c b/kernel/resource.c index 92285d822de6..e3080fcc66a3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -464,7 +464,7 @@ struct resource * __request_region(struct resource *parent, unsigned long start, | |||
464 | 464 | ||
465 | EXPORT_SYMBOL(__request_region); | 465 | EXPORT_SYMBOL(__request_region); |
466 | 466 | ||
467 | int __deprecated __check_region(struct resource *parent, unsigned long start, unsigned long n) | 467 | int __check_region(struct resource *parent, unsigned long start, unsigned long n) |
468 | { | 468 | { |
469 | struct resource * res; | 469 | struct resource * res; |
470 | 470 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 6f46c94cc29e..3ee2ae45125f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -27,12 +27,14 @@ | |||
27 | #include <linux/smp_lock.h> | 27 | #include <linux/smp_lock.h> |
28 | #include <asm/mmu_context.h> | 28 | #include <asm/mmu_context.h> |
29 | #include <linux/interrupt.h> | 29 | #include <linux/interrupt.h> |
30 | #include <linux/capability.h> | ||
30 | #include <linux/completion.h> | 31 | #include <linux/completion.h> |
31 | #include <linux/kernel_stat.h> | 32 | #include <linux/kernel_stat.h> |
32 | #include <linux/security.h> | 33 | #include <linux/security.h> |
33 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
34 | #include <linux/profile.h> | 35 | #include <linux/profile.h> |
35 | #include <linux/suspend.h> | 36 | #include <linux/suspend.h> |
37 | #include <linux/vmalloc.h> | ||
36 | #include <linux/blkdev.h> | 38 | #include <linux/blkdev.h> |
37 | #include <linux/delay.h> | 39 | #include <linux/delay.h> |
38 | #include <linux/smp.h> | 40 | #include <linux/smp.h> |
@@ -176,6 +178,13 @@ static unsigned int task_timeslice(task_t *p) | |||
176 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | 178 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ |
177 | < (long long) (sd)->cache_hot_time) | 179 | < (long long) (sd)->cache_hot_time) |
178 | 180 | ||
181 | void __put_task_struct_cb(struct rcu_head *rhp) | ||
182 | { | ||
183 | __put_task_struct(container_of(rhp, struct task_struct, rcu)); | ||
184 | } | ||
185 | |||
186 | EXPORT_SYMBOL_GPL(__put_task_struct_cb); | ||
187 | |||
179 | /* | 188 | /* |
180 | * These are the runqueue data structures: | 189 | * These are the runqueue data structures: |
181 | */ | 190 | */ |
@@ -512,7 +521,7 @@ static inline void sched_info_dequeued(task_t *t) | |||
512 | * long it was waiting to run. We also note when it began so that we | 521 | * long it was waiting to run. We also note when it began so that we |
513 | * can keep stats on how long its timeslice is. | 522 | * can keep stats on how long its timeslice is. |
514 | */ | 523 | */ |
515 | static inline void sched_info_arrive(task_t *t) | 524 | static void sched_info_arrive(task_t *t) |
516 | { | 525 | { |
517 | unsigned long now = jiffies, diff = 0; | 526 | unsigned long now = jiffies, diff = 0; |
518 | struct runqueue *rq = task_rq(t); | 527 | struct runqueue *rq = task_rq(t); |
@@ -739,10 +748,14 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
739 | unsigned long long __sleep_time = now - p->timestamp; | 748 | unsigned long long __sleep_time = now - p->timestamp; |
740 | unsigned long sleep_time; | 749 | unsigned long sleep_time; |
741 | 750 | ||
742 | if (__sleep_time > NS_MAX_SLEEP_AVG) | 751 | if (unlikely(p->policy == SCHED_BATCH)) |
743 | sleep_time = NS_MAX_SLEEP_AVG; | 752 | sleep_time = 0; |
744 | else | 753 | else { |
745 | sleep_time = (unsigned long)__sleep_time; | 754 | if (__sleep_time > NS_MAX_SLEEP_AVG) |
755 | sleep_time = NS_MAX_SLEEP_AVG; | ||
756 | else | ||
757 | sleep_time = (unsigned long)__sleep_time; | ||
758 | } | ||
746 | 759 | ||
747 | if (likely(sleep_time > 0)) { | 760 | if (likely(sleep_time > 0)) { |
748 | /* | 761 | /* |
@@ -994,7 +1007,7 @@ void kick_process(task_t *p) | |||
994 | * We want to under-estimate the load of migration sources, to | 1007 | * We want to under-estimate the load of migration sources, to |
995 | * balance conservatively. | 1008 | * balance conservatively. |
996 | */ | 1009 | */ |
997 | static inline unsigned long __source_load(int cpu, int type, enum idle_type idle) | 1010 | static unsigned long __source_load(int cpu, int type, enum idle_type idle) |
998 | { | 1011 | { |
999 | runqueue_t *rq = cpu_rq(cpu); | 1012 | runqueue_t *rq = cpu_rq(cpu); |
1000 | unsigned long running = rq->nr_running; | 1013 | unsigned long running = rq->nr_running; |
@@ -1281,6 +1294,9 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
1281 | } | 1294 | } |
1282 | } | 1295 | } |
1283 | 1296 | ||
1297 | if (p->last_waker_cpu != this_cpu) | ||
1298 | goto out_set_cpu; | ||
1299 | |||
1284 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | 1300 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
1285 | goto out_set_cpu; | 1301 | goto out_set_cpu; |
1286 | 1302 | ||
@@ -1351,6 +1367,8 @@ out_set_cpu: | |||
1351 | cpu = task_cpu(p); | 1367 | cpu = task_cpu(p); |
1352 | } | 1368 | } |
1353 | 1369 | ||
1370 | p->last_waker_cpu = this_cpu; | ||
1371 | |||
1354 | out_activate: | 1372 | out_activate: |
1355 | #endif /* CONFIG_SMP */ | 1373 | #endif /* CONFIG_SMP */ |
1356 | if (old_state == TASK_UNINTERRUPTIBLE) { | 1374 | if (old_state == TASK_UNINTERRUPTIBLE) { |
@@ -1432,9 +1450,12 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1432 | #ifdef CONFIG_SCHEDSTATS | 1450 | #ifdef CONFIG_SCHEDSTATS |
1433 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1451 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1434 | #endif | 1452 | #endif |
1435 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1453 | #if defined(CONFIG_SMP) |
1454 | p->last_waker_cpu = cpu; | ||
1455 | #if defined(__ARCH_WANT_UNLOCKED_CTXSW) | ||
1436 | p->oncpu = 0; | 1456 | p->oncpu = 0; |
1437 | #endif | 1457 | #endif |
1458 | #endif | ||
1438 | #ifdef CONFIG_PREEMPT | 1459 | #ifdef CONFIG_PREEMPT |
1439 | /* Want to start with kernel preemption disabled. */ | 1460 | /* Want to start with kernel preemption disabled. */ |
1440 | task_thread_info(p)->preempt_count = 1; | 1461 | task_thread_info(p)->preempt_count = 1; |
@@ -1849,7 +1870,7 @@ void sched_exec(void) | |||
1849 | * pull_task - move a task from a remote runqueue to the local runqueue. | 1870 | * pull_task - move a task from a remote runqueue to the local runqueue. |
1850 | * Both runqueues must be locked. | 1871 | * Both runqueues must be locked. |
1851 | */ | 1872 | */ |
1852 | static inline | 1873 | static |
1853 | void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | 1874 | void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, |
1854 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 1875 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) |
1855 | { | 1876 | { |
@@ -1871,7 +1892,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1871 | /* | 1892 | /* |
1872 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 1893 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
1873 | */ | 1894 | */ |
1874 | static inline | 1895 | static |
1875 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 1896 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, |
1876 | struct sched_domain *sd, enum idle_type idle, | 1897 | struct sched_domain *sd, enum idle_type idle, |
1877 | int *all_pinned) | 1898 | int *all_pinned) |
@@ -2357,7 +2378,7 @@ out_balanced: | |||
2357 | * idle_balance is called by schedule() if this_cpu is about to become | 2378 | * idle_balance is called by schedule() if this_cpu is about to become |
2358 | * idle. Attempts to pull tasks from other CPUs. | 2379 | * idle. Attempts to pull tasks from other CPUs. |
2359 | */ | 2380 | */ |
2360 | static inline void idle_balance(int this_cpu, runqueue_t *this_rq) | 2381 | static void idle_balance(int this_cpu, runqueue_t *this_rq) |
2361 | { | 2382 | { |
2362 | struct sched_domain *sd; | 2383 | struct sched_domain *sd; |
2363 | 2384 | ||
@@ -2741,7 +2762,7 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq) | |||
2741 | resched_task(rq->idle); | 2762 | resched_task(rq->idle); |
2742 | } | 2763 | } |
2743 | 2764 | ||
2744 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 2765 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) |
2745 | { | 2766 | { |
2746 | struct sched_domain *tmp, *sd = NULL; | 2767 | struct sched_domain *tmp, *sd = NULL; |
2747 | cpumask_t sibling_map; | 2768 | cpumask_t sibling_map; |
@@ -2795,7 +2816,7 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | |||
2795 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 2816 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
2796 | } | 2817 | } |
2797 | 2818 | ||
2798 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 2819 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) |
2799 | { | 2820 | { |
2800 | struct sched_domain *tmp, *sd = NULL; | 2821 | struct sched_domain *tmp, *sd = NULL; |
2801 | cpumask_t sibling_map; | 2822 | cpumask_t sibling_map; |
@@ -3543,7 +3564,7 @@ void set_user_nice(task_t *p, long nice) | |||
3543 | * The RT priorities are set via sched_setscheduler(), but we still | 3564 | * The RT priorities are set via sched_setscheduler(), but we still |
3544 | * allow the 'normal' nice value to be set - but as expected | 3565 | * allow the 'normal' nice value to be set - but as expected |
3545 | * it wont have any effect on scheduling until the task is | 3566 | * it wont have any effect on scheduling until the task is |
3546 | * not SCHED_NORMAL: | 3567 | * not SCHED_NORMAL/SCHED_BATCH: |
3547 | */ | 3568 | */ |
3548 | if (rt_task(p)) { | 3569 | if (rt_task(p)) { |
3549 | p->static_prio = NICE_TO_PRIO(nice); | 3570 | p->static_prio = NICE_TO_PRIO(nice); |
@@ -3689,10 +3710,16 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
3689 | BUG_ON(p->array); | 3710 | BUG_ON(p->array); |
3690 | p->policy = policy; | 3711 | p->policy = policy; |
3691 | p->rt_priority = prio; | 3712 | p->rt_priority = prio; |
3692 | if (policy != SCHED_NORMAL) | 3713 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { |
3693 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; | 3714 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; |
3694 | else | 3715 | } else { |
3695 | p->prio = p->static_prio; | 3716 | p->prio = p->static_prio; |
3717 | /* | ||
3718 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | ||
3719 | */ | ||
3720 | if (policy == SCHED_BATCH) | ||
3721 | p->sleep_avg = 0; | ||
3722 | } | ||
3696 | } | 3723 | } |
3697 | 3724 | ||
3698 | /** | 3725 | /** |
@@ -3716,29 +3743,35 @@ recheck: | |||
3716 | if (policy < 0) | 3743 | if (policy < 0) |
3717 | policy = oldpolicy = p->policy; | 3744 | policy = oldpolicy = p->policy; |
3718 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 3745 | else if (policy != SCHED_FIFO && policy != SCHED_RR && |
3719 | policy != SCHED_NORMAL) | 3746 | policy != SCHED_NORMAL && policy != SCHED_BATCH) |
3720 | return -EINVAL; | 3747 | return -EINVAL; |
3721 | /* | 3748 | /* |
3722 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 3749 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
3723 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. | 3750 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and |
3751 | * SCHED_BATCH is 0. | ||
3724 | */ | 3752 | */ |
3725 | if (param->sched_priority < 0 || | 3753 | if (param->sched_priority < 0 || |
3726 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 3754 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
3727 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 3755 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
3728 | return -EINVAL; | 3756 | return -EINVAL; |
3729 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) | 3757 | if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) |
3758 | != (param->sched_priority == 0)) | ||
3730 | return -EINVAL; | 3759 | return -EINVAL; |
3731 | 3760 | ||
3732 | /* | 3761 | /* |
3733 | * Allow unprivileged RT tasks to decrease priority: | 3762 | * Allow unprivileged RT tasks to decrease priority: |
3734 | */ | 3763 | */ |
3735 | if (!capable(CAP_SYS_NICE)) { | 3764 | if (!capable(CAP_SYS_NICE)) { |
3736 | /* can't change policy */ | 3765 | /* |
3737 | if (policy != p->policy && | 3766 | * can't change policy, except between SCHED_NORMAL |
3738 | !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | 3767 | * and SCHED_BATCH: |
3768 | */ | ||
3769 | if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && | ||
3770 | (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && | ||
3771 | !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | ||
3739 | return -EPERM; | 3772 | return -EPERM; |
3740 | /* can't increase priority */ | 3773 | /* can't increase priority */ |
3741 | if (policy != SCHED_NORMAL && | 3774 | if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && |
3742 | param->sched_priority > p->rt_priority && | 3775 | param->sched_priority > p->rt_priority && |
3743 | param->sched_priority > | 3776 | param->sched_priority > |
3744 | p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | 3777 | p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) |
@@ -3817,6 +3850,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
3817 | asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | 3850 | asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, |
3818 | struct sched_param __user *param) | 3851 | struct sched_param __user *param) |
3819 | { | 3852 | { |
3853 | /* negative values for policy are not valid */ | ||
3854 | if (policy < 0) | ||
3855 | return -EINVAL; | ||
3856 | |||
3820 | return do_sched_setscheduler(pid, policy, param); | 3857 | return do_sched_setscheduler(pid, policy, param); |
3821 | } | 3858 | } |
3822 | 3859 | ||
@@ -3972,12 +4009,12 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | |||
3972 | * method, such as ACPI for e.g. | 4009 | * method, such as ACPI for e.g. |
3973 | */ | 4010 | */ |
3974 | 4011 | ||
3975 | cpumask_t cpu_present_map; | 4012 | cpumask_t cpu_present_map __read_mostly; |
3976 | EXPORT_SYMBOL(cpu_present_map); | 4013 | EXPORT_SYMBOL(cpu_present_map); |
3977 | 4014 | ||
3978 | #ifndef CONFIG_SMP | 4015 | #ifndef CONFIG_SMP |
3979 | cpumask_t cpu_online_map = CPU_MASK_ALL; | 4016 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; |
3980 | cpumask_t cpu_possible_map = CPU_MASK_ALL; | 4017 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; |
3981 | #endif | 4018 | #endif |
3982 | 4019 | ||
3983 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 4020 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
@@ -4216,6 +4253,7 @@ asmlinkage long sys_sched_get_priority_max(int policy) | |||
4216 | ret = MAX_USER_RT_PRIO-1; | 4253 | ret = MAX_USER_RT_PRIO-1; |
4217 | break; | 4254 | break; |
4218 | case SCHED_NORMAL: | 4255 | case SCHED_NORMAL: |
4256 | case SCHED_BATCH: | ||
4219 | ret = 0; | 4257 | ret = 0; |
4220 | break; | 4258 | break; |
4221 | } | 4259 | } |
@@ -4239,6 +4277,7 @@ asmlinkage long sys_sched_get_priority_min(int policy) | |||
4239 | ret = 1; | 4277 | ret = 1; |
4240 | break; | 4278 | break; |
4241 | case SCHED_NORMAL: | 4279 | case SCHED_NORMAL: |
4280 | case SCHED_BATCH: | ||
4242 | ret = 0; | 4281 | ret = 0; |
4243 | } | 4282 | } |
4244 | return ret; | 4283 | return ret; |
@@ -4379,6 +4418,7 @@ void show_state(void) | |||
4379 | } while_each_thread(g, p); | 4418 | } while_each_thread(g, p); |
4380 | 4419 | ||
4381 | read_unlock(&tasklist_lock); | 4420 | read_unlock(&tasklist_lock); |
4421 | mutex_debug_show_all_locks(); | ||
4382 | } | 4422 | } |
4383 | 4423 | ||
4384 | /** | 4424 | /** |
@@ -5073,7 +5113,470 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, | |||
5073 | 5113 | ||
5074 | #define SD_NODES_PER_DOMAIN 16 | 5114 | #define SD_NODES_PER_DOMAIN 16 |
5075 | 5115 | ||
5116 | /* | ||
5117 | * Self-tuning task migration cost measurement between source and target CPUs. | ||
5118 | * | ||
5119 | * This is done by measuring the cost of manipulating buffers of varying | ||
5120 | * sizes. For a given buffer-size here are the steps that are taken: | ||
5121 | * | ||
5122 | * 1) the source CPU reads+dirties a shared buffer | ||
5123 | * 2) the target CPU reads+dirties the same shared buffer | ||
5124 | * | ||
5125 | * We measure how long they take, in the following 4 scenarios: | ||
5126 | * | ||
5127 | * - source: CPU1, target: CPU2 | cost1 | ||
5128 | * - source: CPU2, target: CPU1 | cost2 | ||
5129 | * - source: CPU1, target: CPU1 | cost3 | ||
5130 | * - source: CPU2, target: CPU2 | cost4 | ||
5131 | * | ||
5132 | * We then calculate the cost1+cost2-cost3-cost4 difference - this is | ||
5133 | * the cost of migration. | ||
5134 | * | ||
5135 | * We then start off from a small buffer-size and iterate up to larger | ||
5136 | * buffer sizes, in 5% steps - measuring each buffer-size separately, and | ||
5137 | * doing a maximum search for the cost. (The maximum cost for a migration | ||
5138 | * normally occurs when the working set size is around the effective cache | ||
5139 | * size.) | ||
5140 | */ | ||
5141 | #define SEARCH_SCOPE 2 | ||
5142 | #define MIN_CACHE_SIZE (64*1024U) | ||
5143 | #define DEFAULT_CACHE_SIZE (5*1024*1024U) | ||
5144 | #define ITERATIONS 2 | ||
5145 | #define SIZE_THRESH 130 | ||
5146 | #define COST_THRESH 130 | ||
5147 | |||
5148 | /* | ||
5149 | * The migration cost is a function of 'domain distance'. Domain | ||
5150 | * distance is the number of steps a CPU has to iterate down its | ||
5151 | * domain tree to share a domain with the other CPU. The farther | ||
5152 | * two CPUs are from each other, the larger the distance gets. | ||
5153 | * | ||
5154 | * Note that we use the distance only to cache measurement results, | ||
5155 | * the distance value is not used numerically otherwise. When two | ||
5156 | * CPUs have the same distance it is assumed that the migration | ||
5157 | * cost is the same. (this is a simplification but quite practical) | ||
5158 | */ | ||
5159 | #define MAX_DOMAIN_DISTANCE 32 | ||
5160 | |||
5161 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = | ||
5162 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; | ||
5163 | |||
5164 | /* | ||
5165 | * Allow override of migration cost - in units of microseconds. | ||
5166 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost | ||
5167 | * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: | ||
5168 | */ | ||
5169 | static int __init migration_cost_setup(char *str) | ||
5170 | { | ||
5171 | int ints[MAX_DOMAIN_DISTANCE+1], i; | ||
5172 | |||
5173 | str = get_options(str, ARRAY_SIZE(ints), ints); | ||
5174 | |||
5175 | printk("#ints: %d\n", ints[0]); | ||
5176 | for (i = 1; i <= ints[0]; i++) { | ||
5177 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; | ||
5178 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); | ||
5179 | } | ||
5180 | return 1; | ||
5181 | } | ||
5182 | |||
5183 | __setup ("migration_cost=", migration_cost_setup); | ||
5184 | |||
5185 | /* | ||
5186 | * Global multiplier (divisor) for migration-cutoff values, | ||
5187 | * in percent. E.g. use a value of 150 to get 1.5 times | ||
5188 | * longer cache-hot cutoff times. | ||
5189 | * | ||
5190 | * (We scale it from 100 to 128 to make long long handling easier.) | ||
5191 | */ | ||
5192 | |||
5193 | #define MIGRATION_FACTOR_SCALE 128 | ||
5194 | |||
5195 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; | ||
5196 | |||
5197 | static int __init setup_migration_factor(char *str) | ||
5198 | { | ||
5199 | get_option(&str, &migration_factor); | ||
5200 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; | ||
5201 | return 1; | ||
5202 | } | ||
5203 | |||
5204 | __setup("migration_factor=", setup_migration_factor); | ||
5205 | |||
5206 | /* | ||
5207 | * Estimated distance of two CPUs, measured via the number of domains | ||
5208 | * we have to pass for the two CPUs to be in the same span: | ||
5209 | */ | ||
5210 | static unsigned long domain_distance(int cpu1, int cpu2) | ||
5211 | { | ||
5212 | unsigned long distance = 0; | ||
5213 | struct sched_domain *sd; | ||
5214 | |||
5215 | for_each_domain(cpu1, sd) { | ||
5216 | WARN_ON(!cpu_isset(cpu1, sd->span)); | ||
5217 | if (cpu_isset(cpu2, sd->span)) | ||
5218 | return distance; | ||
5219 | distance++; | ||
5220 | } | ||
5221 | if (distance >= MAX_DOMAIN_DISTANCE) { | ||
5222 | WARN_ON(1); | ||
5223 | distance = MAX_DOMAIN_DISTANCE-1; | ||
5224 | } | ||
5225 | |||
5226 | return distance; | ||
5227 | } | ||
5228 | |||
5229 | static unsigned int migration_debug; | ||
5230 | |||
5231 | static int __init setup_migration_debug(char *str) | ||
5232 | { | ||
5233 | get_option(&str, &migration_debug); | ||
5234 | return 1; | ||
5235 | } | ||
5236 | |||
5237 | __setup("migration_debug=", setup_migration_debug); | ||
5238 | |||
5239 | /* | ||
5240 | * Maximum cache-size that the scheduler should try to measure. | ||
5241 | * Architectures with larger caches should tune this up during | ||
5242 | * bootup. Gets used in the domain-setup code (i.e. during SMP | ||
5243 | * bootup). | ||
5244 | */ | ||
5245 | unsigned int max_cache_size; | ||
5246 | |||
5247 | static int __init setup_max_cache_size(char *str) | ||
5248 | { | ||
5249 | get_option(&str, &max_cache_size); | ||
5250 | return 1; | ||
5251 | } | ||
5252 | |||
5253 | __setup("max_cache_size=", setup_max_cache_size); | ||
5254 | |||
5255 | /* | ||
5256 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This | ||
5257 | * is the operation that is timed, so we try to generate unpredictable | ||
5258 | * cachemisses that still end up filling the L2 cache: | ||
5259 | */ | ||
5260 | static void touch_cache(void *__cache, unsigned long __size) | ||
5261 | { | ||
5262 | unsigned long size = __size/sizeof(long), chunk1 = size/3, | ||
5263 | chunk2 = 2*size/3; | ||
5264 | unsigned long *cache = __cache; | ||
5265 | int i; | ||
5266 | |||
5267 | for (i = 0; i < size/6; i += 8) { | ||
5268 | switch (i % 6) { | ||
5269 | case 0: cache[i]++; | ||
5270 | case 1: cache[size-1-i]++; | ||
5271 | case 2: cache[chunk1-i]++; | ||
5272 | case 3: cache[chunk1+i]++; | ||
5273 | case 4: cache[chunk2-i]++; | ||
5274 | case 5: cache[chunk2+i]++; | ||
5275 | } | ||
5276 | } | ||
5277 | } | ||
5278 | |||
5279 | /* | ||
5280 | * Measure the cache-cost of one task migration. Returns in units of nsec. | ||
5281 | */ | ||
5282 | static unsigned long long measure_one(void *cache, unsigned long size, | ||
5283 | int source, int target) | ||
5284 | { | ||
5285 | cpumask_t mask, saved_mask; | ||
5286 | unsigned long long t0, t1, t2, t3, cost; | ||
5287 | |||
5288 | saved_mask = current->cpus_allowed; | ||
5289 | |||
5290 | /* | ||
5291 | * Flush source caches to RAM and invalidate them: | ||
5292 | */ | ||
5293 | sched_cacheflush(); | ||
5294 | |||
5295 | /* | ||
5296 | * Migrate to the source CPU: | ||
5297 | */ | ||
5298 | mask = cpumask_of_cpu(source); | ||
5299 | set_cpus_allowed(current, mask); | ||
5300 | WARN_ON(smp_processor_id() != source); | ||
5301 | |||
5302 | /* | ||
5303 | * Dirty the working set: | ||
5304 | */ | ||
5305 | t0 = sched_clock(); | ||
5306 | touch_cache(cache, size); | ||
5307 | t1 = sched_clock(); | ||
5308 | |||
5309 | /* | ||
5310 | * Migrate to the target CPU, dirty the L2 cache and access | ||
5311 | * the shared buffer. (which represents the working set | ||
5312 | * of a migrated task.) | ||
5313 | */ | ||
5314 | mask = cpumask_of_cpu(target); | ||
5315 | set_cpus_allowed(current, mask); | ||
5316 | WARN_ON(smp_processor_id() != target); | ||
5317 | |||
5318 | t2 = sched_clock(); | ||
5319 | touch_cache(cache, size); | ||
5320 | t3 = sched_clock(); | ||
5321 | |||
5322 | cost = t1-t0 + t3-t2; | ||
5323 | |||
5324 | if (migration_debug >= 2) | ||
5325 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", | ||
5326 | source, target, t1-t0, t1-t0, t3-t2, cost); | ||
5327 | /* | ||
5328 | * Flush target caches to RAM and invalidate them: | ||
5329 | */ | ||
5330 | sched_cacheflush(); | ||
5331 | |||
5332 | set_cpus_allowed(current, saved_mask); | ||
5333 | |||
5334 | return cost; | ||
5335 | } | ||
5336 | |||
5337 | /* | ||
5338 | * Measure a series of task migrations and return the average | ||
5339 | * result. Since this code runs early during bootup the system | ||
5340 | * is 'undisturbed' and the average latency makes sense. | ||
5341 | * | ||
5342 | * The algorithm in essence auto-detects the relevant cache-size, | ||
5343 | * so it will properly detect different cachesizes for different | ||
5344 | * cache-hierarchies, depending on how the CPUs are connected. | ||
5345 | * | ||
5346 | * Architectures can prime the upper limit of the search range via | ||
5347 | * max_cache_size, otherwise the search range defaults to 64K...10MB. | ||
5348 | */ | ||
5349 | static unsigned long long | ||
5350 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | ||
5351 | { | ||
5352 | unsigned long long cost1, cost2; | ||
5353 | int i; | ||
5354 | |||
5355 | /* | ||
5356 | * Measure the migration cost of 'size' bytes, over an | ||
5357 | * average of ITERATIONS runs in each direction: | ||
5358 | * | ||
5359 | * (We perturb the cache size by a small (1k per run) | ||
5360 | * value to compensate for size/alignment related artifacts. | ||
5361 | * We also subtract the cost of the operation done on | ||
5362 | * the same CPU.) | ||
5363 | */ | ||
5364 | cost1 = 0; | ||
5365 | |||
5366 | /* | ||
5367 | * dry run, to make sure we start off cache-cold on cpu1, | ||
5368 | * and to get any vmalloc pagefaults in advance: | ||
5369 | */ | ||
5370 | measure_one(cache, size, cpu1, cpu2); | ||
5371 | for (i = 0; i < ITERATIONS; i++) | ||
5372 | cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); | ||
5373 | |||
5374 | measure_one(cache, size, cpu2, cpu1); | ||
5375 | for (i = 0; i < ITERATIONS; i++) | ||
5376 | cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); | ||
5377 | |||
5378 | /* | ||
5379 | * (We measure the non-migrating [cached] cost on both | ||
5380 | * cpu1 and cpu2, to handle CPUs with different speeds) | ||
5381 | */ | ||
5382 | cost2 = 0; | ||
5383 | |||
5384 | measure_one(cache, size, cpu1, cpu1); | ||
5385 | for (i = 0; i < ITERATIONS; i++) | ||
5386 | cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); | ||
5387 | |||
5388 | measure_one(cache, size, cpu2, cpu2); | ||
5389 | for (i = 0; i < ITERATIONS; i++) | ||
5390 | cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); | ||
5391 | |||
5392 | /* | ||
5393 | * Get the per-iteration migration cost: | ||
5394 | */ | ||
5395 | do_div(cost1, 2*ITERATIONS); | ||
5396 | do_div(cost2, 2*ITERATIONS); | ||
5397 | |||
5398 | return cost1 - cost2; | ||
5399 | } | ||
5400 | |||
5401 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) | ||
5402 | { | ||
5403 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; | ||
5404 | unsigned int max_size, size, size_found = 0; | ||
5405 | long long cost = 0, prev_cost; | ||
5406 | void *cache; | ||
5407 | |||
5408 | /* | ||
5409 | * Search from max_cache_size/SEARCH_SCOPE (but at least 64K) up to | ||
5410 | * max_cache_size*SEARCH_SCOPE - the real relevant cachesize has to lie somewhere in between. | ||
5411 | */ | ||
5412 | if (max_cache_size) { | ||
5413 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
5414 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); | ||
5415 | } else { | ||
5416 | /* | ||
5417 | * Since we have no estimation about the relevant | ||
5418 | * search range, fall back to the default one: | ||
5419 | */ | ||
5420 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; | ||
5421 | size = MIN_CACHE_SIZE; | ||
5422 | } | ||
5423 | |||
5424 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { | ||
5425 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); | ||
5426 | return 0; | ||
5427 | } | ||
5428 | |||
5429 | /* | ||
5430 | * Allocate the working set: | ||
5431 | */ | ||
5432 | cache = vmalloc(max_size); | ||
5433 | if (!cache) { | ||
5434 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | ||
5435 | return 1000000; // return 1 msec on very small boxen | ||
5436 | } | ||
5437 | |||
5438 | while (size <= max_size) { | ||
5439 | prev_cost = cost; | ||
5440 | cost = measure_cost(cpu1, cpu2, cache, size); | ||
5441 | |||
5442 | /* | ||
5443 | * Update the max: | ||
5444 | */ | ||
5445 | if (cost > 0) { | ||
5446 | if (max_cost < cost) { | ||
5447 | max_cost = cost; | ||
5448 | size_found = size; | ||
5449 | } | ||
5450 | } | ||
5451 | /* | ||
5452 | * Calculate average fluctuation, we use this to prevent | ||
5453 | * noise from triggering an early break out of the loop: | ||
5454 | */ | ||
5455 | fluct = abs(cost - prev_cost); | ||
5456 | avg_fluct = (avg_fluct + fluct)/2; | ||
5457 | |||
5458 | if (migration_debug) | ||
5459 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", | ||
5460 | cpu1, cpu2, size, | ||
5461 | (long)cost / 1000000, | ||
5462 | ((long)cost / 100000) % 10, | ||
5463 | (long)max_cost / 1000000, | ||
5464 | ((long)max_cost / 100000) % 10, | ||
5465 | domain_distance(cpu1, cpu2), | ||
5466 | cost, avg_fluct); | ||
5467 | |||
5468 | /* | ||
5469 | * If we iterated more than 30% past the size of the previous maximum, | ||
5470 | * and the maximum exceeds the current cost by more than 30% | ||
5471 | * (taking fluctuations into account) then we assume to | ||
5472 | * have found the maximum and break out of the loop early: | ||
5473 | */ | ||
5474 | if (size_found && (size*100 > size_found*SIZE_THRESH)) | ||
5475 | if (cost+avg_fluct <= 0 || | ||
5476 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { | ||
5477 | |||
5478 | if (migration_debug) | ||
5479 | printk("-> found max.\n"); | ||
5480 | break; | ||
5481 | } | ||
5482 | /* | ||
5483 | * Increase the cachesize in 5% steps: | ||
5484 | */ | ||
5485 | size = size * 20 / 19; | ||
5486 | } | ||
5487 | |||
5488 | if (migration_debug) | ||
5489 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", | ||
5490 | cpu1, cpu2, size_found, max_cost); | ||
5491 | |||
5492 | vfree(cache); | ||
5493 | |||
5494 | /* | ||
5495 | * A task is considered 'cache cold' if at least 2 times | ||
5496 | * the worst-case cost of migration has passed. | ||
5497 | * | ||
5498 | * (this limit is only listened to if the load-balancing | ||
5499 | * situation is 'nice' - if there is a large imbalance we | ||
5500 | * ignore it for the sake of CPU utilization and | ||
5501 | * processing fairness.) | ||
5502 | */ | ||
5503 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; | ||
5504 | } | ||
5505 | |||
5506 | static void calibrate_migration_costs(const cpumask_t *cpu_map) | ||
5507 | { | ||
5508 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); | ||
5509 | unsigned long j0, j1, distance, max_distance = 0; | ||
5510 | struct sched_domain *sd; | ||
5511 | |||
5512 | j0 = jiffies; | ||
5513 | |||
5514 | /* | ||
5515 | * First pass - calculate the cacheflush times: | ||
5516 | */ | ||
5517 | for_each_cpu_mask(cpu1, *cpu_map) { | ||
5518 | for_each_cpu_mask(cpu2, *cpu_map) { | ||
5519 | if (cpu1 == cpu2) | ||
5520 | continue; | ||
5521 | distance = domain_distance(cpu1, cpu2); | ||
5522 | max_distance = max(max_distance, distance); | ||
5523 | /* | ||
5524 | * No result cached yet? | ||
5525 | */ | ||
5526 | if (migration_cost[distance] == -1LL) | ||
5527 | migration_cost[distance] = | ||
5528 | measure_migration_cost(cpu1, cpu2); | ||
5529 | } | ||
5530 | } | ||
5531 | /* | ||
5532 | * Second pass - update the sched domain hierarchy with | ||
5533 | * the new cache-hot-time estimations: | ||
5534 | */ | ||
5535 | for_each_cpu_mask(cpu, *cpu_map) { | ||
5536 | distance = 0; | ||
5537 | for_each_domain(cpu, sd) { | ||
5538 | sd->cache_hot_time = migration_cost[distance]; | ||
5539 | distance++; | ||
5540 | } | ||
5541 | } | ||
5542 | /* | ||
5543 | * Print the matrix: | ||
5544 | */ | ||
5545 | if (migration_debug) | ||
5546 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", | ||
5547 | max_cache_size, | ||
5548 | #ifdef CONFIG_X86 | ||
5549 | cpu_khz/1000 | ||
5550 | #else | ||
5551 | -1 | ||
5552 | #endif | ||
5553 | ); | ||
5554 | printk("migration_cost="); | ||
5555 | for (distance = 0; distance <= max_distance; distance++) { | ||
5556 | if (distance) | ||
5557 | printk(","); | ||
5558 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
5559 | } | ||
5560 | printk("\n"); | ||
5561 | j1 = jiffies; | ||
5562 | if (migration_debug) | ||
5563 | printk("migration: %ld seconds\n", (j1-j0)/HZ); | ||
5564 | |||
5565 | /* | ||
5566 | * Move back to the original CPU. NUMA-Q gets confused | ||
5567 | * if we migrate to another quad during bootup. | ||
5568 | */ | ||
5569 | if (raw_smp_processor_id() != orig_cpu) { | ||
5570 | cpumask_t mask = cpumask_of_cpu(orig_cpu), | ||
5571 | saved_mask = current->cpus_allowed; | ||
5572 | |||
5573 | set_cpus_allowed(current, mask); | ||
5574 | set_cpus_allowed(current, saved_mask); | ||
5575 | } | ||
5576 | } | ||
5577 | |||
5076 | #ifdef CONFIG_NUMA | 5578 | #ifdef CONFIG_NUMA |
5579 | |||
5077 | /** | 5580 | /** |
5078 | * find_next_best_node - find the next node to include in a sched_domain | 5581 | * find_next_best_node - find the next node to include in a sched_domain |
5079 | * @node: node whose sched_domain we're building | 5582 | * @node: node whose sched_domain we're building |
@@ -5439,6 +5942,10 @@ next_sg: | |||
5439 | #endif | 5942 | #endif |
5440 | cpu_attach_domain(sd, i); | 5943 | cpu_attach_domain(sd, i); |
5441 | } | 5944 | } |
5945 | /* | ||
5946 | * Tune cache-hot values: | ||
5947 | */ | ||
5948 | calibrate_migration_costs(cpu_map); | ||
5442 | } | 5949 | } |
5443 | /* | 5950 | /* |
5444 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 5951 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
@@ -5505,7 +6012,7 @@ next_sg: | |||
5505 | * Detach sched domains from a group of cpus specified in cpu_map | 6012 | * Detach sched domains from a group of cpus specified in cpu_map |
5506 | * These cpus will now be attached to the NULL domain | 6013 | * These cpus will now be attached to the NULL domain |
5507 | */ | 6014 | */ |
5508 | static inline void detach_destroy_domains(const cpumask_t *cpu_map) | 6015 | static void detach_destroy_domains(const cpumask_t *cpu_map) |
5509 | { | 6016 | { |
5510 | int i; | 6017 | int i; |
5511 | 6018 | ||
diff --git a/kernel/signal.c b/kernel/signal.c index d7611f189ef7..d3efafd8109a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/posix-timers.h> | 25 | #include <linux/posix-timers.h> |
26 | #include <linux/signal.h> | 26 | #include <linux/signal.h> |
27 | #include <linux/audit.h> | 27 | #include <linux/audit.h> |
28 | #include <linux/capability.h> | ||
28 | #include <asm/param.h> | 29 | #include <asm/param.h> |
29 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
30 | #include <asm/unistd.h> | 31 | #include <asm/unistd.h> |
@@ -329,13 +330,20 @@ void __exit_sighand(struct task_struct *tsk) | |||
329 | /* Ok, we're done with the signal handlers */ | 330 | /* Ok, we're done with the signal handlers */ |
330 | tsk->sighand = NULL; | 331 | tsk->sighand = NULL; |
331 | if (atomic_dec_and_test(&sighand->count)) | 332 | if (atomic_dec_and_test(&sighand->count)) |
332 | kmem_cache_free(sighand_cachep, sighand); | 333 | sighand_free(sighand); |
333 | } | 334 | } |
334 | 335 | ||
335 | void exit_sighand(struct task_struct *tsk) | 336 | void exit_sighand(struct task_struct *tsk) |
336 | { | 337 | { |
337 | write_lock_irq(&tasklist_lock); | 338 | write_lock_irq(&tasklist_lock); |
338 | __exit_sighand(tsk); | 339 | rcu_read_lock(); |
340 | if (tsk->sighand != NULL) { | ||
341 | struct sighand_struct *sighand = rcu_dereference(tsk->sighand); | ||
342 | spin_lock(&sighand->siglock); | ||
343 | __exit_sighand(tsk); | ||
344 | spin_unlock(&sighand->siglock); | ||
345 | } | ||
346 | rcu_read_unlock(); | ||
339 | write_unlock_irq(&tasklist_lock); | 347 | write_unlock_irq(&tasklist_lock); |
340 | } | 348 | } |
341 | 349 | ||
@@ -345,19 +353,20 @@ void exit_sighand(struct task_struct *tsk) | |||
345 | void __exit_signal(struct task_struct *tsk) | 353 | void __exit_signal(struct task_struct *tsk) |
346 | { | 354 | { |
347 | struct signal_struct * sig = tsk->signal; | 355 | struct signal_struct * sig = tsk->signal; |
348 | struct sighand_struct * sighand = tsk->sighand; | 356 | struct sighand_struct * sighand; |
349 | 357 | ||
350 | if (!sig) | 358 | if (!sig) |
351 | BUG(); | 359 | BUG(); |
352 | if (!atomic_read(&sig->count)) | 360 | if (!atomic_read(&sig->count)) |
353 | BUG(); | 361 | BUG(); |
362 | rcu_read_lock(); | ||
363 | sighand = rcu_dereference(tsk->sighand); | ||
354 | spin_lock(&sighand->siglock); | 364 | spin_lock(&sighand->siglock); |
355 | posix_cpu_timers_exit(tsk); | 365 | posix_cpu_timers_exit(tsk); |
356 | if (atomic_dec_and_test(&sig->count)) { | 366 | if (atomic_dec_and_test(&sig->count)) { |
357 | posix_cpu_timers_exit_group(tsk); | 367 | posix_cpu_timers_exit_group(tsk); |
358 | if (tsk == sig->curr_target) | ||
359 | sig->curr_target = next_thread(tsk); | ||
360 | tsk->signal = NULL; | 368 | tsk->signal = NULL; |
369 | __exit_sighand(tsk); | ||
361 | spin_unlock(&sighand->siglock); | 370 | spin_unlock(&sighand->siglock); |
362 | flush_sigqueue(&sig->shared_pending); | 371 | flush_sigqueue(&sig->shared_pending); |
363 | } else { | 372 | } else { |
@@ -389,9 +398,11 @@ void __exit_signal(struct task_struct *tsk) | |||
389 | sig->nvcsw += tsk->nvcsw; | 398 | sig->nvcsw += tsk->nvcsw; |
390 | sig->nivcsw += tsk->nivcsw; | 399 | sig->nivcsw += tsk->nivcsw; |
391 | sig->sched_time += tsk->sched_time; | 400 | sig->sched_time += tsk->sched_time; |
401 | __exit_sighand(tsk); | ||
392 | spin_unlock(&sighand->siglock); | 402 | spin_unlock(&sighand->siglock); |
393 | sig = NULL; /* Marker for below. */ | 403 | sig = NULL; /* Marker for below. */ |
394 | } | 404 | } |
405 | rcu_read_unlock(); | ||
395 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); | 406 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); |
396 | flush_sigqueue(&tsk->pending); | 407 | flush_sigqueue(&tsk->pending); |
397 | if (sig) { | 408 | if (sig) { |
@@ -465,7 +476,7 @@ unblock_all_signals(void) | |||
465 | spin_unlock_irqrestore(¤t->sighand->siglock, flags); | 476 | spin_unlock_irqrestore(¤t->sighand->siglock, flags); |
466 | } | 477 | } |
467 | 478 | ||
468 | static inline int collect_signal(int sig, struct sigpending *list, siginfo_t *info) | 479 | static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) |
469 | { | 480 | { |
470 | struct sigqueue *q, *first = NULL; | 481 | struct sigqueue *q, *first = NULL; |
471 | int still_pending = 0; | 482 | int still_pending = 0; |
@@ -613,6 +624,33 @@ void signal_wake_up(struct task_struct *t, int resume) | |||
613 | * Returns 1 if any signals were found. | 624 | * Returns 1 if any signals were found. |
614 | * | 625 | * |
615 | * All callers must be holding the siglock. | 626 | * All callers must be holding the siglock. |
627 | * | ||
628 | * This version takes a sigset mask and looks at all signals, | ||
629 | * not just those in the first mask word. | ||
630 | */ | ||
631 | static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) | ||
632 | { | ||
633 | struct sigqueue *q, *n; | ||
634 | sigset_t m; | ||
635 | |||
636 | sigandsets(&m, mask, &s->signal); | ||
637 | if (sigisemptyset(&m)) | ||
638 | return 0; | ||
639 | |||
640 | signandsets(&s->signal, &s->signal, mask); | ||
641 | list_for_each_entry_safe(q, n, &s->list, list) { | ||
642 | if (sigismember(mask, q->info.si_signo)) { | ||
643 | list_del_init(&q->list); | ||
644 | __sigqueue_free(q); | ||
645 | } | ||
646 | } | ||
647 | return 1; | ||
648 | } | ||
649 | /* | ||
650 | * Remove signals in mask from the pending set and queue. | ||
651 | * Returns 1 if any signals were found. | ||
652 | * | ||
653 | * All callers must be holding the siglock. | ||
616 | */ | 654 | */ |
617 | static int rm_from_queue(unsigned long mask, struct sigpending *s) | 655 | static int rm_from_queue(unsigned long mask, struct sigpending *s) |
618 | { | 656 | { |
@@ -1080,18 +1118,29 @@ void zap_other_threads(struct task_struct *p) | |||
1080 | } | 1118 | } |
1081 | 1119 | ||
1082 | /* | 1120 | /* |
1083 | * Must be called with the tasklist_lock held for reading! | 1121 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. |
1084 | */ | 1122 | */ |
1085 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | 1123 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) |
1086 | { | 1124 | { |
1087 | unsigned long flags; | 1125 | unsigned long flags; |
1126 | struct sighand_struct *sp; | ||
1088 | int ret; | 1127 | int ret; |
1089 | 1128 | ||
1129 | retry: | ||
1090 | ret = check_kill_permission(sig, info, p); | 1130 | ret = check_kill_permission(sig, info, p); |
1091 | if (!ret && sig && p->sighand) { | 1131 | if (!ret && sig && (sp = rcu_dereference(p->sighand))) { |
1092 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1132 | spin_lock_irqsave(&sp->siglock, flags); |
1133 | if (p->sighand != sp) { | ||
1134 | spin_unlock_irqrestore(&sp->siglock, flags); | ||
1135 | goto retry; | ||
1136 | } | ||
1137 | if ((atomic_read(&sp->count) == 0) || | ||
1138 | (atomic_read(&p->usage) == 0)) { | ||
1139 | spin_unlock_irqrestore(&sp->siglock, flags); | ||
1140 | return -ESRCH; | ||
1141 | } | ||
1093 | ret = __group_send_sig_info(sig, info, p); | 1142 | ret = __group_send_sig_info(sig, info, p); |
1094 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1143 | spin_unlock_irqrestore(&sp->siglock, flags); |
1095 | } | 1144 | } |
1096 | 1145 | ||
1097 | return ret; | 1146 | return ret; |
@@ -1136,14 +1185,21 @@ int | |||
1136 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) | 1185 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) |
1137 | { | 1186 | { |
1138 | int error; | 1187 | int error; |
1188 | int acquired_tasklist_lock = 0; | ||
1139 | struct task_struct *p; | 1189 | struct task_struct *p; |
1140 | 1190 | ||
1141 | read_lock(&tasklist_lock); | 1191 | rcu_read_lock(); |
1192 | if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { | ||
1193 | read_lock(&tasklist_lock); | ||
1194 | acquired_tasklist_lock = 1; | ||
1195 | } | ||
1142 | p = find_task_by_pid(pid); | 1196 | p = find_task_by_pid(pid); |
1143 | error = -ESRCH; | 1197 | error = -ESRCH; |
1144 | if (p) | 1198 | if (p) |
1145 | error = group_send_sig_info(sig, info, p); | 1199 | error = group_send_sig_info(sig, info, p); |
1146 | read_unlock(&tasklist_lock); | 1200 | if (unlikely(acquired_tasklist_lock)) |
1201 | read_unlock(&tasklist_lock); | ||
1202 | rcu_read_unlock(); | ||
1147 | return error; | 1203 | return error; |
1148 | } | 1204 | } |
1149 | 1205 | ||
@@ -1163,8 +1219,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | |||
1163 | ret = -ESRCH; | 1219 | ret = -ESRCH; |
1164 | goto out_unlock; | 1220 | goto out_unlock; |
1165 | } | 1221 | } |
1166 | if ((!info || ((unsigned long)info != 1 && | 1222 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
1167 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
1168 | && (euid != p->suid) && (euid != p->uid) | 1223 | && (euid != p->suid) && (euid != p->uid) |
1169 | && (uid != p->suid) && (uid != p->uid)) { | 1224 | && (uid != p->suid) && (uid != p->uid)) { |
1170 | ret = -EPERM; | 1225 | ret = -EPERM; |
@@ -1355,16 +1410,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1355 | { | 1410 | { |
1356 | unsigned long flags; | 1411 | unsigned long flags; |
1357 | int ret = 0; | 1412 | int ret = 0; |
1413 | struct sighand_struct *sh; | ||
1358 | 1414 | ||
1359 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1415 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
1360 | read_lock(&tasklist_lock); | 1416 | |
1417 | /* | ||
1418 | * The rcu based delayed sighand destroy makes it possible to | ||
1419 | * run this without tasklist lock held. The task struct itself | ||
1420 | * cannot go away as create_timer did get_task_struct(). | ||
1421 | * | ||
1422 | * We return -1, when the task is marked exiting, so | ||
1423 | * posix_timer_event can redirect it to the group leader | ||
1424 | */ | ||
1425 | rcu_read_lock(); | ||
1361 | 1426 | ||
1362 | if (unlikely(p->flags & PF_EXITING)) { | 1427 | if (unlikely(p->flags & PF_EXITING)) { |
1363 | ret = -1; | 1428 | ret = -1; |
1364 | goto out_err; | 1429 | goto out_err; |
1365 | } | 1430 | } |
1366 | 1431 | ||
1367 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1432 | retry: |
1433 | sh = rcu_dereference(p->sighand); | ||
1434 | |||
1435 | spin_lock_irqsave(&sh->siglock, flags); | ||
1436 | if (p->sighand != sh) { | ||
1437 | /* We raced with exec() in a multithreaded process... */ | ||
1438 | spin_unlock_irqrestore(&sh->siglock, flags); | ||
1439 | goto retry; | ||
1440 | } | ||
1441 | |||
1442 | /* | ||
1443 | * We do the check here again to handle the following scenario: | ||
1444 | * | ||
1445 | * CPU 0 CPU 1 | ||
1446 | * send_sigqueue | ||
1447 | * check PF_EXITING | ||
1448 | * interrupt exit code running | ||
1449 | * __exit_signal | ||
1450 | * lock sighand->siglock | ||
1451 | * unlock sighand->siglock | ||
1452 | * lock sh->siglock | ||
1453 | * add(tsk->pending) flush_sigqueue(tsk->pending) | ||
1454 | * | ||
1455 | */ | ||
1456 | |||
1457 | if (unlikely(p->flags & PF_EXITING)) { | ||
1458 | ret = -1; | ||
1459 | goto out; | ||
1460 | } | ||
1368 | 1461 | ||
1369 | if (unlikely(!list_empty(&q->list))) { | 1462 | if (unlikely(!list_empty(&q->list))) { |
1370 | /* | 1463 | /* |
@@ -1388,9 +1481,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1388 | signal_wake_up(p, sig == SIGKILL); | 1481 | signal_wake_up(p, sig == SIGKILL); |
1389 | 1482 | ||
1390 | out: | 1483 | out: |
1391 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1484 | spin_unlock_irqrestore(&sh->siglock, flags); |
1392 | out_err: | 1485 | out_err: |
1393 | read_unlock(&tasklist_lock); | 1486 | rcu_read_unlock(); |
1394 | 1487 | ||
1395 | return ret; | 1488 | return ret; |
1396 | } | 1489 | } |
@@ -1402,7 +1495,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1402 | int ret = 0; | 1495 | int ret = 0; |
1403 | 1496 | ||
1404 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1497 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
1498 | |||
1405 | read_lock(&tasklist_lock); | 1499 | read_lock(&tasklist_lock); |
1500 | /* Since it_lock is held, p->sighand cannot be NULL. */ | ||
1406 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1501 | spin_lock_irqsave(&p->sighand->siglock, flags); |
1407 | handle_stop_signal(sig, p); | 1502 | handle_stop_signal(sig, p); |
1408 | 1503 | ||
@@ -1436,7 +1531,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1436 | out: | 1531 | out: |
1437 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1532 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
1438 | read_unlock(&tasklist_lock); | 1533 | read_unlock(&tasklist_lock); |
1439 | return(ret); | 1534 | return ret; |
1440 | } | 1535 | } |
1441 | 1536 | ||
1442 | /* | 1537 | /* |
@@ -1786,7 +1881,7 @@ do_signal_stop(int signr) | |||
1786 | * We return zero if we still hold the siglock and should look | 1881 | * We return zero if we still hold the siglock and should look |
1787 | * for another signal without checking group_stop_count again. | 1882 | * for another signal without checking group_stop_count again. |
1788 | */ | 1883 | */ |
1789 | static inline int handle_group_stop(void) | 1884 | static int handle_group_stop(void) |
1790 | { | 1885 | { |
1791 | int stop_count; | 1886 | int stop_count; |
1792 | 1887 | ||
@@ -2338,6 +2433,7 @@ int | |||
2338 | do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | 2433 | do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) |
2339 | { | 2434 | { |
2340 | struct k_sigaction *k; | 2435 | struct k_sigaction *k; |
2436 | sigset_t mask; | ||
2341 | 2437 | ||
2342 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) | 2438 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) |
2343 | return -EINVAL; | 2439 | return -EINVAL; |
@@ -2385,9 +2481,11 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | |||
2385 | *k = *act; | 2481 | *k = *act; |
2386 | sigdelsetmask(&k->sa.sa_mask, | 2482 | sigdelsetmask(&k->sa.sa_mask, |
2387 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | 2483 | sigmask(SIGKILL) | sigmask(SIGSTOP)); |
2388 | rm_from_queue(sigmask(sig), &t->signal->shared_pending); | 2484 | sigemptyset(&mask); |
2485 | sigaddset(&mask, sig); | ||
2486 | rm_from_queue_full(&mask, &t->signal->shared_pending); | ||
2389 | do { | 2487 | do { |
2390 | rm_from_queue(sigmask(sig), &t->pending); | 2488 | rm_from_queue_full(&mask, &t->pending); |
2391 | recalc_sigpending_tsk(t); | 2489 | recalc_sigpending_tsk(t); |
2392 | t = next_thread(t); | 2490 | t = next_thread(t); |
2393 | } while (t != current); | 2491 | } while (t != current); |
@@ -2623,6 +2721,32 @@ sys_pause(void) | |||
2623 | 2721 | ||
2624 | #endif | 2722 | #endif |
2625 | 2723 | ||
2724 | #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND | ||
2725 | asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) | ||
2726 | { | ||
2727 | sigset_t newset; | ||
2728 | |||
2729 | /* XXX: Don't preclude handling different sized sigset_t's. */ | ||
2730 | if (sigsetsize != sizeof(sigset_t)) | ||
2731 | return -EINVAL; | ||
2732 | |||
2733 | if (copy_from_user(&newset, unewset, sizeof(newset))) | ||
2734 | return -EFAULT; | ||
2735 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
2736 | |||
2737 | spin_lock_irq(&current->sighand->siglock); | ||
2738 | current->saved_sigmask = current->blocked; | ||
2739 | current->blocked = newset; | ||
2740 | recalc_sigpending(); | ||
2741 | spin_unlock_irq(&current->sighand->siglock); | ||
2742 | |||
2743 | current->state = TASK_INTERRUPTIBLE; | ||
2744 | schedule(); | ||
2745 | set_thread_flag(TIF_RESTORE_SIGMASK); | ||
2746 | return -ERESTARTNOHAND; | ||
2747 | } | ||
2748 | #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ | ||
2749 | |||
2626 | void __init signals_init(void) | 2750 | void __init signals_init(void) |
2627 | { | 2751 | { |
2628 | sigqueue_cachep = | 2752 | sigqueue_cachep = |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b3d4dc858e35..dcfb5d731466 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -87,13 +87,9 @@ static int stop_machine(void) | |||
87 | { | 87 | { |
88 | int i, ret = 0; | 88 | int i, ret = 0; |
89 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 89 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
90 | mm_segment_t old_fs = get_fs(); | ||
91 | 90 | ||
92 | /* One high-prio thread per cpu. We'll do this one. */ | 91 | /* One high-prio thread per cpu. We'll do this one. */ |
93 | set_fs(KERNEL_DS); | 92 | sched_setscheduler(current, SCHED_FIFO, &param); |
94 | sys_sched_setscheduler(current->pid, SCHED_FIFO, | ||
95 | (struct sched_param __user *)&param); | ||
96 | set_fs(old_fs); | ||
97 | 93 | ||
98 | atomic_set(&stopmachine_thread_ack, 0); | 94 | atomic_set(&stopmachine_thread_ack, 0); |
99 | stopmachine_num_threads = 0; | 95 | stopmachine_num_threads = 0; |
diff --git a/kernel/sys.c b/kernel/sys.c index eecf84526afe..0929c698affc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
20 | #include <linux/kexec.h> | 20 | #include <linux/kexec.h> |
21 | #include <linux/workqueue.h> | 21 | #include <linux/workqueue.h> |
22 | #include <linux/capability.h> | ||
22 | #include <linux/device.h> | 23 | #include <linux/device.h> |
23 | #include <linux/key.h> | 24 | #include <linux/key.h> |
24 | #include <linux/times.h> | 25 | #include <linux/times.h> |
@@ -223,6 +224,18 @@ int unregister_reboot_notifier(struct notifier_block * nb) | |||
223 | 224 | ||
224 | EXPORT_SYMBOL(unregister_reboot_notifier); | 225 | EXPORT_SYMBOL(unregister_reboot_notifier); |
225 | 226 | ||
227 | #ifndef CONFIG_SECURITY | ||
228 | int capable(int cap) | ||
229 | { | ||
230 | if (cap_raised(current->cap_effective, cap)) { | ||
231 | current->flags |= PF_SUPERPRIV; | ||
232 | return 1; | ||
233 | } | ||
234 | return 0; | ||
235 | } | ||
236 | EXPORT_SYMBOL(capable); | ||
237 | #endif | ||
238 | |||
226 | static int set_one_prio(struct task_struct *p, int niceval, int error) | 239 | static int set_one_prio(struct task_struct *p, int niceval, int error) |
227 | { | 240 | { |
228 | int no_nice; | 241 | int no_nice; |
@@ -427,23 +440,25 @@ void kernel_kexec(void) | |||
427 | } | 440 | } |
428 | EXPORT_SYMBOL_GPL(kernel_kexec); | 441 | EXPORT_SYMBOL_GPL(kernel_kexec); |
429 | 442 | ||
443 | void kernel_shutdown_prepare(enum system_states state) | ||
444 | { | ||
445 | notifier_call_chain(&reboot_notifier_list, | ||
446 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); | ||
447 | system_state = state; | ||
448 | device_shutdown(); | ||
449 | } | ||
430 | /** | 450 | /** |
431 | * kernel_halt - halt the system | 451 | * kernel_halt - halt the system |
432 | * | 452 | * |
433 | * Shutdown everything and perform a clean system halt. | 453 | * Shutdown everything and perform a clean system halt. |
434 | */ | 454 | */ |
435 | void kernel_halt_prepare(void) | ||
436 | { | ||
437 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); | ||
438 | system_state = SYSTEM_HALT; | ||
439 | device_shutdown(); | ||
440 | } | ||
441 | void kernel_halt(void) | 455 | void kernel_halt(void) |
442 | { | 456 | { |
443 | kernel_halt_prepare(); | 457 | kernel_shutdown_prepare(SYSTEM_HALT); |
444 | printk(KERN_EMERG "System halted.\n"); | 458 | printk(KERN_EMERG "System halted.\n"); |
445 | machine_halt(); | 459 | machine_halt(); |
446 | } | 460 | } |
461 | |||
447 | EXPORT_SYMBOL_GPL(kernel_halt); | 462 | EXPORT_SYMBOL_GPL(kernel_halt); |
448 | 463 | ||
449 | /** | 464 | /** |
@@ -451,20 +466,13 @@ EXPORT_SYMBOL_GPL(kernel_halt); | |||
451 | * | 466 | * |
452 | * Shutdown everything and perform a clean system power_off. | 467 | * Shutdown everything and perform a clean system power_off. |
453 | */ | 468 | */ |
454 | void kernel_power_off_prepare(void) | ||
455 | { | ||
456 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); | ||
457 | system_state = SYSTEM_POWER_OFF; | ||
458 | device_shutdown(); | ||
459 | } | ||
460 | void kernel_power_off(void) | 469 | void kernel_power_off(void) |
461 | { | 470 | { |
462 | kernel_power_off_prepare(); | 471 | kernel_shutdown_prepare(SYSTEM_POWER_OFF); |
463 | printk(KERN_EMERG "Power down.\n"); | 472 | printk(KERN_EMERG "Power down.\n"); |
464 | machine_power_off(); | 473 | machine_power_off(); |
465 | } | 474 | } |
466 | EXPORT_SYMBOL_GPL(kernel_power_off); | 475 | EXPORT_SYMBOL_GPL(kernel_power_off); |
467 | |||
468 | /* | 476 | /* |
469 | * Reboot system call: for obvious reasons only root may call it, | 477 | * Reboot system call: for obvious reasons only root may call it, |
470 | * and even root needs to set up some magic numbers in the registers | 478 | * and even root needs to set up some magic numbers in the registers |
@@ -489,6 +497,12 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
489 | magic2 != LINUX_REBOOT_MAGIC2C)) | 497 | magic2 != LINUX_REBOOT_MAGIC2C)) |
490 | return -EINVAL; | 498 | return -EINVAL; |
491 | 499 | ||
500 | /* Instead of trying to make the power_off code look like | ||
501 | * halt when pm_power_off is not set do it the easy way. | ||
502 | */ | ||
503 | if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) | ||
504 | cmd = LINUX_REBOOT_CMD_HALT; | ||
505 | |||
492 | lock_kernel(); | 506 | lock_kernel(); |
493 | switch (cmd) { | 507 | switch (cmd) { |
494 | case LINUX_REBOOT_CMD_RESTART: | 508 | case LINUX_REBOOT_CMD_RESTART: |
@@ -1084,10 +1098,11 @@ asmlinkage long sys_times(struct tms __user * tbuf) | |||
1084 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | 1098 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) |
1085 | { | 1099 | { |
1086 | struct task_struct *p; | 1100 | struct task_struct *p; |
1101 | struct task_struct *group_leader = current->group_leader; | ||
1087 | int err = -EINVAL; | 1102 | int err = -EINVAL; |
1088 | 1103 | ||
1089 | if (!pid) | 1104 | if (!pid) |
1090 | pid = current->pid; | 1105 | pid = group_leader->pid; |
1091 | if (!pgid) | 1106 | if (!pgid) |
1092 | pgid = pid; | 1107 | pgid = pid; |
1093 | if (pgid < 0) | 1108 | if (pgid < 0) |
@@ -1107,16 +1122,16 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1107 | if (!thread_group_leader(p)) | 1122 | if (!thread_group_leader(p)) |
1108 | goto out; | 1123 | goto out; |
1109 | 1124 | ||
1110 | if (p->parent == current || p->real_parent == current) { | 1125 | if (p->real_parent == group_leader) { |
1111 | err = -EPERM; | 1126 | err = -EPERM; |
1112 | if (p->signal->session != current->signal->session) | 1127 | if (p->signal->session != group_leader->signal->session) |
1113 | goto out; | 1128 | goto out; |
1114 | err = -EACCES; | 1129 | err = -EACCES; |
1115 | if (p->did_exec) | 1130 | if (p->did_exec) |
1116 | goto out; | 1131 | goto out; |
1117 | } else { | 1132 | } else { |
1118 | err = -ESRCH; | 1133 | err = -ESRCH; |
1119 | if (p != current) | 1134 | if (p != group_leader) |
1120 | goto out; | 1135 | goto out; |
1121 | } | 1136 | } |
1122 | 1137 | ||
@@ -1128,7 +1143,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1128 | struct task_struct *p; | 1143 | struct task_struct *p; |
1129 | 1144 | ||
1130 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { | 1145 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { |
1131 | if (p->signal->session == current->signal->session) | 1146 | if (p->signal->session == group_leader->signal->session) |
1132 | goto ok_pgid; | 1147 | goto ok_pgid; |
1133 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); | 1148 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); |
1134 | goto out; | 1149 | goto out; |
@@ -1208,24 +1223,22 @@ asmlinkage long sys_getsid(pid_t pid) | |||
1208 | 1223 | ||
1209 | asmlinkage long sys_setsid(void) | 1224 | asmlinkage long sys_setsid(void) |
1210 | { | 1225 | { |
1226 | struct task_struct *group_leader = current->group_leader; | ||
1211 | struct pid *pid; | 1227 | struct pid *pid; |
1212 | int err = -EPERM; | 1228 | int err = -EPERM; |
1213 | 1229 | ||
1214 | if (!thread_group_leader(current)) | ||
1215 | return -EINVAL; | ||
1216 | |||
1217 | down(&tty_sem); | 1230 | down(&tty_sem); |
1218 | write_lock_irq(&tasklist_lock); | 1231 | write_lock_irq(&tasklist_lock); |
1219 | 1232 | ||
1220 | pid = find_pid(PIDTYPE_PGID, current->pid); | 1233 | pid = find_pid(PIDTYPE_PGID, group_leader->pid); |
1221 | if (pid) | 1234 | if (pid) |
1222 | goto out; | 1235 | goto out; |
1223 | 1236 | ||
1224 | current->signal->leader = 1; | 1237 | group_leader->signal->leader = 1; |
1225 | __set_special_pids(current->pid, current->pid); | 1238 | __set_special_pids(group_leader->pid, group_leader->pid); |
1226 | current->signal->tty = NULL; | 1239 | group_leader->signal->tty = NULL; |
1227 | current->signal->tty_old_pgrp = 0; | 1240 | group_leader->signal->tty_old_pgrp = 0; |
1228 | err = process_group(current); | 1241 | err = process_group(group_leader); |
1229 | out: | 1242 | out: |
1230 | write_unlock_irq(&tasklist_lock); | 1243 | write_unlock_irq(&tasklist_lock); |
1231 | up(&tty_sem); | 1244 | up(&tty_sem); |
@@ -1687,7 +1700,10 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1687 | if (unlikely(!p->signal)) | 1700 | if (unlikely(!p->signal)) |
1688 | return; | 1701 | return; |
1689 | 1702 | ||
1703 | utime = stime = cputime_zero; | ||
1704 | |||
1690 | switch (who) { | 1705 | switch (who) { |
1706 | case RUSAGE_BOTH: | ||
1691 | case RUSAGE_CHILDREN: | 1707 | case RUSAGE_CHILDREN: |
1692 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1708 | spin_lock_irqsave(&p->sighand->siglock, flags); |
1693 | utime = p->signal->cutime; | 1709 | utime = p->signal->cutime; |
@@ -1697,22 +1713,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1697 | r->ru_minflt = p->signal->cmin_flt; | 1713 | r->ru_minflt = p->signal->cmin_flt; |
1698 | r->ru_majflt = p->signal->cmaj_flt; | 1714 | r->ru_majflt = p->signal->cmaj_flt; |
1699 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1715 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
1700 | cputime_to_timeval(utime, &r->ru_utime); | 1716 | |
1701 | cputime_to_timeval(stime, &r->ru_stime); | 1717 | if (who == RUSAGE_CHILDREN) |
1702 | break; | 1718 | break; |
1719 | |||
1703 | case RUSAGE_SELF: | 1720 | case RUSAGE_SELF: |
1704 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1705 | utime = stime = cputime_zero; | ||
1706 | goto sum_group; | ||
1707 | case RUSAGE_BOTH: | ||
1708 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1709 | utime = p->signal->cutime; | ||
1710 | stime = p->signal->cstime; | ||
1711 | r->ru_nvcsw = p->signal->cnvcsw; | ||
1712 | r->ru_nivcsw = p->signal->cnivcsw; | ||
1713 | r->ru_minflt = p->signal->cmin_flt; | ||
1714 | r->ru_majflt = p->signal->cmaj_flt; | ||
1715 | sum_group: | ||
1716 | utime = cputime_add(utime, p->signal->utime); | 1721 | utime = cputime_add(utime, p->signal->utime); |
1717 | stime = cputime_add(stime, p->signal->stime); | 1722 | stime = cputime_add(stime, p->signal->stime); |
1718 | r->ru_nvcsw += p->signal->nvcsw; | 1723 | r->ru_nvcsw += p->signal->nvcsw; |
@@ -1729,13 +1734,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1729 | r->ru_majflt += t->maj_flt; | 1734 | r->ru_majflt += t->maj_flt; |
1730 | t = next_thread(t); | 1735 | t = next_thread(t); |
1731 | } while (t != p); | 1736 | } while (t != p); |
1732 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1733 | cputime_to_timeval(utime, &r->ru_utime); | ||
1734 | cputime_to_timeval(stime, &r->ru_stime); | ||
1735 | break; | 1737 | break; |
1738 | |||
1736 | default: | 1739 | default: |
1737 | BUG(); | 1740 | BUG(); |
1738 | } | 1741 | } |
1742 | |||
1743 | cputime_to_timeval(utime, &r->ru_utime); | ||
1744 | cputime_to_timeval(stime, &r->ru_stime); | ||
1739 | } | 1745 | } |
1740 | 1746 | ||
1741 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | 1747 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1ab2370e2efa..17313b99e53d 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -82,6 +82,28 @@ cond_syscall(compat_sys_socketcall); | |||
82 | cond_syscall(sys_inotify_init); | 82 | cond_syscall(sys_inotify_init); |
83 | cond_syscall(sys_inotify_add_watch); | 83 | cond_syscall(sys_inotify_add_watch); |
84 | cond_syscall(sys_inotify_rm_watch); | 84 | cond_syscall(sys_inotify_rm_watch); |
85 | cond_syscall(sys_migrate_pages); | ||
86 | cond_syscall(sys_chown16); | ||
87 | cond_syscall(sys_fchown16); | ||
88 | cond_syscall(sys_getegid16); | ||
89 | cond_syscall(sys_geteuid16); | ||
90 | cond_syscall(sys_getgid16); | ||
91 | cond_syscall(sys_getgroups16); | ||
92 | cond_syscall(sys_getresgid16); | ||
93 | cond_syscall(sys_getresuid16); | ||
94 | cond_syscall(sys_getuid16); | ||
95 | cond_syscall(sys_lchown16); | ||
96 | cond_syscall(sys_setfsgid16); | ||
97 | cond_syscall(sys_setfsuid16); | ||
98 | cond_syscall(sys_setgid16); | ||
99 | cond_syscall(sys_setgroups16); | ||
100 | cond_syscall(sys_setregid16); | ||
101 | cond_syscall(sys_setresgid16); | ||
102 | cond_syscall(sys_setresuid16); | ||
103 | cond_syscall(sys_setreuid16); | ||
104 | cond_syscall(sys_setuid16); | ||
105 | cond_syscall(sys_vm86old); | ||
106 | cond_syscall(sys_vm86); | ||
85 | 107 | ||
86 | /* arch-specific weak syscall entries */ | 108 | /* arch-specific weak syscall entries */ |
87 | cond_syscall(sys_pciconfig_read); | 109 | cond_syscall(sys_pciconfig_read); |
@@ -90,3 +112,5 @@ cond_syscall(sys_pciconfig_iobase); | |||
90 | cond_syscall(sys32_ipc); | 112 | cond_syscall(sys32_ipc); |
91 | cond_syscall(sys32_sysctl); | 113 | cond_syscall(sys32_sysctl); |
92 | cond_syscall(ppc_rtas); | 114 | cond_syscall(ppc_rtas); |
115 | cond_syscall(sys_spu_run); | ||
116 | cond_syscall(sys_spu_create); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9990e10192e8..cb99a42f8b37 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -25,12 +25,14 @@ | |||
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/sysctl.h> | 26 | #include <linux/sysctl.h> |
27 | #include <linux/proc_fs.h> | 27 | #include <linux/proc_fs.h> |
28 | #include <linux/capability.h> | ||
28 | #include <linux/ctype.h> | 29 | #include <linux/ctype.h> |
29 | #include <linux/utsname.h> | 30 | #include <linux/utsname.h> |
30 | #include <linux/capability.h> | 31 | #include <linux/capability.h> |
31 | #include <linux/smp_lock.h> | 32 | #include <linux/smp_lock.h> |
32 | #include <linux/init.h> | 33 | #include <linux/init.h> |
33 | #include <linux/kernel.h> | 34 | #include <linux/kernel.h> |
35 | #include <linux/kobject.h> | ||
34 | #include <linux/net.h> | 36 | #include <linux/net.h> |
35 | #include <linux/sysrq.h> | 37 | #include <linux/sysrq.h> |
36 | #include <linux/highuid.h> | 38 | #include <linux/highuid.h> |
@@ -67,6 +69,8 @@ extern int min_free_kbytes; | |||
67 | extern int printk_ratelimit_jiffies; | 69 | extern int printk_ratelimit_jiffies; |
68 | extern int printk_ratelimit_burst; | 70 | extern int printk_ratelimit_burst; |
69 | extern int pid_max_min, pid_max_max; | 71 | extern int pid_max_min, pid_max_max; |
72 | extern int sysctl_drop_caches; | ||
73 | extern int percpu_pagelist_fraction; | ||
70 | 74 | ||
71 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 75 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
72 | int unknown_nmi_panic; | 76 | int unknown_nmi_panic; |
@@ -77,15 +81,13 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, | |||
77 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 81 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
78 | static int maxolduid = 65535; | 82 | static int maxolduid = 65535; |
79 | static int minolduid; | 83 | static int minolduid; |
84 | static int min_percpu_pagelist_fract = 8; | ||
80 | 85 | ||
81 | static int ngroups_max = NGROUPS_MAX; | 86 | static int ngroups_max = NGROUPS_MAX; |
82 | 87 | ||
83 | #ifdef CONFIG_KMOD | 88 | #ifdef CONFIG_KMOD |
84 | extern char modprobe_path[]; | 89 | extern char modprobe_path[]; |
85 | #endif | 90 | #endif |
86 | #ifdef CONFIG_HOTPLUG | ||
87 | extern char hotplug_path[]; | ||
88 | #endif | ||
89 | #ifdef CONFIG_CHR_DEV_SG | 91 | #ifdef CONFIG_CHR_DEV_SG |
90 | extern int sg_big_buff; | 92 | extern int sg_big_buff; |
91 | #endif | 93 | #endif |
@@ -110,7 +112,7 @@ extern int pwrsw_enabled; | |||
110 | extern int unaligned_enabled; | 112 | extern int unaligned_enabled; |
111 | #endif | 113 | #endif |
112 | 114 | ||
113 | #ifdef CONFIG_ARCH_S390 | 115 | #ifdef CONFIG_S390 |
114 | #ifdef CONFIG_MATHEMU | 116 | #ifdef CONFIG_MATHEMU |
115 | extern int sysctl_ieee_emulation_warnings; | 117 | extern int sysctl_ieee_emulation_warnings; |
116 | #endif | 118 | #endif |
@@ -397,8 +399,8 @@ static ctl_table kern_table[] = { | |||
397 | { | 399 | { |
398 | .ctl_name = KERN_HOTPLUG, | 400 | .ctl_name = KERN_HOTPLUG, |
399 | .procname = "hotplug", | 401 | .procname = "hotplug", |
400 | .data = &hotplug_path, | 402 | .data = &uevent_helper, |
401 | .maxlen = HOTPLUG_PATH_LEN, | 403 | .maxlen = UEVENT_HELPER_PATH_LEN, |
402 | .mode = 0644, | 404 | .mode = 0644, |
403 | .proc_handler = &proc_dostring, | 405 | .proc_handler = &proc_dostring, |
404 | .strategy = &sysctl_string, | 406 | .strategy = &sysctl_string, |
@@ -544,7 +546,7 @@ static ctl_table kern_table[] = { | |||
544 | .extra1 = &minolduid, | 546 | .extra1 = &minolduid, |
545 | .extra2 = &maxolduid, | 547 | .extra2 = &maxolduid, |
546 | }, | 548 | }, |
547 | #ifdef CONFIG_ARCH_S390 | 549 | #ifdef CONFIG_S390 |
548 | #ifdef CONFIG_MATHEMU | 550 | #ifdef CONFIG_MATHEMU |
549 | { | 551 | { |
550 | .ctl_name = KERN_IEEE_EMULATION_WARNINGS, | 552 | .ctl_name = KERN_IEEE_EMULATION_WARNINGS, |
@@ -646,7 +648,7 @@ static ctl_table kern_table[] = { | |||
646 | .mode = 0644, | 648 | .mode = 0644, |
647 | .proc_handler = &proc_dointvec, | 649 | .proc_handler = &proc_dointvec, |
648 | }, | 650 | }, |
649 | #if defined(CONFIG_ARCH_S390) | 651 | #if defined(CONFIG_S390) && defined(CONFIG_SMP) |
650 | { | 652 | { |
651 | .ctl_name = KERN_SPIN_RETRY, | 653 | .ctl_name = KERN_SPIN_RETRY, |
652 | .procname = "spin_retry", | 654 | .procname = "spin_retry", |
@@ -777,6 +779,15 @@ static ctl_table vm_table[] = { | |||
777 | .strategy = &sysctl_intvec, | 779 | .strategy = &sysctl_intvec, |
778 | }, | 780 | }, |
779 | { | 781 | { |
782 | .ctl_name = VM_DROP_PAGECACHE, | ||
783 | .procname = "drop_caches", | ||
784 | .data = &sysctl_drop_caches, | ||
785 | .maxlen = sizeof(int), | ||
786 | .mode = 0644, | ||
787 | .proc_handler = drop_caches_sysctl_handler, | ||
788 | .strategy = &sysctl_intvec, | ||
789 | }, | ||
790 | { | ||
780 | .ctl_name = VM_MIN_FREE_KBYTES, | 791 | .ctl_name = VM_MIN_FREE_KBYTES, |
781 | .procname = "min_free_kbytes", | 792 | .procname = "min_free_kbytes", |
782 | .data = &min_free_kbytes, | 793 | .data = &min_free_kbytes, |
@@ -786,6 +797,16 @@ static ctl_table vm_table[] = { | |||
786 | .strategy = &sysctl_intvec, | 797 | .strategy = &sysctl_intvec, |
787 | .extra1 = &zero, | 798 | .extra1 = &zero, |
788 | }, | 799 | }, |
800 | { | ||
801 | .ctl_name = VM_PERCPU_PAGELIST_FRACTION, | ||
802 | .procname = "percpu_pagelist_fraction", | ||
803 | .data = &percpu_pagelist_fraction, | ||
804 | .maxlen = sizeof(percpu_pagelist_fraction), | ||
805 | .mode = 0644, | ||
806 | .proc_handler = &percpu_pagelist_fraction_sysctl_handler, | ||
807 | .strategy = &sysctl_intvec, | ||
808 | .extra1 = &min_percpu_pagelist_fract, | ||
809 | }, | ||
789 | #ifdef CONFIG_MMU | 810 | #ifdef CONFIG_MMU |
790 | { | 811 | { |
791 | .ctl_name = VM_MAX_MAP_COUNT, | 812 | .ctl_name = VM_MAX_MAP_COUNT, |
@@ -849,6 +870,17 @@ static ctl_table vm_table[] = { | |||
849 | .strategy = &sysctl_jiffies, | 870 | .strategy = &sysctl_jiffies, |
850 | }, | 871 | }, |
851 | #endif | 872 | #endif |
873 | #ifdef CONFIG_NUMA | ||
874 | { | ||
875 | .ctl_name = VM_ZONE_RECLAIM_MODE, | ||
876 | .procname = "zone_reclaim_mode", | ||
877 | .data = &zone_reclaim_mode, | ||
878 | .maxlen = sizeof(zone_reclaim_mode), | ||
879 | .mode = 0644, | ||
880 | .proc_handler = &proc_dointvec, | ||
881 | .strategy = &zero, | ||
882 | }, | ||
883 | #endif | ||
852 | { .ctl_name = 0 } | 884 | { .ctl_name = 0 } |
853 | }; | 885 | }; |
854 | 886 | ||
@@ -2192,29 +2224,32 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, | |||
2192 | void __user *oldval, size_t __user *oldlenp, | 2224 | void __user *oldval, size_t __user *oldlenp, |
2193 | void __user *newval, size_t newlen, void **context) | 2225 | void __user *newval, size_t newlen, void **context) |
2194 | { | 2226 | { |
2195 | size_t l, len; | ||
2196 | |||
2197 | if (!table->data || !table->maxlen) | 2227 | if (!table->data || !table->maxlen) |
2198 | return -ENOTDIR; | 2228 | return -ENOTDIR; |
2199 | 2229 | ||
2200 | if (oldval && oldlenp) { | 2230 | if (oldval && oldlenp) { |
2201 | if (get_user(len, oldlenp)) | 2231 | size_t bufsize; |
2232 | if (get_user(bufsize, oldlenp)) | ||
2202 | return -EFAULT; | 2233 | return -EFAULT; |
2203 | if (len) { | 2234 | if (bufsize) { |
2204 | l = strlen(table->data); | 2235 | size_t len = strlen(table->data), copied; |
2205 | if (len > l) len = l; | 2236 | |
2206 | if (len >= table->maxlen) | 2237 | /* This shouldn't trigger for a well-formed sysctl */ |
2238 | if (len > table->maxlen) | ||
2207 | len = table->maxlen; | 2239 | len = table->maxlen; |
2208 | if(copy_to_user(oldval, table->data, len)) | 2240 | |
2209 | return -EFAULT; | 2241 | /* Copy up to a max of bufsize-1 bytes of the string */ |
2210 | if(put_user(0, ((char __user *) oldval) + len)) | 2242 | copied = (len >= bufsize) ? bufsize - 1 : len; |
2243 | |||
2244 | if (copy_to_user(oldval, table->data, copied) || | ||
2245 | put_user(0, (char __user *)(oldval + copied))) | ||
2211 | return -EFAULT; | 2246 | return -EFAULT; |
2212 | if(put_user(len, oldlenp)) | 2247 | if (put_user(len, oldlenp)) |
2213 | return -EFAULT; | 2248 | return -EFAULT; |
2214 | } | 2249 | } |
2215 | } | 2250 | } |
2216 | if (newval && newlen) { | 2251 | if (newval && newlen) { |
2217 | len = newlen; | 2252 | size_t len = newlen; |
2218 | if (len > table->maxlen) | 2253 | if (len > table->maxlen) |
2219 | len = table->maxlen; | 2254 | len = table->maxlen; |
2220 | if(copy_from_user(table->data, newval, len)) | 2255 | if(copy_from_user(table->data, newval, len)) |
@@ -2223,7 +2258,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, | |||
2223 | len--; | 2258 | len--; |
2224 | ((char *) table->data)[len] = 0; | 2259 | ((char *) table->data)[len] = 0; |
2225 | } | 2260 | } |
2226 | return 0; | 2261 | return 1; |
2227 | } | 2262 | } |
2228 | 2263 | ||
2229 | /* | 2264 | /* |
diff --git a/kernel/time.c b/kernel/time.c index b94bfa8c03e0..7477b1d2079e 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -29,6 +29,7 @@ | |||
29 | 29 | ||
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/timex.h> | 31 | #include <linux/timex.h> |
32 | #include <linux/capability.h> | ||
32 | #include <linux/errno.h> | 33 | #include <linux/errno.h> |
33 | #include <linux/smp_lock.h> | 34 | #include <linux/smp_lock.h> |
34 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
@@ -154,6 +155,9 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) | |||
154 | static int firsttime = 1; | 155 | static int firsttime = 1; |
155 | int error = 0; | 156 | int error = 0; |
156 | 157 | ||
158 | if (!timespec_valid(tv)) | ||
159 | return -EINVAL; | ||
160 | |||
157 | error = security_settime(tv, tz); | 161 | error = security_settime(tv, tz); |
158 | if (error) | 162 | if (error) |
159 | return error; | 163 | return error; |
@@ -561,27 +565,107 @@ void getnstimeofday(struct timespec *tv) | |||
561 | EXPORT_SYMBOL_GPL(getnstimeofday); | 565 | EXPORT_SYMBOL_GPL(getnstimeofday); |
562 | #endif | 566 | #endif |
563 | 567 | ||
564 | void getnstimestamp(struct timespec *ts) | 568 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
569 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | ||
570 | * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. | ||
571 | * | ||
572 | * [For the Julian calendar (which was used in Russia before 1917, | ||
573 | * Britain & colonies before 1752, anywhere else before 1582, | ||
574 | * and is still in use by some communities) leave out the | ||
575 | * -year/100+year/400 terms, and add 10.] | ||
576 | * | ||
577 | * This algorithm was first published by Gauss (I think). | ||
578 | * | ||
579 | * WARNING: this function will overflow on 2106-02-07 06:28:16 on | ||
580 | * machines were long is 32-bit! (However, as time_t is signed, we | ||
581 | * will already get problems at other places on 2038-01-19 03:14:08) | ||
582 | */ | ||
583 | unsigned long | ||
584 | mktime(const unsigned int year0, const unsigned int mon0, | ||
585 | const unsigned int day, const unsigned int hour, | ||
586 | const unsigned int min, const unsigned int sec) | ||
565 | { | 587 | { |
566 | unsigned int seq; | 588 | unsigned int mon = mon0, year = year0; |
567 | struct timespec wall2mono; | ||
568 | 589 | ||
569 | /* synchronize with settimeofday() changes */ | 590 | /* 1..12 -> 11,12,1..10 */ |
570 | do { | 591 | if (0 >= (int) (mon -= 2)) { |
571 | seq = read_seqbegin(&xtime_lock); | 592 | mon += 12; /* Puts Feb last since it has leap day */ |
572 | getnstimeofday(ts); | 593 | year -= 1; |
573 | wall2mono = wall_to_monotonic; | ||
574 | } while(unlikely(read_seqretry(&xtime_lock, seq))); | ||
575 | |||
576 | /* adjust to monotonicaly-increasing values */ | ||
577 | ts->tv_sec += wall2mono.tv_sec; | ||
578 | ts->tv_nsec += wall2mono.tv_nsec; | ||
579 | while (unlikely(ts->tv_nsec >= NSEC_PER_SEC)) { | ||
580 | ts->tv_nsec -= NSEC_PER_SEC; | ||
581 | ts->tv_sec++; | ||
582 | } | 594 | } |
595 | |||
596 | return ((((unsigned long) | ||
597 | (year/4 - year/100 + year/400 + 367*mon/12 + day) + | ||
598 | year*365 - 719499 | ||
599 | )*24 + hour /* now have hours */ | ||
600 | )*60 + min /* now have minutes */ | ||
601 | )*60 + sec; /* finally seconds */ | ||
602 | } | ||
603 | |||
604 | EXPORT_SYMBOL(mktime); | ||
605 | |||
606 | /** | ||
607 | * set_normalized_timespec - set timespec sec and nsec parts and normalize | ||
608 | * | ||
609 | * @ts: pointer to timespec variable to be set | ||
610 | * @sec: seconds to set | ||
611 | * @nsec: nanoseconds to set | ||
612 | * | ||
613 | * Set seconds and nanoseconds field of a timespec variable and | ||
614 | * normalize to the timespec storage format | ||
615 | * | ||
616 | * Note: The tv_nsec part is always in the range of | ||
617 | * 0 <= tv_nsec < NSEC_PER_SEC | ||
618 | * For negative values only the tv_sec field is negative ! | ||
619 | */ | ||
620 | void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) | ||
621 | { | ||
622 | while (nsec >= NSEC_PER_SEC) { | ||
623 | nsec -= NSEC_PER_SEC; | ||
624 | ++sec; | ||
625 | } | ||
626 | while (nsec < 0) { | ||
627 | nsec += NSEC_PER_SEC; | ||
628 | --sec; | ||
629 | } | ||
630 | ts->tv_sec = sec; | ||
631 | ts->tv_nsec = nsec; | ||
632 | } | ||
633 | |||
634 | /** | ||
635 | * ns_to_timespec - Convert nanoseconds to timespec | ||
636 | * @nsec: the nanoseconds value to be converted | ||
637 | * | ||
638 | * Returns the timespec representation of the nsec parameter. | ||
639 | */ | ||
640 | inline struct timespec ns_to_timespec(const nsec_t nsec) | ||
641 | { | ||
642 | struct timespec ts; | ||
643 | |||
644 | if (nsec) | ||
645 | ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, | ||
646 | &ts.tv_nsec); | ||
647 | else | ||
648 | ts.tv_sec = ts.tv_nsec = 0; | ||
649 | |||
650 | return ts; | ||
651 | } | ||
652 | |||
653 | /** | ||
654 | * ns_to_timeval - Convert nanoseconds to timeval | ||
655 | * @nsec: the nanoseconds value to be converted | ||
656 | * | ||
657 | * Returns the timeval representation of the nsec parameter. | ||
658 | */ | ||
659 | struct timeval ns_to_timeval(const nsec_t nsec) | ||
660 | { | ||
661 | struct timespec ts = ns_to_timespec(nsec); | ||
662 | struct timeval tv; | ||
663 | |||
664 | tv.tv_sec = ts.tv_sec; | ||
665 | tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; | ||
666 | |||
667 | return tv; | ||
583 | } | 668 | } |
584 | EXPORT_SYMBOL_GPL(getnstimestamp); | ||
585 | 669 | ||
586 | #if (BITS_PER_LONG < 64) | 670 | #if (BITS_PER_LONG < 64) |
587 | u64 get_jiffies_64(void) | 671 | u64 get_jiffies_64(void) |
diff --git a/kernel/timer.c b/kernel/timer.c index fd74268d8663..4f1cb0ab5251 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/posix-timers.h> | 33 | #include <linux/posix-timers.h> |
34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
36 | #include <linux/delay.h> | ||
36 | 37 | ||
37 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
38 | #include <asm/unistd.h> | 39 | #include <asm/unistd.h> |
@@ -857,6 +858,7 @@ static void run_timer_softirq(struct softirq_action *h) | |||
857 | { | 858 | { |
858 | tvec_base_t *base = &__get_cpu_var(tvec_bases); | 859 | tvec_base_t *base = &__get_cpu_var(tvec_bases); |
859 | 860 | ||
861 | hrtimer_run_queues(); | ||
860 | if (time_after_eq(jiffies, base->timer_jiffies)) | 862 | if (time_after_eq(jiffies, base->timer_jiffies)) |
861 | __run_timers(base); | 863 | __run_timers(base); |
862 | } | 864 | } |
@@ -1118,62 +1120,6 @@ asmlinkage long sys_gettid(void) | |||
1118 | return current->pid; | 1120 | return current->pid; |
1119 | } | 1121 | } |
1120 | 1122 | ||
1121 | static long __sched nanosleep_restart(struct restart_block *restart) | ||
1122 | { | ||
1123 | unsigned long expire = restart->arg0, now = jiffies; | ||
1124 | struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; | ||
1125 | long ret; | ||
1126 | |||
1127 | /* Did it expire while we handled signals? */ | ||
1128 | if (!time_after(expire, now)) | ||
1129 | return 0; | ||
1130 | |||
1131 | expire = schedule_timeout_interruptible(expire - now); | ||
1132 | |||
1133 | ret = 0; | ||
1134 | if (expire) { | ||
1135 | struct timespec t; | ||
1136 | jiffies_to_timespec(expire, &t); | ||
1137 | |||
1138 | ret = -ERESTART_RESTARTBLOCK; | ||
1139 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) | ||
1140 | ret = -EFAULT; | ||
1141 | /* The 'restart' block is already filled in */ | ||
1142 | } | ||
1143 | return ret; | ||
1144 | } | ||
1145 | |||
1146 | asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | ||
1147 | { | ||
1148 | struct timespec t; | ||
1149 | unsigned long expire; | ||
1150 | long ret; | ||
1151 | |||
1152 | if (copy_from_user(&t, rqtp, sizeof(t))) | ||
1153 | return -EFAULT; | ||
1154 | |||
1155 | if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) | ||
1156 | return -EINVAL; | ||
1157 | |||
1158 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); | ||
1159 | expire = schedule_timeout_interruptible(expire); | ||
1160 | |||
1161 | ret = 0; | ||
1162 | if (expire) { | ||
1163 | struct restart_block *restart; | ||
1164 | jiffies_to_timespec(expire, &t); | ||
1165 | if (rmtp && copy_to_user(rmtp, &t, sizeof(t))) | ||
1166 | return -EFAULT; | ||
1167 | |||
1168 | restart = &current_thread_info()->restart_block; | ||
1169 | restart->fn = nanosleep_restart; | ||
1170 | restart->arg0 = jiffies + expire; | ||
1171 | restart->arg1 = (unsigned long) rmtp; | ||
1172 | ret = -ERESTART_RESTARTBLOCK; | ||
1173 | } | ||
1174 | return ret; | ||
1175 | } | ||
1176 | |||
1177 | /* | 1123 | /* |
1178 | * sys_sysinfo - fill in sysinfo struct | 1124 | * sys_sysinfo - fill in sysinfo struct |
1179 | */ | 1125 | */ |
diff --git a/kernel/uid16.c b/kernel/uid16.c index f669941e8b26..aa25605027c8 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/notifier.h> | 10 | #include <linux/notifier.h> |
11 | #include <linux/reboot.h> | 11 | #include <linux/reboot.h> |
12 | #include <linux/prctl.h> | 12 | #include <linux/prctl.h> |
13 | #include <linux/capability.h> | ||
13 | #include <linux/init.h> | 14 | #include <linux/init.h> |
14 | #include <linux/highuid.h> | 15 | #include <linux/highuid.h> |
15 | #include <linux/security.h> | 16 | #include <linux/security.h> |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2bd5aee1c736..b052e2c4c710 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -29,7 +29,8 @@ | |||
29 | #include <linux/kthread.h> | 29 | #include <linux/kthread.h> |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * The per-CPU workqueue (if single thread, we always use cpu 0's). | 32 | * The per-CPU workqueue (if single thread, we always use the first |
33 | * possible cpu). | ||
33 | * | 34 | * |
34 | * The sequence counters are for flush_scheduled_work(). It wants to wait | 35 | * The sequence counters are for flush_scheduled_work(). It wants to wait |
35 | * until until all currently-scheduled works are completed, but it doesn't | 36 | * until until all currently-scheduled works are completed, but it doesn't |
@@ -69,6 +70,8 @@ struct workqueue_struct { | |||
69 | static DEFINE_SPINLOCK(workqueue_lock); | 70 | static DEFINE_SPINLOCK(workqueue_lock); |
70 | static LIST_HEAD(workqueues); | 71 | static LIST_HEAD(workqueues); |
71 | 72 | ||
73 | static int singlethread_cpu; | ||
74 | |||
72 | /* If it's single threaded, it isn't in the list of workqueues. */ | 75 | /* If it's single threaded, it isn't in the list of workqueues. */ |
73 | static inline int is_single_threaded(struct workqueue_struct *wq) | 76 | static inline int is_single_threaded(struct workqueue_struct *wq) |
74 | { | 77 | { |
@@ -102,7 +105,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
102 | 105 | ||
103 | if (!test_and_set_bit(0, &work->pending)) { | 106 | if (!test_and_set_bit(0, &work->pending)) { |
104 | if (unlikely(is_single_threaded(wq))) | 107 | if (unlikely(is_single_threaded(wq))) |
105 | cpu = any_online_cpu(cpu_online_map); | 108 | cpu = singlethread_cpu; |
106 | BUG_ON(!list_empty(&work->entry)); | 109 | BUG_ON(!list_empty(&work->entry)); |
107 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | 110 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
108 | ret = 1; | 111 | ret = 1; |
@@ -118,7 +121,7 @@ static void delayed_work_timer_fn(unsigned long __data) | |||
118 | int cpu = smp_processor_id(); | 121 | int cpu = smp_processor_id(); |
119 | 122 | ||
120 | if (unlikely(is_single_threaded(wq))) | 123 | if (unlikely(is_single_threaded(wq))) |
121 | cpu = any_online_cpu(cpu_online_map); | 124 | cpu = singlethread_cpu; |
122 | 125 | ||
123 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | 126 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
124 | } | 127 | } |
@@ -144,7 +147,7 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, | |||
144 | return ret; | 147 | return ret; |
145 | } | 148 | } |
146 | 149 | ||
147 | static inline void run_workqueue(struct cpu_workqueue_struct *cwq) | 150 | static void run_workqueue(struct cpu_workqueue_struct *cwq) |
148 | { | 151 | { |
149 | unsigned long flags; | 152 | unsigned long flags; |
150 | 153 | ||
@@ -267,7 +270,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
267 | 270 | ||
268 | if (is_single_threaded(wq)) { | 271 | if (is_single_threaded(wq)) { |
269 | /* Always use first cpu's area. */ | 272 | /* Always use first cpu's area. */ |
270 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map))); | 273 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); |
271 | } else { | 274 | } else { |
272 | int cpu; | 275 | int cpu; |
273 | 276 | ||
@@ -315,12 +318,17 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
315 | return NULL; | 318 | return NULL; |
316 | 319 | ||
317 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); | 320 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); |
321 | if (!wq->cpu_wq) { | ||
322 | kfree(wq); | ||
323 | return NULL; | ||
324 | } | ||
325 | |||
318 | wq->name = name; | 326 | wq->name = name; |
319 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 327 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
320 | lock_cpu_hotplug(); | 328 | lock_cpu_hotplug(); |
321 | if (singlethread) { | 329 | if (singlethread) { |
322 | INIT_LIST_HEAD(&wq->list); | 330 | INIT_LIST_HEAD(&wq->list); |
323 | p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map)); | 331 | p = create_workqueue_thread(wq, singlethread_cpu); |
324 | if (!p) | 332 | if (!p) |
325 | destroy = 1; | 333 | destroy = 1; |
326 | else | 334 | else |
@@ -374,7 +382,7 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
374 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 382 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
375 | lock_cpu_hotplug(); | 383 | lock_cpu_hotplug(); |
376 | if (is_single_threaded(wq)) | 384 | if (is_single_threaded(wq)) |
377 | cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map)); | 385 | cleanup_workqueue_thread(wq, singlethread_cpu); |
378 | else { | 386 | else { |
379 | for_each_online_cpu(cpu) | 387 | for_each_online_cpu(cpu) |
380 | cleanup_workqueue_thread(wq, cpu); | 388 | cleanup_workqueue_thread(wq, cpu); |
@@ -419,6 +427,25 @@ int schedule_delayed_work_on(int cpu, | |||
419 | return ret; | 427 | return ret; |
420 | } | 428 | } |
421 | 429 | ||
430 | int schedule_on_each_cpu(void (*func) (void *info), void *info) | ||
431 | { | ||
432 | int cpu; | ||
433 | struct work_struct *work; | ||
434 | |||
435 | work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); | ||
436 | |||
437 | if (!work) | ||
438 | return -ENOMEM; | ||
439 | for_each_online_cpu(cpu) { | ||
440 | INIT_WORK(work + cpu, func, info); | ||
441 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), | ||
442 | work + cpu); | ||
443 | } | ||
444 | flush_workqueue(keventd_wq); | ||
445 | kfree(work); | ||
446 | return 0; | ||
447 | } | ||
448 | |||
422 | void flush_scheduled_work(void) | 449 | void flush_scheduled_work(void) |
423 | { | 450 | { |
424 | flush_workqueue(keventd_wq); | 451 | flush_workqueue(keventd_wq); |
@@ -543,6 +570,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
543 | 570 | ||
544 | void init_workqueues(void) | 571 | void init_workqueues(void) |
545 | { | 572 | { |
573 | singlethread_cpu = first_cpu(cpu_possible_map); | ||
546 | hotcpu_notifier(workqueue_cpu_callback, 0); | 574 | hotcpu_notifier(workqueue_cpu_callback, 0); |
547 | keventd_wq = create_workqueue("events"); | 575 | keventd_wq = create_workqueue("events"); |
548 | BUG_ON(!keventd_wq); | 576 | BUG_ON(!keventd_wq); |